In [1]:
import fasttext
import fasttext.util
import pandas as pd
import re
from collections import Counter
from tqdm.auto import tqdm
import numpy as np
import multiprocessing
from datetime import datetime

from news_lib.scrape_news import get_db_conn

In [2]:
import seaborn as sns
from sklearn import feature_extraction, linear_model, metrics, ensemble
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
sns.set()

In [3]:
def normalize_text(text):
    """Normalize text to remove all non word, space characters.

    Args:
        text(str): Text to be normalized.

    Returns:
        str: Normalized text.
    """
    text = re.sub('[^0-9a-zA-Z\.]+', ' ', text)
    text = text.lower()
    return text

In [4]:
with get_db_conn() as conn:
    all_articles=list(tqdm(conn.finance.news.aggregate(
        [
            {'$match':{'tags': {'$exists': True}}},
            {'$project': {'text': 0}},
            {'$sample': {'size': 1000000}}
        ],
        allowDiskUse=True
    )))
    df = pd.DataFrame.from_dict(all_articles).set_index('_id')

df['full_text'] = df['title'] + ' ' + df['description'] # + ' ' + df['text']
df['full_text'] = [normalize_text(x) for x in tqdm(df['full_text'])]
df = df.sample(frac=1.0, random_state=0)

KeyboardInterrupt: 

In [10]:
import json
with get_db_conn() as conn:
#      print(conn.finance.news.find_one())
   print(json.dumps(conn.finance.news.distinct("predicted_tags")))

[null, null, "agriculture", "automotive", "aviation", "banks", "bonds", "budget", "chemicals", "commodities", "companies", "consumer-durables", "coronavirus", "current-affairs", "derivatives", "e-commerce", "earnings", "economy", "enviroment", "environment", "export-and-import", "financial-services", "fmcg", "forex", "gold", "gst", "healthcare", "income-tax", "infra", "it-services", "jobs", "legal", "logistics-transport", "marketing", "markets", "media", "metals", "mining", "miscellaneous", "mutual-funds", "oil", "personal-finance", "pharma", "politics", "power", "private-banks", "psu-banks", "rbi", "real-estate", "recommendations", "regulator", "sports", "startups", "taxes", "technology", "telco", "tourism", "world"]


In [13]:
gh=conn.finance.news.aggregate([
    {
        '$match': {
            'predicted_tags': { '$not': {'$size': 0} }
        }
    },
    { '$unwind': "$predicted_tags" },
    {
        '$group': {
            '_id': {'$toLower': '$predicted_tags'},
            'count': { '$sum': 1 }
        }
    },
    {
        '$match': {
            'count': { '$gte': 2 }
        }
    },
    { '$sort' : { 'count' : -1} },
    { '$limit' : 100 }
]);

In [15]:
list(gh)

[{'_id': 'companies', 'count': 685217},
 {'_id': 'current-affairs', 'count': 380088},
 {'_id': 'earnings', 'count': 248517},
 {'_id': 'markets', 'count': 198458},
 {'_id': 'world', 'count': 191115},
 {'_id': 'economy', 'count': 178652},
 {'_id': 'technology', 'count': 141307},
 {'_id': 'politics', 'count': 140066},
 {'_id': 'commodities', 'count': 79697},
 {'_id': 'recommendations', 'count': 78898},
 {'_id': 'automotive', 'count': 68008},
 {'_id': 'banks', 'count': 52624},
 {'_id': 'coronavirus', 'count': 51658},
 {'_id': 'regulator', 'count': 47466},
 {'_id': 'forex', 'count': 45622},
 {'_id': 'agriculture', 'count': 44113},
 {'_id': 'infra', 'count': 39371},
 {'_id': 'startups', 'count': 35194},
 {'_id': 'logistics-transport', 'count': 34776},
 {'_id': 'rbi', 'count': 29837},
 {'_id': 'financial-services', 'count': 29721},
 {'_id': 'oil', 'count': 27022},
 {'_id': 'sports', 'count': 25621},
 {'_id': 'real-estate', 'count': 25109},
 {'_id': 'gold', 'count': 21985},
 {'_id': 'jobs', 'c

# FastText

In [22]:
def _stats_with_example(df, column):
    out = {}
    for article_id, tags in tqdm(zip(df.index, df[column])):
        for tag in tags:
            if tag == '':
                continue

            if tag not in out:
                out[tag] = {
                    'count': 1,
                    'example_url': df.loc[article_id, 'url'],
                    'example_id': article_id
                }
            else:
                out[tag]['count'] += 1
    return out

tag_stats = pd.DataFrame.from_dict(_stats_with_example(df, 'tags'), orient='index')
keyword_stats = pd.DataFrame.from_dict(_stats_with_example(df, 'keywords'), orient='index')
tag_stats = tag_stats.sort_values('count', ascending=False).head(3000)
keyword_stats = keyword_stats.sort_values('count', ascending=False).head(3000)
tag_stats.to_excel('tags_unannotated.xlsx')
keyword_stats.to_excel('keywords_unannotated.xlsx')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [23]:
manual = pd.read_excel('tags.xlsx', index_col=0)
manual = manual['Manual'].dropna().to_dict()
manual = {k: {x.strip().replace(' ', '-') for x in v.split(',')} for k, v in manual.items()}

In [24]:
df['manual'] = [
    set.union(set(), *[manual.get(tag, set()) for tag in tags])
    for tags in df['tags']
]

all_tags = sorted({y for x in df['manual'] for y in x})

In [25]:
df_small = df[df['manual'] != set()]

n_train = int(0.7 * df_small.shape[0])

df_small['fasttext'] = [
    ' '.join(f'__label__{x}' for x in tags)
    for tags in df_small['manual']
]

Y_small = np.array([
    [tag in manual_tags for tag in all_tags]
    for manual_tags in df_small['manual']
])

df_train = df_small.iloc[:n_train]
df_val = df_small.iloc[n_train:]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['fasttext'] = [


In [26]:
text = df_small['fasttext'] + ' ' + df_small['full_text']

open('/tmp/news.train', 'w').write('\n'.join(text.iloc[:n_train]))
open('/tmp/news.valid', 'w').write('\n'.join(text.iloc[n_train:]))

56586589

In [27]:
# ! bash train_fasttext.sh
# model = fasttext.load_model('news_model_cli.ftz')

In [28]:
model = fasttext.train_supervised(
    input="/tmp/news.train", lr=1.0, epoch=2,
    loss='ova'
)
# model.save_model('news_model.bin')
# model.quantize(input='news_model.bin', qnorm=True, retrain=True, epoch=1, cutoff=100000)
# model.save_model('news_model.ftz')

In [29]:
ft_val_preds = model.predict(list(df_val['full_text']), k=4, threshold=0.1)

In [30]:
def ft_pred_to_final(labels, probs):
    # select top 4
    labels = labels[:4]
    probs = probs[:4]

    return [
        x.replace('__label__', '')
        for x, p in zip(labels, probs)
        if p > probs[0] / 2
    ]

In [31]:
Y_val_pred = np.zeros_like(Y_small[n_train:])
for idx in range(Y_val_pred.shape[0]):
    preds = ft_pred_to_final(ft_val_preds[0][idx], ft_val_preds[1][idx])
    for lab in preds:
        lab_idx = all_tags.index(lab)
        Y_val_pred[idx, lab_idx] = 1

In [32]:
print(metrics.classification_report(Y_small[n_train:], Y_val_pred, target_names=all_tags))

                     precision    recall  f1-score   support

        agriculture       0.74      0.69      0.71      7334
         automotive       0.76      0.68      0.72      8717
           aviation       0.73      0.64      0.69      3558
              banks       0.71      0.57      0.63     11238
              bonds       0.50      0.33      0.40      1827
             budget       0.66      0.50      0.57      3108
          chemicals       0.58      0.31      0.40      1216
        commodities       0.79      0.72      0.76     14900
          companies       0.75      0.85      0.80     77166
  consumer-durables       0.58      0.36      0.44       905
        coronavirus       0.78      0.69      0.73      8489
    current-affairs       0.56      0.67      0.61     42784
        derivatives       0.82      0.76      0.79       842
         e-commerce       0.64      0.50      0.56      2581
           earnings       0.93      0.89      0.91     28773
            economy    

  _warn_prf(average, modifier, msg_start, len(result))


# Save in DB

In [69]:
model = fasttext.load_model('news_model.ftz')

In [None]:
with get_db_conn() as conn:
    qry = {
        'text': {'$exists': True},
        'predicted_tags': {'$exists': False},
        'date': {'$gt': datetime(2020, 6, 1)}
    }
    all_articles = list(tqdm(conn.finance.news.find(qry, {'html': 0})))
    df_update = pd.DataFrame.from_dict(all_articles).set_index('_id')

df_update['full_text'] = df_update['title'] + ' ' + df_update['description'] + ' ' + df_update['text']
df_update['full_text'] = [normalize_text(x) for x in tqdm(df_update['full_text'])]

In [None]:
ft_preds = model.predict(list(df_update['full_text']), k=4, threshold=0.1)
df_update['ft_preds'] = [
    ft_pred_to_final(ft_preds[0][idx], ft_preds[1][idx])
    for idx in range(df_update.shape[0])
]

In [None]:
with get_db_conn() as conn:
    for article_id, ft_pred in tqdm(zip(df_update.index, df_update['ft_preds']), total=len(df_update)):
        conn.finance.news.update_one(
            {'_id': article_id},
            {'$set': {'predicted_tags': ft_pred}}
        )

In [None]:
example_full_text = df_update.iloc[0]['full_text']
model.predict(example_full_text, 4, 0.1)

# TFIDF

In [None]:
tfidf = feature_extraction.text.TfidfVectorizer(
    stop_words=stopwords.words("english"), max_features=10000
)
tfidf.fit(df['full_text'].iloc[:100000])

In [None]:
clf_val_pred = clf.predict(X_val)
labels = sorted(df_val['category'].unique())

conf = metrics.confusion_matrix(
    df_val['category'], clf_val_pred, labels=labels, 
    normalize='true'
)

print(metrics.classification_report(df_val['category'], clf_val_pred))

conf = pd.DataFrame(conf, index=labels, columns=labels)
plt.figure(figsize = (10*1.6,10))
sns.heatmap(conf.round(2) * 100, annot=True)

In [None]:
# ftr_names = tfidf.get_feature_names()

# means = np.array(X_train.mean(axis=0))
# coefs = clf.coef_ / (means ** 0.5)

# for idx, class_ in enumerate(clf.classes_):
#     top_coefs = np.argsort(-np.abs(coefs[idx]))
#     print(class_)
#     for coef_idx in top_coefs[:20]:
#         print(coef_idx, ftr_names[coef_idx], round(coefs[idx, coef_idx]))
        
#     print('---')

In [None]:
open('/tmp/news.unsup', 'w').write('\n'.join(df.sort_values('date', ascending=False)['full_text'].iloc[:10000]))

model_emebds = fasttext.train_unsupervised(
    input="/tmp/news.unsup",
    dim=300,
    wordNgrams=2
)

In [None]:
with get_db_conn() as conn:
    qry = {
        'text': {'$exists': True},
        'date': {
            '$gt': datetime(2020, 6, 29),
#             '$lt': datetime(2020, 6, 29),
         }
    }
    all_articles = list(tqdm(conn.finance.news.find(qry, {'html': 0})))
    daily_news = pd.DataFrame.from_dict(all_articles).set_index('_id')

daily_news['full_text'] = daily_news['title'] + ' ' + daily_news['description'] + ' ' + daily_news['text']
daily_news['full_text'] = [normalize_text(x) for x in tqdm(daily_news['full_text'])]

In [None]:
X_daily = np.array([model.get_sentence_vector(x) for x in daily_news['full_text']])
X_daily_tfidf = tfidf.transform(daily_news['full_text'])

In [None]:
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

cos = metrics.pairwise.cosine_distances(X_daily, X_daily)
graph = csr_matrix(cos < 0.05)

n_components, labels = connected_components(csgraph=graph, directed=False, return_labels=True)
comps, counts = np.unique(labels, return_counts=True)
for x in comps[counts > 1]:
    print(np.where([labels == x])[1], daily_news.iloc[labels == x]['title'].to_list())
    print('---')


In [None]:
# from sklearn import cluster
# dbscan = cluster.DBSCAN(eps=0.03, metric='cosine', min_samples=2)
# pred = dbscan.fit_predict(X_daily)

In [None]:
# for x in range(pred.max()):
#     print(daily_news.iloc[pred == x]['title'].to_list(), len(pred[pred==x]))
#     print('---')

In [None]:
cos = metrics.pairwise.cosine_distances(X_daily, X_daily)
cos_tfidf = metrics.pairwise.cosine_distances(X_daily_tfidf, X_daily_tfidf)

In [None]:
# sns.distplot(cos[cos < 0.2])

In [None]:
# sns.distplot(np.sort(cos, axis=1)[:, 2])

In [None]:
# for x, y in zip(*np.where(cos < 0.05)):
#     if x != y:
#         print(daily_news.iloc[x].title, daily_news.iloc[y].title)

In [None]:
n = 660#np.random.choice(X_daily.shape[0])
print(n, daily_news.iloc[n].title, daily_news.iloc[n].website)
for i in np.argsort(cos[n])[1:10]:
    print(i, daily_news.iloc[i].title, daily_news.iloc[i].website, cos[n, i], cos_tfidf[n, i])