# Article Labeling & Lexical Fields Finding

In [12]:
import ast
import copy
import functools
import json
import pickle
import re
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

## Add cProfile for evaluation of a function's speed

In [19]:
import cProfile,pstats, io
def profile(fct):
    """Decorator that profiles every call of ``fct`` with cProfile.

    Use by writing ``@profile`` above any function that needs evaluation.
    After each call the collected statistics, sorted by cumulative time,
    are printed to stdout.

    Args:
        fct: the callable to profile.

    Returns:
        A wrapper with the same call signature that returns whatever
        ``fct`` returns.
    """
    @functools.wraps(fct)
    def inner(*args, **kwargs):
        pr = cProfile.Profile()
        pr.enable()
        try:
            retval = fct(*args, **kwargs)
        finally:
            # Stop collecting even when fct raises.
            pr.disable()
        s = io.StringIO()
        sortBy = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortBy)
        ps.print_stats()
        print (s.getvalue())
        return retval
    # Without this return the decorated name would be rebound to None.
    return inner

## Download Unlabelled articles

In [20]:
import json

# Every line of the export is parsed as one standalone JSON document.
with open('./data/20200420_20200714_business_articles.json') as f:
    raw_json_data = [json.loads(line) for line in f]

In [21]:
# Quick look at what was loaded: container type, record type,
# the fields of the first record and the number of records.
print ("data type", type(raw_json_data))
first_article = raw_json_data[0]
print ("json", type(first_article))
print ("keys", first_article.keys())
print ("length", len(raw_json_data))

data type <class 'list'>
json <class 'dict'>
keys dict_keys(['published', 'link', 'message', 'Feed', 'title', '@version', 'author', '@timestamp', 'full-text', 'type'])
length 416307


## Fetching Company Names & Related Names (49 companies)

In [1]:
# Load the company list; every spreadsheet row holds "<company>;<related name>".
df = pd.read_excel(r'./data/relevant_words/comapny_name-related_words.xlsx', header=None)
# Lower-case every entry (the cleaned article text is lower-cased as well)
df[0] = [entry.lower() for entry in df[0]]
# Split each entry into the company name and one related name
split = np.array([entry.split(";") for entry in df[0]])
df["company_name"] = split[:, 0]
df["related_name"] = split[:, 1]
df.drop(columns=[0], inplace=True)
# Map each company (in order of first appearance) to the list of its related names
company_names = df["company_name"].unique()
dict_companies = {
    name: list(df.loc[df["company_name"] == name, "related_name"])
    for name in company_names
}
print (len(dict_companies.keys()), dict_companies.keys())
print (dict_companies["21st century fox"])

NameError: name 'pd' is not defined

## Fetching Company Names & Related Names(49 companies) unlowered

In [None]:
# Load the company list again; every spreadsheet row holds "<company>;<related name>".
df = pd.read_excel(r'./data/relevant_words/comapny_name-related_words.xlsx', header=None)
# Keep the original casing and join multi-word names with underscores
# (this variant is the one passed to the Google model lookup further down)
df[0] = [entry.replace(" ", "_") for entry in df[0]]
# Split each entry into the company name and one related name
split = np.array([entry.split(";") for entry in df[0]])
df["company_name"] = split[:, 0]
df["related_name"] = split[:, 1]
df.drop(columns=[0], inplace=True)
# Map each company (in order of first appearance) to the list of its related names
company_names = df["company_name"].unique()
dict_companies_unlowered = {
    name: list(df.loc[df["company_name"] == name, "related_name"])
    for name in company_names
}
print (len(dict_companies_unlowered.keys()), dict_companies_unlowered.keys())
#print (dict_companies_unlowered['21st_Century_Fox'])

## Extracting url, title & full_text of each article:

In [116]:
urls = []
plain_texts = []
titles = []
labels = []

# Articles at or below this many characters are dropped.
min_article_size = 2000
# Texts starting with one of these markers are scraper failures, not articles.
failed_download_marker = "Article `download()` failed"
cookie_wall_marker = "Please enable cookies"

for article in raw_json_data:
    plain_text = article.get('full-text')
    title = article.get('title')
    url = article.get('link')
    if not plain_text:
        continue
    if plain_text.startswith(failed_download_marker) or plain_text.startswith(cookie_wall_marker):
        continue
    if len(plain_text) <= min_article_size:
        continue
    plain_texts.append(plain_text)
    urls.append(url)
    titles.append(title)
    labels.append([])  # labels are filled in later by the labeling step
       

## Build DataFrame with extracted data

In [117]:
#Statistics (number of articles left after each filter)
# 358192 removing "Article `download()` failed" 
# 340987 removing "Article `download()` failed" and "Please enable cookies"
# 215039 removing "Article `download()` failed" and "Please enable cookies" and size<min_article_size = 2000
columns=["url", "title", "plain_text", "label"]
# Build the frame column by column. Stacking the four lists with np.array
# mixes strings with (empty) label lists, i.e. a ragged array, which NumPy
# only deprecated at first and now rejects with a ValueError.
df_articles = pd.DataFrame(dict(zip(columns, [urls, titles, plain_texts, labels])))

  """


In [118]:
# Preview the last rows of the assembled article table.
df_articles.tail()

Unnamed: 0,url,title,plain_text,label
215034,http://rssfeeds.usatoday.com/~/t/0/0/usatodayc...,Michigan partygoers test positive for COVID-19...,Michigan partygoers test positive for COVID-19...,[]
215035,https://www.washingtontimes.com/news/2020/jul/...,Coast Guard officials decline to testify on ra...,"NEW LONDON, Conn. (AP) - A planned congression...",[]
215036,https://www.denverpost.com/2020/07/08/united-a...,"United Airlines will slash nearly 36,000 jobs ...",United Airlines plans to furlough as many as 3...,[]
215037,https://www.washingtontimes.com/news/2020/jul/...,The Latest: Pence says CDC will issue guidance...,WASHINGTON - Vice President Mike Pence says th...,[]
215038,https://www.washingtontimes.com/news/2020/jul/...,US rejects nearly all Chinese claims in South...,WASHINGTON (AP) - The Trump administration esc...,[]


# Building df_clean - Cleaning full_text of articles (original way)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.tokenize import word_tokenize

In [13]:
# Remove every non-letter/number character
#n_articles = 10000
#df_cleaned = df_articles.head(n_articles).copy(deep= True)
df_cleaned = df_articles.copy(deep= True)

def _normalise_text(text):
    """Lower-case ``text``, collapse whitespace runs and blank out every
    character that is not a-z or 0-9.

    Runs of spaces created by the last substitution are left in place;
    they are collapsed by the stop-word removal step.
    """
    text = re.sub(r'\s+', ' ', text.lower())
    return re.sub("[^a-z0-9]", ' ', text)

# Assign the whole column at once: rows yielded by iterrows() may be copies,
# so writing to them is not guaranteed to change the DataFrame.
df_cleaned["plain_text"] = df_cleaned["plain_text"].map(_normalise_text)
df_cleaned.loc[0, "plain_text"][:100]

'eliminated masterchef contestant harry foster has hit back at unfair criticism against judge melissa'

## Find Stop Words & Removing them from plain text

In [25]:
import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK; used below to strip stop words from the articles.
stop_words = stopwords.words('english')

In [None]:
# Remove all stop words from plain text
# Filtering whitespace-separated tokens against a set (instead of one
# ' word ' regex pass per stop word) also removes stop words at the very
# start/end of a text and directly repeated ones, and needs a single pass
# per article. Joining with one space collapses the remaining space runs.
stop_word_set = set(stop_words)
df_cleaned["plain_text"] = df_cleaned["plain_text"].map(
    lambda text: " ".join(token for token in text.split() if token not in stop_word_set)
)
df_cleaned.loc[0, "plain_text"]

## Labeling Articles with Company Names 
### Check if Articles mention Company Names

In [None]:
# Label every article with the companies whose name, or one of whose related
# names, occurs in its cleaned text.
company_names = dict_companies.keys()

def _mention_pattern(names):
    """Compile a regex matching any of ``names`` as a whole word or phrase.

    The cleaned text only contains a-z, 0-9 and spaces, so a mention must not
    touch another letter or digit on either side. Plain substring tests also
    fire inside longer words (e.g. 'cisco' inside 'francisco').
    """
    alternatives = "|".join(re.escape(name) for name in names if name)
    return re.compile(r"(?<![a-z0-9])(?:" + alternatives + r")(?![a-z0-9])")

company_patterns = {
    company: _mention_pattern([company] + list(dict_companies[company]))
    for company in company_names
}
# NOTE(review): names are matched as written in the spreadsheet; names that
# contain punctuation or stop words cannot match the cleaned text - confirm
# whether they should be normalised the same way as the articles.
# Assign the column in one go instead of mutating rows yielded by iterrows().
df_cleaned["label"] = [
    [company for company, pattern in company_patterns.items() if pattern.search(text)]
    for text in df_cleaned["plain_text"]
]
df_cleaned["label"].head()

In [108]:
# Getting data from csv
# Reloads df_cleaned from a CSV snapshot (presumably written by an earlier run
# of the cleaning/labeling cells - the saving cell is not shown here).
# The label column comes back as text and is re-parsed in the next cell.
PATH = "./data/"
file = "cleaned_articles_200k"
df_cleaned = pd.read_csv(PATH + file + ".csv") 

In [109]:
# Reformatting necessary after loading: the CSV stores every label list as its
# text representation, e.g. "['advanced micro devices', 'nvidia']".
# ast.literal_eval parses that text back into a list (also when a name
# contains a quote character). Values that are not strings are left untouched,
# so re-running this cell is harmless.
df_cleaned["label"] = df_cleaned["label"].map(
    lambda value: list(ast.literal_eval(value)) if isinstance(value, str) else value
)

In [110]:
# Spot check: second label of the article at index 2 (a string once the labels are parsed into lists).
print (df_cleaned["label"][2][1])

nvidia


In [119]:
# Preview the last rows of the cleaned and labeled articles.
df_cleaned.tail()

Unnamed: 0,url,title,plain_text,label
215034,http://rssfeeds.usatoday.com/~/t/0/0/usatodayc...,Michigan partygoers test positive for COVID-19...,michigan partygoers test positive covid 19 jul...,[]
215035,https://www.washingtontimes.com/news/2020/jul/...,Coast Guard officials decline to testify on ra...,new london conn ap planned congressional heari...,"[autodesk, cisco]"
215036,https://www.denverpost.com/2020/07/08/united-a...,"United Airlines will slash nearly 36,000 jobs ...",united airlines plans furlough many 35 902 u e...,[]
215037,https://www.washingtontimes.com/news/2020/jul/...,The Latest: Pence says CDC will issue guidance...,washington vice president mike pence says cent...,[autodesk]
215038,https://www.washingtontimes.com/news/2020/jul/...,US rejects nearly all Chinese claims in South...,washington ap trump administration escalated a...,[autodesk]


### Get number of articles with labels

In [95]:
# Number of articles that carry at least one company label.
# (The former debug prints for index 0 and 2 read row["label"][1]
# unconditionally and raised IndexError on single-label articles.)
labeled = int((df_cleaned["label"].map(len) > 0).sum())
print ("There are %d labeled articles in the %d articles of the corpus"%(labeled, len (df_cleaned["label"])))

should print a list: ['advanced micro devices', 'nvidia'] 2 nvidia
There are 121097 labeled articles in the 215039 articles of the corpus


## Count Number of Articles that each Company is Associated to.

In [96]:
# Start every company at zero so companies without articles are reported too
dict_count = {company: 0 for company in company_names}

# One pass over the label lists; an article counts once for each company it is labeled with
for article_labels in df_cleaned["label"]:
    for company in company_names:
        if company in article_labels:
            dict_count[company] += 1

# Companies that have at least one associated article
companies_w_articles = [company for company in company_names if dict_count[company] > 0]
print ("there are %d companies with associated articles over the %d total companies"%(len(companies_w_articles),len(company_names)) )
#dict_count

there are 49 companies with associated articles over the 49 total companies


In [97]:
# Number of articles labeled with "apple".
dict_count["apple"]

21649

# POS TAGGING prep for Tf.Idf

In [100]:
# Imports
# Download the 'averaged_perceptron_tagger' NLTK resource
# (presumably the model behind nltk.pos_tag, used in the next cell).
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pierre/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# POS tags that count as nouns
nouns = ["NN","NNS","NNP","NNPS"]

def _extract_nouns(paragraph):
    """Return the nouns of ``paragraph`` as one string.

    The text is split into sentences, every sentence is tokenized and
    POS-tagged separately, and only tokens tagged as nouns are kept. Each kept
    token is lower-cased, stripped of every character outside a-z and
    preceded by a single space (same output format as before).
    """
    noun_tags = set(nouns)
    kept = []
    for sentence in nltk.sent_tokenize(paragraph):
        for token, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if tag in noun_tags:
                kept.append(" " + re.sub("[^a-z]", '', token.lower()))
    # Join once instead of growing a string inside the loop.
    return "".join(kept)

df_pos = df_articles.copy(deep= True)
df_pos["label"] = df_cleaned["label"]
# Assign the whole column: rows yielded by iterrows() may be copies, so
# writing to them is not guaranteed to change df_pos. Passing total lets
# tqdm show a percentage and an ETA for this multi-hour loop.
df_pos["plain_text"] = [
    _extract_nouns(paragraph)
    for paragraph in tqdm(df_pos["plain_text"], total=len(df_pos))
]
#48s for 1000
#2h50 for 216000
#start 6pm

96676it [5:44:17, 25.85it/s] 

In [None]:
# saving data to csv
# Persist the noun-only articles (the tagging loop above takes hours) without the index column.
PATH = "./data/"
file = "pos_articles_200k"
df_pos.to_csv(PATH + file + ".csv",index=False)

In [123]:
# Getting data from csv
# Reload the noun-only articles saved above; the label column comes back as text.
PATH = "./data/"
file = "pos_articles_200k"
df_pos = pd.read_csv(PATH + file + ".csv") 

In [124]:
# Preview the reloaded frame; labels are still stored as strings at this point.
df_pos.head()

Unnamed: 0,url,title,plain_text,label
0,https://www.dailymail.co.uk/tvshowbiz/article-...,MasterChef's Harry Foster hits back at claims ...,masterchef harry foster criticism judge melis...,[]
1,https://www.washingtontimes.com/news/2020/jun/...,"Protest arrests logjam tests NYC legal system,...",new york ap wave arrests new york city death ...,[]
2,https://www.dailymail.co.uk/news/article-83114...,Labour's Anneliese Dodds says she will REFUSE ...,shadow minister today evidence children schoo...,"['advanced micro devices', 'nvidia']"
3,http://feeds.reuters.com/~r/Reuters/worldNews/...,Civil unrest rages in Minneapolis over raciall...,minneapolis reuters rallies way night arson v...,[]
4,https://www.dailymail.co.uk/news/article-82734...,Australia 'beats the cr*p' out of coronavirus ...,australia c p coronavirus states territorie...,['apple']


In [126]:
# Reformatting necessary after loading: the CSV stores every label list as its
# text representation, e.g. "['advanced micro devices', 'nvidia']".
# ast.literal_eval parses that text back into a list (also when a name
# contains a quote character). Values that are not strings are left untouched,
# so re-running this cell is harmless.
df_pos["label"] = df_pos["label"].map(
    lambda value: list(ast.literal_eval(value)) if isinstance(value, str) else value
)

In [128]:
# Preview again after the labels have been converted back to lists.
df_pos.head()

Unnamed: 0,url,title,plain_text,label
0,https://www.dailymail.co.uk/tvshowbiz/article-...,MasterChef's Harry Foster hits back at claims ...,masterchef harry foster criticism judge melis...,[]
1,https://www.washingtontimes.com/news/2020/jun/...,"Protest arrests logjam tests NYC legal system,...",new york ap wave arrests new york city death ...,[]
2,https://www.dailymail.co.uk/news/article-83114...,Labour's Anneliese Dodds says she will REFUSE ...,shadow minister today evidence children schoo...,"[advanced micro devices, nvidia]"
3,http://feeds.reuters.com/~r/Reuters/worldNews/...,Civil unrest rages in Minneapolis over raciall...,minneapolis reuters rallies way night arson v...,[]
4,https://www.dailymail.co.uk/news/article-82734...,Australia 'beats the cr*p' out of coronavirus ...,australia c p coronavirus states territorie...,[apple]


## Tf.Idf to get top 20 words for each company (that have articles related to them)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [27]:
# Tf.Idf on Companies that have Associated Articles
# For every company, all of its articles are merged into one big document that
# is scored against the remaining articles of the corpus.
relevant_words_tfidf = {}
article_texts = df_cleaned["plain_text"]
for company in tqdm(companies_w_articles): # for all companies in companies_w_articles

    # stop_words must be the string 'english' to select the built-in list;
    # the set {'english'} is read as a custom list holding the single word "english".
    tfidf_vectorizer=TfidfVectorizer(stop_words = 'english',ngram_range = (1,1))

    is_company_article = df_cleaned["label"].map(lambda labels: company in labels)
    # Company "big article": every text preceded by one space, joined in one go
    company_article = "".join(" " + text for text in article_texts[is_company_article])
    # The company document goes first, followed by the rest of the corpus
    plain_text_list = [company_article] + list(article_texts[~is_company_article])
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(plain_text_list)

    # Only the tf-idf scores of the company document (row 0) are of interest
    first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]

    # get_feature_names() was replaced by get_feature_names_out() in newer scikit-learn
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    # A separate name is used here so the global `df` is no longer overwritten
    company_scores = pd.Series(first_vector_tfidfvectorizer.toarray().ravel(), index=feature_names)
    top_words = company_scores.sort_values(ascending=False).head(40) # Take top 40 words

    relevant_words_tfidf[company] = list(zip(top_words.index, top_words))
    #print (relevant_words_tfidf[company])
    
#100%|██████████| 52/52 [7:31:13<00:00, 520.64s/it]
#100%|██████████| 52/52 [21:46:51<00:00, 1507.90s/it]    
#100%|██████████| 49/49 [23:20:51<00:00, 1715.33s/it] 

100%|██████████| 49/49 [23:20:51<00:00, 1715.33s/it]   


In [13]:
# load dictionary 
PATH = "./relevant_words/english/"
file = "relevant_words_tfidf_nouns"
# The context manager closes the file once the JSON has been read.
with open(PATH + file + ".json", "r") as a_file:
    relevant_words_tfidf = json.load(a_file)
#relevant_words_tfidf = dict(relevant_words_tfidf)

In [14]:
# JSON has no tuples: turn every loaded [word, score] pair back into a (word, score) tuple.
for company, scored_words in relevant_words_tfidf.items():
    relevant_words_tfidf[company] = [(entry[0], entry[1]) for entry in scored_words]

In [15]:
# Top tf-idf words stored for "apple" as (word, score) tuples.
relevant_words_tfidf['apple']

[('apple', 0.2475652136129911),
 ('quarter', 0.23112514445265792),
 ('analyst', 0.19460750540513994),
 ('officer', 0.17494318088381838),
 ('chief', 0.14839686237938338),
 ('executive', 0.138381270476057),
 ('thank', 0.1296944471748394),
 ('question', 0.12294664642888876),
 ('business', 0.12012922318884033),
 ('year', 0.11840247545454582),
 ('president', 0.101373101610324),
 ('operator', 0.10130650107513707),
 ('company', 0.09502443181911849),
 ('us', 0.09401801300102997),
 ('customers', 0.09348121939819964),
 ('people', 0.09190341719760198),
 ('call', 0.08875236217600309),
 ('thanks', 0.08659283250156699),
 ('market', 0.08640074291797184),
 ('cash', 0.08573599811088516),
 ('time', 0.0856133821129408),
 ('growth', 0.08391488004698593),
 ('covid', 0.0827728524084005),
 ('kind', 0.08204520626243436),
 ('revenue', 0.08165670645591498),
 ('sales', 0.07704032303820325),
 ('capital', 0.07505597343383999),
 ('today', 0.07087136603314527),
 ('coronavirus', 0.06969390375154015),
 ('portfolio', 0

In [60]:
#companies_w_articles

# Word Vectorization

In [16]:
# For a given model, get the n_words most similar words for every company
# and store everything in a dictionary, like for tf.idf.
def getTopWords(model, n_words, dict_companies):
    """Return the ``n_words`` most similar words for every company.

    Args:
        model: a gensim Word2Vec model or KeyedVectors instance.
        n_words: number of neighbours requested per company (``topn``).
        dict_companies: maps a company name to the list of its related names.

    Returns:
        dict mapping company name to the result of ``most_similar``.
        The company name itself is looked up first; if it is not in the
        vocabulary, the first related name that is in the vocabulary is used.
        Companies without any known name are left out of the result.
    """
    # Full models expose their vectors as `.wv`; plain KeyedVectors are used as-is.
    vectors = getattr(model, "wv", model)
    # Newer gensim exposes the vocabulary as `key_to_index`, older releases as `vocab`.
    vocab = getattr(vectors, "key_to_index", None)
    if vocab is None:
        vocab = vectors.vocab

    relevant_words = {}
    for company, related_words in dict_companies.items():
        for candidate in [company] + list(related_words):
            if candidate in vocab:
                relevant_words[company] = vectors.most_similar(candidate, topn=n_words)
                break
    return relevant_words

In [17]:
# importing all necessary modules 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
#nltk.download('punkt')
import warnings 
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

import collections

from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [None]:
data = []

# Tokenize every cleaned article; data ends up with one token list per article
data.extend(word_tokenize(plain_text) for plain_text in df_cleaned["plain_text"])

In [141]:
# Load `data` from a pickle file (presumably the tokenized plain texts saved
# by an earlier run - the saving cell is not shown here).
# NOTE(review): pickle.load can execute arbitrary code; only open files produced locally.
PATH = "./data/"
file = "list_tokenized_pt"
with open (PATH +file, 'rb') as fp:
    data = pickle.load(fp)

## CBOW (a local context window method) to get top 20 words of a company

In [3]:
# load CBOW
# Loads the saved CBOW word vectors from disk; mmap='r' is passed so the
# stored arrays are opened read-only via memory mapping.
PATH = "./data/models/"
file = "CBOW_model_200k"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
model_cbow = word_vectors

In [32]:
# Sanity checks on the CBOW vectors: neighbours of 'google', the
# king - man + woman analogy and the similarity between 'apple' and 'man'.
print (model_cbow.most_similar('google'))
vec = model_cbow['king'] - model_cbow['man'] + model_cbow['woman']
print ()
print (model_cbow.most_similar([vec]))
print()
print(model_cbow.similarity('apple', 'man'))

[('googles', 0.7780945301055908), ('apple', 0.7284873723983765), ('googl', 0.704851508140564), ('alphabet', 0.6509729623794556), ('spotify', 0.6320379972457886), ('facebook', 0.6315559148788452), ('microsoft', 0.6176584362983704), ('alphabets', 0.6062008142471313), ('apps', 0.605197548866272), ('stadia', 0.5996901988983154)]

[('king', 0.8107793927192688), ('godfather', 0.5998413562774658), ('thatcher', 0.5920987129211426), ('mitford', 0.5835937261581421), ('altimus', 0.5723137259483337), ('chemouny', 0.5638600587844849), ('atwood', 0.5631056427955627), ('macbeth', 0.5596096515655518), ('enid', 0.5574297308921814), ('antoinette', 0.5557938814163208)]

-0.056162722


## Local context window methods to get top 20 words on a company

In [4]:
# load sg
# Loads the saved skip-gram word vectors from disk; mmap='r' is passed so the
# stored arrays are opened read-only via memory mapping.
PATH = "./data/models/"
file = "skip-gram_model"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
model_sg = word_vectors

In [146]:
# Vocabulary size of the skip-gram vectors.
len(model_sg.wv.vocab)

255358

In [147]:
# Sanity checks on the skip-gram vectors: neighbours of 'apple', the
# king - man + woman analogy and the similarity between 'apple' and 'man'.
print (model_sg.most_similar('apple'))
vec = model_sg['king'] - model_sg['man'] +model_sg['woman']
print ()
print (model_sg.most_similar([vec]))
print()
print(model_sg.similarity('apple', 'man'))

[('aapl', 0.7750053405761719), ('iphone', 0.7719913125038147), ('google', 0.7477635145187378), ('iphones', 0.6974998712539673), ('android', 0.6899538636207581), ('spotify', 0.6810340285301208), ('9to5mac', 0.6795322895050049), ('watchos', 0.679144024848938), ('alphabet', 0.6761608123779297), ('carkey', 0.6720519661903381)]

[('king', 0.8746283054351807), ('coretta', 0.6387332677841187), ('suffragettes', 0.6350522041320801), ('zog', 0.5962809324264526), ('stenhammar', 0.5960381031036377), ('foiling', 0.5941222906112671), ('hietpas', 0.5872020721435547), ('ducruet', 0.5851268768310547), ('luther', 0.5800729990005493), ('khesar', 0.5800399780273438)]

0.21863858


## GloVe to get top 20 words of a company

In [28]:
# GloVe is a global log-bilinear regression model
#from gensim.scripts.glove2word2vec import glove2word2vec
#glove_input_file = 'glove.txt'
#word2vec_output_file = 'word2vec.txt'
#glove2word2vec(glove_input_file, word2vec_output_file)

In [5]:
# load glove
# Loads the saved GloVe word vectors from disk; mmap='r' is passed so the
# stored arrays are opened read-only via memory mapping.
PATH = "./data/models/"
file = "glove_model"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
glove_model = word_vectors

In [58]:
# Vocabulary size of the GloVe vectors.
len(glove_model.wv.vocab)

400000

In [26]:
# Sanity checks on the GloVe vectors: neighbours of 'apple', the
# king - man + woman analogy and the similarity between 'apple' and 'man'.
print (glove_model.most_similar('apple'))
vec = glove_model['king'] - glove_model['man'] +glove_model['woman']
print ()
print (glove_model.most_similar([vec]))
print()
print(glove_model.similarity('apple', 'man'))

[('iphone', 0.5987042188644409), ('macintosh', 0.5836331248283386), ('ipod', 0.5761123895645142), ('microsoft', 0.5663833022117615), ('ipad', 0.5628098249435425), ('intel', 0.5457563400268555), ('ibm', 0.5286195278167725), ('google', 0.5282472372055054), ('imac', 0.5072520971298218), ('software', 0.4962984323501587)]

[('king', 0.8065859079360962), ('queen', 0.689616322517395), ('monarch', 0.5575490593910217), ('throne', 0.5565374493598938), ('princess', 0.5518684387207031), ('mother', 0.5142154693603516), ('daughter', 0.5133156776428223), ('kingdom', 0.5025345087051392), ('prince', 0.5017740726470947), ('elizabeth', 0.49080315232276917)]

0.090478964


# Google sg

In [6]:
# load google sg
# Loads the saved Google skip-gram word vectors from disk; mmap='r' is passed
# so the stored arrays are opened read-only via memory mapping.
PATH = "./data/models/"
file = "model_google_sg"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
model_google_sg = word_vectors

In [57]:
# Vocabulary size of the Google skip-gram vectors.
len(model_google_sg.wv.vocab)

3000000

In [51]:
# Sanity checks on the Google vectors: neighbours of lower-case 'apple', the
# king - man + woman analogy and the similarity between 'apple' and 'man'.
print (model_google_sg.most_similar('apple'))
vec = model_google_sg['king'] - model_google_sg['man'] +model_google_sg['woman']
print ()
print (model_google_sg.most_similar([vec]))
print()
print(model_google_sg.similarity('apple', 'man'))

[('Apple_AAPL', 0.7456986308097839), ('Apple_Nasdaq_AAPL', 0.7300410270690918), ('Apple_NASDAQ_AAPL', 0.7175089120864868), ('Apple_Computer', 0.7145973443984985), ('iPhone', 0.6924266815185547), ('Apple_NSDQ_AAPL', 0.6868604421615601), ('Steve_Jobs', 0.6758421659469604), ('iPad', 0.6580768823623657), ('Apple_nasdaq_AAPL', 0.6444970369338989), ('AAPL_PriceWatch_Alert', 0.6439753174781799)]

[('king', 0.8449392318725586), ('queen', 0.7300517559051514), ('monarch', 0.6454660892486572), ('princess', 0.6156250834465027), ('crown_prince', 0.5818676352500916), ('prince', 0.577711820602417), ('kings', 0.5613664388656616), ('sultan', 0.5376776456832886), ('Queen_Consort', 0.5344247817993164), ('queens', 0.5289887189865112)]

0.11685416


## fastText & LSTM

In [59]:
# todo

52

# Combine all the models into one & score each word

In [18]:
# Retrieve the most similar words of every model, reporting for each
# source the number of companies covered and the length of the first word list.
n_relevant_words = 20

def _print_size(name, words_by_company):
    """Print ``name``, the number of companies and the size of the first company's list."""
    first_company = list(words_by_company.keys())[0]
    print (name, len(words_by_company), len(words_by_company[first_company]))

# tf.idf (already available at this point)
_print_size("relevant_words_tfidf", relevant_words_tfidf)
# GloVe
relevant_words_glove = getTopWords(glove_model, n_relevant_words, dict_companies)
_print_size("relevant_words_glove", relevant_words_glove)
# CBOW
relevant_words_cbow = getTopWords(model_cbow, n_relevant_words, dict_companies)
_print_size("relevant_words_cbow", relevant_words_cbow)
# SG
relevant_words_sg = getTopWords(model_sg, n_relevant_words, dict_companies)
_print_size("relevant_words_sg", relevant_words_sg)
# Google SG: looked up with the case-preserving, underscore-joined names
relevant_words_google_sg = getTopWords(model_google_sg, n_relevant_words, dict_companies_unlowered)
_print_size("relevant_words_google_sg", relevant_words_google_sg)

# Show what every source found for Apple
for name, words_by_company, key in (
    ("relevant_words_tfidf", relevant_words_tfidf, "apple"),
    ("relevant_words_glove", relevant_words_glove, "apple"),
    ("relevant_words_cbow", relevant_words_cbow, "apple"),
    ("relevant_words_sg", relevant_words_sg, "apple"),
    ("relevant_words_google_sg", relevant_words_google_sg, "Apple"),
):
    print (name, words_by_company[key])


relevant_words_tfidf 49 40


NameError: name 'dict_companies' is not defined

In [126]:
# Companies covered by one of the two models but not by the other
print ("Words in sg but not in glove:")
for company in [name for name in relevant_words_sg if name not in relevant_words_glove]:
    print (company)
print ("Words in glove but not in sg:")
for company in [name for name in relevant_words_glove if name not in relevant_words_sg]:
    print (company)
print("Words in ")

words in sg but not in glove
stericycle
words in glove but not in sg
dish network


In [61]:
# Words the Google model returns for lower-case "apple".
# The label now names the dictionary that is actually printed.
print ("relevant_words_google_sg",relevant_words_google_sg["apple"])

relevant_words_sg [('apples', 0.720359742641449), ('pear', 0.6450697183609009), ('fruit', 0.641014575958252), ('berry', 0.6302294135093689), ('pears', 0.6133961081504822), ('strawberry', 0.6058261394500732), ('peach', 0.6025872230529785), ('potato', 0.5960935354232788), ('grape', 0.5935864448547363), ('blueberry', 0.5866668224334717), ('cherries', 0.5784382224082947), ('mango', 0.5751855373382568), ('apricot', 0.5727777481079102), ('melon', 0.5719985365867615), ('almond', 0.5704830288887024), ('Granny_Smiths', 0.5695333480834961), ('grapes', 0.5692256093025208), ('peaches', 0.5659247040748596), ('pumpkin', 0.5651882886886597), ('apricots', 0.5645568370819092)]


In [200]:
def add_word(dictionary,tup):
    """Accumulate the score of a ``(word, score)`` tuple in ``dictionary``.

    If the word is already a key, the score is added to the stored value;
    otherwise the word is inserted with the given score. The dictionary is
    modified in place and nothing is returned.
    """
    word, score = tup[0], tup[1]
    # Direct key lookup instead of scanning every key for equality.
    if word in dictionary:
        dictionary[word] += score
    else:
        dictionary[word] = score

def switch_tup(l, tup1, tup2):
    """Swap, in place, the elements of ``l`` at positions ``tup1`` and ``tup2``."""
    l[tup1], l[tup2] = l[tup2], l[tup1]
def bubble_sort_list_tup(l):
    """Sort a list of ``(word, score)`` tuples in place by descending score.

    Tuples with equal scores keep their relative order, as in the former
    hand-written bubble sort. The built-in sort is stable, also with
    ``reverse=True``, and runs in O(n log n) instead of O(n^2).
    """
    l.sort(key=lambda tup: tup[1], reverse=True)
    

# 1) Use the words’ similarity scores

In [244]:
# Merge the word lists of all sources per company; the score of a word is
# the sum of the scores it received (a word found by several sources adds up),
# and the words are stored by decreasing total score.
company_names = list(dict_companies.keys())
related_words_concat_1 = {company: {} for company in company_names}
word_sources = (relevant_words_glove, relevant_words_sg, relevant_words_cbow, relevant_words_tfidf)

for company in related_words_concat_1.keys():
    merged_scores = related_words_concat_1[company]
    for source in word_sources:
        # a source may not cover every company
        if company in source.keys():
            for scored_word in source[company]:
                add_word(merged_scores, scored_word)
    # Highest total score first
    related_words_concat_1[company] = dict(sorted(merged_scores.items(), key=lambda item: -item[1]))

print (related_words_concat_1["apple"])
len(related_words_concat_1.keys())

{'iphone': 2.0682634711265564, 'google': 2.0044981241226196, 'iphones': 1.7639256119728088, 'aapl': 1.40452641248703, 'spotify': 1.3477738499641418, 'android': 1.3440640568733215, 'googles': 1.323325276374817, 'homekit': 1.2814541459083557, 'alphabet': 1.2761602401733398, 'ios': 1.2752171754837036, 'macos': 1.255492925643921, 'airpods': 1.249264419078827, 'microsoft': 1.1840619444847107, 'apples': 1.0988799929618835, '9to5mac': 0.6795322895050049, 'watchos': 0.679144024848938, 'carkey': 0.6720519661903381, 'jailbreak': 0.660087525844574, 'earpods': 0.6569204926490784, 'app': 0.6537714004516602, 'tmsc': 0.65050208568573, 'betwildwood': 0.6501995921134949, 'osx': 0.6468653678894043, 'apps': 0.595978856086731, 'wearables': 0.5949651002883911, 'nvidia': 0.5927913188934326, 'googl': 0.5908902883529663, 'sonos': 0.5848830938339233, 'macintosh': 0.5836331248283386, 'ipod': 0.5761123895645142, 'ipad': 0.5628098249435425, 'intel': 0.5457563400268555, 'ibm': 0.5286195278167725, 'imac': 0.5072520

49

# 2) Score wrt. the number of lists they belong to

In [260]:
# Score every word by the number of times it occurs across the source lists

company_names = list(dict_companies.keys())
related_words_concat_2 = dict.fromkeys(company_names, 0)
word_sources = (relevant_words_glove, relevant_words_sg, relevant_words_cbow, relevant_words_tfidf)

for company in related_words_concat_2.keys():
    # gather the (word, score) tuples of every source that covers the company
    list_words = []
    for source in word_sources:
        if company in source.keys():
            list_words.extend(source[company])

    # count how often each word occurs in the gathered tuples
    frequency_map = {}
    for scored_word in list_words:
        frequency_map[scored_word[0]] = frequency_map.get(scored_word[0], 0) + 1

    # most frequent words first
    related_words_concat_2[company] = dict(sorted(frequency_map.items(), key=lambda item: -item[1]))

print (related_words_concat_2["apple"])
len(related_words_concat_2)

{'iphone': 3, 'google': 3, 'iphones': 3, 'microsoft': 2, 'apples': 2, 'aapl': 2, 'android': 2, 'spotify': 2, 'alphabet': 2, 'googles': 2, 'airpods': 2, 'homekit': 2, 'macos': 2, 'ios': 2, 'macintosh': 1, 'ipod': 1, 'ipad': 1, 'intel': 1, 'ibm': 1, 'imac': 1, 'software': 1, 'motorola': 1, 'computer': 1, 'itunes': 1, 'pc': 1, 'mac': 1, 'ipods': 1, 'cherry': 1, 'computers': 1, '9to5mac': 1, 'watchos': 1, 'carkey': 1, 'jailbreak': 1, 'app': 1, 'tmsc': 1, 'betwildwood': 1, 'osx': 1, 'earpods': 1, 'apps': 1, 'wearables': 1, 'nvidia': 1, 'googl': 1, 'sonos': 1, 'apple': 1, 'quarter': 1, 'analyst': 1, 'officer': 1, 'think': 1, 'chief': 1, 'said': 1, 'million': 1, 'year': 1, 'executive': 1, 'thank': 1, 'question': 1, 'business': 1, 'first': 1, 'going': 1, 'us': 1, 'operator': 1, 'new': 1, 'would': 1, 'financial': 1, 'like': 1, 'president': 1, 'well': 1, 'covid': 1, 'call': 1, 'company': 1, 'one': 1, 'customers': 1, 'see': 1, 'also': 1, 'people': 1, 'good': 1, 'market': 1, 'time': 1, 'really': 1

49

# 3) Compute precision score for each word (using the corpus)

In [169]:
# Ranking the words using the "precision" score (with the corpus):
# for every company, the score of a candidate word is the fraction of the
# company's articles whose tokens contain that word.
company_names= list(dict_companies.keys())
related_words_concat_3 = {}
related_words_concat_3_count = {}
word_sources = (relevant_words_glove, relevant_words_sg, relevant_words_cbow, relevant_words_tfidf)

for company in tqdm(company_names):
    # unique candidate words proposed by any source for this company
    set_words = set()
    for source in word_sources:
        if company in source.keys():
            set_words.update(scored_word[0] for scored_word in source[company])

    frequency_map = {}
    n_articles = 0
    for index, article_labels in df_cleaned["label"].items():
        if company in article_labels:   # if related to company
            n_articles += 1
            # data[index] is the token list of the same article; building a set
            # once turns every membership test into a hash lookup instead of a
            # scan over all tokens.
            article_tokens = set(data[index])
            for word in set_words:
                if word in article_tokens:
                    frequency_map[word] = frequency_map.get(word, 0) + 1

    sorted_count_map = dict(sorted(frequency_map.items(), key=lambda item: -item[1]))
    # frequency_map is empty whenever n_articles is 0, so no division by zero here
    sorted_frequency_map = {word: count/n_articles for word, count in sorted_count_map.items()}
    related_words_concat_3[company] = sorted_frequency_map
    related_words_concat_3_count[company] = sorted_count_map

#100%|██████████| 23/23 [13:43<00:00, 35.82s/it] 
#100%|██████████| 43/43 [22:57<00:00, 32.02s/it]
#100%|██████████| 49/49 [29:17<00:00, 35.86s/it]

100%|██████████| 49/49 [29:17<00:00, 35.86s/it]


In [214]:
# Load the previously saved precision dictionaries (scores and raw counts).
# The files are opened through `with` so the handles are closed as soon as
# the JSON has been parsed, even if parsing fails.
PATH = "./data/"
file = "related_words_precison"
with open(PATH + file +"score"+ ".json", "r") as a_file:
    related_words_concat_3 = json.load(a_file)
with open(PATH + file +"count"+ ".json", "r") as b_file:
    related_words_concat_3_count = json.load(b_file)
#relevant_words_tfidf = dict(relevant_words_tfidf)

In [203]:
# Show the "apple" entry of the score dictionary first, then of the count dictionary.
for word_stats in (related_words_concat_3, related_words_concat_3_count):
    print(word_stats["apple"])
#rwc3_frac = dict(related_words_concat_3)
#rwc3_frac["apple"]

{'said': 0.8869693750288697, 'also': 0.7427132892974271, 'new': 0.727701048547277, 'would': 0.6748117695967482, 'one': 0.6467735230264677, 'first': 0.6188276594761882, 'year': 0.6062173772460622, 'people': 0.6017829922860178, '19': 0.5762390872557623, 'time': 0.5720818513557209, 'covid': 0.5450136264954502, 'like': 0.49577347683495776, 'well': 0.4456556884844566, 'million': 0.41932652778419327, 'company': 0.40722435216407227, 'going': 0.3755369763037554, 'president': 0.3690701649036907, 'see': 0.36514388655365143, 'chief': 0.3454663032934547, 'market': 0.34398817497343986, 'business': 0.32874497667328745, 'apple': 0.32135433507321354, 'us': 0.3015381772830154, 'think': 0.28846597995288464, 'good': 0.2808905723128089, 'financial': 0.2792276779527923, 'executive': 0.2790891034227909, 'quarter': 0.25848768996258487, 'really': 0.23696244630236962, 'officer': 0.23673148875236733, 'call': 0.23405238117234053, 'customers': 0.18952376553189523, 'growth': 0.18841516929188415, 'cash': 0.18795325

# Text Classifier

In [173]:
def clean_plain_text(text):
    """Normalise raw text for token matching.

    The text is lower-cased and every run of characters other than
    ``a-z`` / ``0-9`` (whitespace, punctuation, accented letters, ...) is
    replaced by one single space. Separators at the very start or end of
    the input therefore leave one leading / trailing space in the result.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        Lower-case text made only of ``a-z``, ``0-9`` and single spaces.
    """
    lowered = text.lower()
    # One pass: a maximal run of non-alphanumeric characters collapses to ' '.
    return re.sub("[^a-z0-9]+", ' ', lowered)

In [294]:
def score_company(plain_text, word_list):
    """Count how many times the words of ``word_list`` occur as tokens of ``plain_text``.

    The text is tokenised by splitting on single spaces and a word matches a
    token only when both are exactly equal. Consequently a word that itself
    contains a space can never match. A word listed several times in
    ``word_list`` is counted once per listing.

    Parameters
    ----------
    plain_text : str
        Text to scan.
    word_list : iterable of str
        Words to look for (assumed hashable, which holds for strings).

    Returns
    -------
    int
        Total number of (token, word) pairs that are equal.
    """
    # Count every token once so each word is then a constant-time lookup,
    # instead of comparing every token against every word.
    token_counts = {}
    for token in plain_text.split(" "):
        token_counts[token] = token_counts.get(token, 0) + 1
    return sum(token_counts.get(word, 0) for word in word_list)

In [295]:
import operator
def label_text(plain_text,related_words,n_sig_words=10, min_score = 0.01):
    """Score a text against every company and turn the scores into soft-max weights.

    For each company the first ``n_sig_words`` keys of ``related_words[company]``
    plus the company name itself are passed to ``score_company``. Companies whose
    score is strictly greater than ``min_score`` are kept and their scores are
    normalised with a soft-max.

    Parameters
    ----------
    plain_text : str
        Text to label.
    related_words : dict
        Maps a company name to a dict whose keys are words; presumably ordered
        from most to least significant (verify against the caller).
    n_sig_words : int
        Number of leading words taken for each company.
    min_score : number
        Exclusive lower bound a company's score must exceed to be kept.

    Returns
    -------
    dict
        Company -> soft-max weight, ordered by decreasing score. Empty when no
        company exceeds ``min_score``.
    """
    scores = {}
    for company in related_words.keys():
        sig_words_list = list(related_words[company].keys())[:n_sig_words] + [company]
        score = score_company(plain_text, sig_words_list)
        if score>min_score:
            scores[company] = score
    if not scores:
        return {}
    # Soft-max, shifted by the maximum score: mathematically identical to
    # exp(v) / sum(exp(v)) but exp() can no longer overflow to inf (and yield
    # nan weights) when a score is large.
    max_score = max(scores.values())
    ranked = sorted(scores.items(), key=lambda item: -item[1])
    shifted_exp = {company: np.exp(score - max_score) for company, score in ranked}
    sum_exp = sum(shifted_exp.values())
    return {company: value / sum_exp for company, value in shifted_exp.items()}

In [298]:
plain_text = """
The New York Times said on Monday that it was exiting its partnership with Apple News, as news organizations struggle to compete with large tech companies for readers’ attention and dollars.

Starting on Monday, Times articles were no longer appearing alongside those from other publications in the curated Apple News feed available on Apple devices.

The Times is one of the first media organizations to pull out of Apple News. The Times, which has made adding new subscribers a key business goal, said Apple had given it little in the way of direct relationships with readers and little control over the business. It said it hoped to instead drive readers directly to its own website and mobile app so that it could “fund quality journalism.”

“Core to a healthy model between The Times and the platforms is a direct path for sending those readers back into our environments, where we control the presentation of our report, the relationships with our readers and the nature of our business rules,” Meredith Kopit Levien, chief operating officer, wrote in a memo to employees. “Our relationship with Apple News does not fit within these parameters.”

An Apple spokesman said that The Times “only offered Apple News a few stories a day,” and that the company would continue to provide readers with trusted information from thousands of publishers.

“We are also committed to supporting quality journalism through the proven business models of advertising, subscriptions and commerce,” he said."
"""
# Labelling parameters for the sample article.
related_words = related_words_concat_1
n_sig_words = 100   # number of leading related words taken for each company
min_score = 10      # nbr of sig words in text

# Normalise the article, label it, and display the resulting weights.
plain_text = clean_plain_text(plain_text)
label_dict = label_text(
    plain_text,
    related_words,
    n_sig_words=n_sig_words,
    min_score=min_score,
)
label_dict

{'apple': 0.9982787595210628,
 'alphabet': 0.0009103123974033909,
 'amazon': 0.00033488521604819544,
 'intel': 0.00033488521604819544,
 'microsoft': 0.00012319738613638786,
 '21st century fox': 1.667295314677839e-05,
 'cisco': 3.053757890451772e-07,
 'comcast': 3.053757890451772e-07,
 'starbucks': 3.053757890451772e-07,
 'autodesk': 1.1234147462122804e-07,
 'advanced micro devices': 4.1328118904033145e-08,
 'ebay': 4.1328118904033145e-08,
 'netflix': 4.1328118904033145e-08,
 'nvidia': 4.1328118904033145e-08,
 'universal display': 4.1328118904033145e-08,
 'equinix': 1.5203765287082636e-08,
 'adobe': 5.593152677513733e-09,
 'ca technologies': 5.593152677513733e-09,
 'facebook': 5.593152677513733e-09,
 'mckesson': 5.593152677513733e-09,
 'qualcomm': 5.593152677513733e-09,
 'tesla motors': 5.593152677513733e-09,
 'liberty global': 2.0576058813903088e-09,
 'marriott international': 7.569509017969398e-10,
 'mattel': 2.784666747472775e-10,
 'bed bath & beyond': 1.0244216469089824e-10,
 'disco

# ------------------- Annexe Testing -------------------------------