# Article Labeling & Lexical Fields Finding

In [1]:
import pandas as pd
import numpy as np
import sys
import re
from tqdm import tqdm
import pickle
import copy

## Add cprofile for evaulation of a function's speed

In [2]:
import cProfile,pstats, io
def profile(fct):
    """ a decorator for the function 
        use by writing @profile before any function that needs evaluation"""
    def inner(*args,**kwargs):
        pr = cProfile.Profile()
        pr.enable()
        retval = fct(*args,**kwargs)
        s=i0.StringIO()
        sortBy = 'cumulative'
        ps = pstats.Stats(pr,stream = s).sort_stats(sortBy)
        ps.print_stats()
        print (s.getvalue())
        return retval

## Download Unlabelled articles

In [35]:
import json

raw_json_data = []
with open('./data/20200420_20200714_business_articles.json') as f:
    for line in f:
        raw_json_data.append(json.loads(line))

In [36]:
print ("data type",type (raw_json_data))
print ("json",type (raw_json_data[0]))
print ("keys",raw_json_data[0].keys())
print ("length", len(raw_json_data))
#print (raw_json_data[0])

data type <class 'list'>
json <class 'dict'>
keys dict_keys(['published', 'link', 'message', 'Feed', 'title', '@version', 'author', '@timestamp', 'full-text', 'type'])
length 416307


## Fetching Company Names & Related Names(52 companies)

In [5]:
# fetching company names (52 companies)
df = pd.read_excel (r'./data/relevant_words/comapny_name-related_words.xlsx', header = None)
# Lower
df[0] = [row[0].lower() for index, row in df.iterrows()] 
# Split company name and related names
split = np.array([row[0].split(";") for index, row in df.iterrows()])
df["company_name"] = split[:,0]
df["related_name"] = split[:,1]
df.drop(columns = [0], inplace=True)
# build dictionary of related name of companies
dict_companies = {}
company_names = df["company_name"].unique()
for name in company_names:
    df_tmp = df[df["company_name"] == name]
    dict_companies[name] = list(df_tmp["related_name"])
print (len(dict_companies.keys()), dict_companies.keys())
print (dict_companies["21st century fox"])

49 dict_keys(['21st century fox', 'activision blizzard', 'adobe', 'advanced micro devices', 'akamai technologies', 'alexion pharmaceuticals', 'amazon', 'american airlines group', 'amgen', 'analog devices', 'apple', 'autodesk', 'automatic data processing', 'baidu', 'bed bath & beyond', 'biogen', 'ca technologies', 'celgene', 'cerner', 'cisco', 'cognizant', 'comcast', 'discovery communications', 'dish network', 'ebay', 'electronic arts', 'equinix', 'expeditors international', 'facebook', 'alphabet', 'intel', 'liberty global', 'liberty interactive', 'linear technology', 'marriott international', 'mattle', 'mattel', 'mckesson', 'microsoft', 'netflix', 'nvidia', 'paypal', 'qualcomm', 'starbucks', 'stericycle', 'tesla motors', 'texas instruments', 'the priceline group', 'universal display'])
['cable television', 'broadcasting', 'record label', 'movie production', 'tv production', 'rupert murdoch', 'james murdoch', 'lachlan murdoch', 'chase carey', 'fox broadcasting company', '20th century fo

## Fetching Company Names & Related Names(49 companies) unlowered

In [6]:
# fetching company names (52 companies)
df = pd.read_excel (r'./data/relevant_words/comapny_name-related_words.xlsx', header = None)
# Lower
df[0] = [row[0].replace(" ", "_") for index, row in df.iterrows()] 
# Split company name and related names
split = np.array([row[0].split(";") for index, row in df.iterrows()])
df["company_name"] = split[:,0]
df["related_name"] = split[:,1]
df.drop(columns = [0], inplace=True)
# build dictionary of related name of companies
dict_companies_unlowered = {}
company_names = df["company_name"].unique()
for name in company_names:
    df_tmp = df[df["company_name"] == name]
    dict_companies_unlowered[name] = list(df_tmp["related_name"])
print (len(dict_companies_unlowered.keys()), dict_companies_unlowered.keys())
#print (dict_companies_unlowered['21st_Century_Fox'])

49 dict_keys(['21st_Century_Fox', 'Activision_Blizzard', 'Adobe', 'Advanced_Micro_Devices', 'Akamai_Technologies', 'Alexion_Pharmaceuticals', 'Amazon', 'American_Airlines_Group', 'Amgen', 'Analog_Devices', 'Apple', 'Autodesk', 'Automatic_Data_Processing', 'Baidu', 'Bed_Bath_&_Beyond', 'Biogen', 'CA_Technologies', 'Celgene', 'Cerner', 'Cisco', 'Cognizant', 'Comcast', 'Discovery_Communications', 'Dish_Network', 'EBay', 'Electronic_Arts', 'Equinix', 'Expeditors_International', 'Facebook', 'Alphabet', 'Intel', 'Liberty_Global', 'Liberty_Interactive', 'Linear_Technology', 'Marriott_International', 'Mattle', 'Mattel', 'McKesson', 'Microsoft', 'Netflix', 'NVIDIA', 'Paypal', 'Qualcomm', 'Starbucks', 'Stericycle', 'Tesla_Motors', 'Texas_Instruments', 'The_Priceline_Group', 'Universal_Display'])


## Extracting url, title & full_text of each article:

In [37]:
urls = list()
plain_texts = list()
titles = list()
labels = list()

min_article_size = 2000
for article in raw_json_data:
    plain_text = article.get('full-text')
    title = article.get('title')
    url = article.get('link')
    if (plain_text and "Article `download()` failed" != plain_text[:27] and "Please enable cookies" != plain_text[:21] and len(plain_text)>min_article_size):
        plain_texts.append(plain_text)
        urls.append(url)
        titles.append(title)
        labels.append(list())
       

## Build DataFrame with extacted data

In [38]:
#Statistics
# 358192 removing "Article `download()` failed" 
# 340987 removing "Article `download()` failed" and "Please enable cookies"
# 215039 removing "Article `download()` failed" and "Please enable cookies" and size<min_article_size = 2000
data = np.array([urls,titles, plain_texts, labels]).T
columns=["url", "title", "plain_text", "label"]
df_articles = pd.DataFrame(data=data, columns=columns)

  """


In [39]:
df_articles.tail()

Unnamed: 0,url,title,plain_text,label
215034,http://rssfeeds.usatoday.com/~/t/0/0/usatodayc...,Michigan partygoers test positive for COVID-19...,Michigan partygoers test positive for COVID-19...,[]
215035,https://www.washingtontimes.com/news/2020/jul/...,Coast Guard officials decline to testify on ra...,"NEW LONDON, Conn. (AP) - A planned congression...",[]
215036,https://www.denverpost.com/2020/07/08/united-a...,"United Airlines will slash nearly 36,000 jobs ...",United Airlines plans to furlough as many as 3...,[]
215037,https://www.washingtontimes.com/news/2020/jul/...,The Latest: Pence says CDC will issue guidance...,WASHINGTON - Vice President Mike Pence says th...,[]
215038,https://www.washingtontimes.com/news/2020/jul/...,US rejects nearly all Chinese claims in South...,WASHINGTON (AP) - The Trump administration esc...,[]


# POS TAGGING

In [42]:
# Imports
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pierre/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [44]:
paragraph = df_articles["plain_text"][0]
sent_text = nltk.sent_tokenize(paragraph) # this gives us a list of sentences
# Now Loop over each sentence and tokenize it separately
tokenized_paragraph = list()
for sentence in sent_text:
    tokenized_text = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokenized_text)
    tokenized_paragraph.append(tagged)
    #print(tagged)
tokenized_paragraph[0]

[('Eliminated', 'VBN'),
 ('MasterChef', 'NNP'),
 ('contestant', 'JJ'),
 ('Harry', 'NNP'),
 ('Foster', 'NNP'),
 ('has', 'VBZ'),
 ('hit', 'VBN'),
 ('back', 'RB'),
 ('at', 'IN'),
 ('unfair', 'JJ'),
 ('criticism', 'NN'),
 ('against', 'IN'),
 ('judge', 'NN'),
 ('Melissa', 'NNP'),
 ('Leong', 'NNP'),
 ('.', '.')]

In [None]:
# Representing Tagged Tokens



## Cleaning full_text of articles

In [12]:
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.tokenize import word_tokenize

In [13]:
# Remove every non-letter/number character
#n_articles = 10000
#df_cleaned = df_articles.head(n_articles).copy(deep= True)
df_cleaned = df_articles.copy(deep= True)
for index, row in df_cleaned.iterrows():
    row["plain_text"] = row["plain_text"].lower()
    row["plain_text"]= re.sub(r'\s+', ' ', row["plain_text"])
    #[:punct:], ,[^0-9], [^a-z]
    #row["plain_text"] = re.sub("[^a-z],[^:punct:],[^0-9]", ' ', row["plain_text"])
    row["plain_text"] = re.sub("[^a-z0-9]", ' ', row["plain_text"])
    #row["plain_text"] = re.sub(r'\s+', ' ', row["plain_text"])
df_cleaned["plain_text"][0][:100]

'eliminated masterchef contestant harry foster has hit back at unfair criticism against judge melissa'

## Find Stop Words & Removing them from plain text

In [14]:
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [None]:
# Remove all stop words from plain text
for index, row in df_cleaned.iterrows():
    for stop_word in stop_words:
        row["plain_text"] = re.sub(' '+stop_word+' ', ' ', row["plain_text"])
    row["plain_text"] = re.sub(r'\s+', ' ', row["plain_text"])
df_cleaned["plain_text"][0]

## Labeling Articles with Company Names 
### Check if Articles Talk of Companies

In [29]:
for index, row in df_cleaned.iterrows(): # initialize labels
    row['label'] = list()
company_names = dict_companies.keys()   
for index, row in df_cleaned.iterrows():
    for company in company_names:
        if company in row["plain_text"]:
            row['label'].append(company)
        else:
            for related_name in dict_companies[company]:
                if related_name in row["plain_text"]:
                    row['label'].append(company)
                    break
df_cleaned["label"].head()

0                                  []
1                                  []
2    [advanced micro devices, nvidia]
3                                  []
4                             [apple]
Name: label, dtype: object

In [7]:
# Getting data from csv
PATH = "./data/"
file = "cleaned_articles_200k"
df_cleaned = pd.read_csv(PATH + file + ".csv") 

In [30]:
df_cleaned.head()

Unnamed: 0,url,title,plain_text,label
0,https://www.dailymail.co.uk/tvshowbiz/article-...,MasterChef's Harry Foster hits back at claims ...,eliminated masterchef contestant harry foster ...,[]
1,https://www.washingtontimes.com/news/2020/jun/...,"Protest arrests logjam tests NYC legal system,...",new york ap wave arrests new york city protest...,[]
2,https://www.dailymail.co.uk/news/article-83114...,Labour's Anneliese Dodds says she will REFUSE ...,a top shadow minister today said enough eviden...,"[advanced micro devices, nvidia]"
3,http://feeds.reuters.com/~r/Reuters/worldNews/...,Civil unrest rages in Minneapolis over raciall...,minneapolis reuters peaceful rallies gave way ...,[]
4,https://www.dailymail.co.uk/news/article-82734...,Australia 'beats the cr*p' out of coronavirus ...,australia beating c p coronavirus six states t...,[apple]


### Get number of articles with labels

In [31]:
labeled = 0
for index, row in df_cleaned.iterrows():
    if len(row["label"])>0:
        if index ==0:
            print (row["label"],len(row["label"]), row["label"][1] )
        labeled +=1
print ("There are %d labeled articles in the %d articles of the corpus"%(labeled, len (df_cleaned["label"])))      

There are 121097 labeled articles in the 215039 articles of the corpus


## Count Number of Articles that each Company is Associated to.

In [15]:
# init
dict_count = {}
for company in company_names: dict_count[company]= 0
    
for index, row in df_cleaned.iterrows():
    for company in company_names:
        if company in row["label"]:
            dict_count[company]+=1

#dict_count          
companies_w_articles = list()
for company in company_names:
    if dict_count[company]>0:
        companies_w_articles.append(company)
print ("there are %d companies with associated articles over the %d total companies"%(len(companies_w_articles),len(company_names)) )
#dict_count

there are 0 companies with associated articles over the 49 total companies


In [16]:
dict_count["apple"]

KeyError: 'apple'

## Tf.Idf to get top 20 words for each company (that have articles related to them)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [27]:
# Tf.Idf on Companies that have Associated Articles 
relevant_words_tfidf = {}
for company in tqdm(companies_w_articles): # for all companies in companies_w_articles

    #tfidf_vectorizer=TfidfVectorizer(stop_words = {'english'},ngram_range = (1,3), binary = True) #sublinear_tf=False
    tfidf_vectorizer=TfidfVectorizer(stop_words = {'english'},ngram_range = (1,1))# bilinear doesn't work..
    plain_text_list = list()
    company_article = ""
    for index, row in df_cleaned.iterrows():
        if company in row["label"]:
            company_article = company_article+ " "+ row["plain_text"] # add article to company BIG article
        else:
            plain_text_list.append(row["plain_text"]) # otherwise add to corpus
    
    plain_text_list.insert(0,company_article) # add company article to begging of corpus
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(plain_text_list)

    #Get the tf-idf scores for the words in the company article complication.
    first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] # discard tf.idf scores for the other texts

    # place tf-idf values in a pandas data frame 
    df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"]) 
    df = df.sort_values(by=["tfidf"],ascending=False).head(40) # Take top 40 words
    
    relevant_words_tfidf[company] = list(zip(list(df.index),list(df["tfidf"])))
    #print (relevant_words_tfidf[company])
    
#100%|██████████| 52/52 [7:31:13<00:00, 520.64s/it]
#100%|██████████| 52/52 [21:46:51<00:00, 1507.90s/it]    
#100%|██████████| 49/49 [23:20:51<00:00, 1715.33s/it] 

100%|██████████| 49/49 [23:20:51<00:00, 1715.33s/it]   


In [None]:
# load dictionary 
PATH = "./data/relevant_words/"
file = "relevant_words_tfidf_200k"
a_file = open(PATH + file + ".json", "r")
relevant_words_tfidf = json.load(a_file)
#relevant_words_tfidf = dict(relevant_words_tfidf)

In [15]:
for company in relevant_words_tfidf.keys():
    tmp = []
    for element in relevant_words_tfidf[company]:
        tmp.append((element[0],element[1]))
    
    relevant_words_tfidf[company] = tmp

In [None]:
relevant_words_tfidf['apple']

In [60]:
#companies_w_articles

# Word Vectorization

In [None]:
# for a give model, we want to get the first 20 words related to a company of: companies_w_articles
# And store everything into a dictionary like for tf.idf
#dict_companies_unlowered
def getTopWords(model, n_words, dict_companies):
    companies = dict_companies.keys()
    
    #Word2Vec.most_similar(positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None)
    relevant_words = {}
    for company in companies:
        #print ("company", company)
        if company in model.wv.vocab:
            relevant_words[company] = model.most_similar(company,topn=n_words)
        else:
            for related_word in dict_companies[company]:
                if related_word in model.wv.vocab:
                    #print ("related_word in model.wv.vocab",related_word in model.wv.vocab)
                    #print (related_word)
                    relevant_words[company] = model.most_similar(related_word,topn=n_words)
                    break
    return relevant_words

In [None]:
# importing all necessary modules 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
#nltk.download('punkt')
import warnings 
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

import collections

from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [None]:
data = [] 
  
# iterate through each article in the file 
data = [word_tokenize(plain_text) for plain_text in df_cleaned["plain_text"]]

In [141]:
PATH = "./data/"
file = "list_tokenized_pt"
with open (PATH +file, 'rb') as fp:
    data = pickle.load(fp)

## Global Matrix factorization to get top 20 words of a company

In [23]:
# LSA
# HAL (Hyper Analogue Language)
# CBOW

# Create CBOW model 
model_cbow = Word2Vec(data, min_count = 1, size = 100, window = 5)


In [24]:
PATH = "./data/models/"
file = "CBOW_model_200k"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
model_cbow = word_vectors

In [32]:
print (model_cbow.most_similar('google'))
vec = model_cbow['king'] - model_cbow['man'] + model_cbow['woman']
print ()
print (model_cbow.most_similar([vec]))
print()
print(model_cbow.similarity('apple', 'man'))

[('googles', 0.7780945301055908), ('apple', 0.7284873723983765), ('googl', 0.704851508140564), ('alphabet', 0.6509729623794556), ('spotify', 0.6320379972457886), ('facebook', 0.6315559148788452), ('microsoft', 0.6176584362983704), ('alphabets', 0.6062008142471313), ('apps', 0.605197548866272), ('stadia', 0.5996901988983154)]

[('king', 0.8107793927192688), ('godfather', 0.5998413562774658), ('thatcher', 0.5920987129211426), ('mitford', 0.5835937261581421), ('altimus', 0.5723137259483337), ('chemouny', 0.5638600587844849), ('atwood', 0.5631056427955627), ('macbeth', 0.5596096515655518), ('enid', 0.5574297308921814), ('antoinette', 0.5557938814163208)]

-0.056162722


## Local context window methods to get top 20 words on a company

In [142]:
#skip- gram

# Create Skip Gram model 
model_sg = gensim.models.Word2Vec(data, min_count = 1, size = 100, 
                                             window = 5, sg = 4) 
# Start 12:49

In [144]:
# save sg
PATH = "./data/models/"
file = "skip-gram_model"
word_vectors = model_sg.wv
word_vectors.save(PATH + file+".kv")

In [145]:
# load sg
PATH = "./data/models/"
file = "skip-gram_model"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
model_sg = word_vectors

In [146]:
len(model_sg.wv.vocab)

255358

In [147]:
print (model_sg.most_similar('apple'))
vec = model_sg['king'] - model_sg['man'] +model_sg['woman']
print ()
print (model_sg.most_similar([vec]))
print()
print(model_sg.similarity('apple', 'man'))

[('aapl', 0.7750053405761719), ('iphone', 0.7719913125038147), ('google', 0.7477635145187378), ('iphones', 0.6974998712539673), ('android', 0.6899538636207581), ('spotify', 0.6810340285301208), ('9to5mac', 0.6795322895050049), ('watchos', 0.679144024848938), ('alphabet', 0.6761608123779297), ('carkey', 0.6720519661903381)]

[('king', 0.8746283054351807), ('coretta', 0.6387332677841187), ('suffragettes', 0.6350522041320801), ('zog', 0.5962809324264526), ('stenhammar', 0.5960381031036377), ('foiling', 0.5941222906112671), ('hietpas', 0.5872020721435547), ('ducruet', 0.5851268768310547), ('luther', 0.5800729990005493), ('khesar', 0.5800399780273438)]

0.21863858


## GloVe to get top 20 words of a company

In [28]:
# GloVe is a global log-bilinear regression model
#from gensim.scripts.glove2word2vec import glove2word2vec
#glove_input_file = 'glove.txt'
#word2vec_output_file = 'word2vec.txt'
#glove2word2vec(glove_input_file, word2vec_output_file)

In [56]:
# load glove
PATH = "./data/models/"
file = "glove_model"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
glove_model = word_vectors

In [58]:
len(glove_model.wv.vocab)

400000

In [26]:
print (glove_model.most_similar('apple'))
vec = glove_model['king'] - glove_model['man'] +glove_model['woman']
print ()
print (glove_model.most_similar([vec]))
print()
print(glove_model.similarity('apple', 'man'))

[('iphone', 0.5987042188644409), ('macintosh', 0.5836331248283386), ('ipod', 0.5761123895645142), ('microsoft', 0.5663833022117615), ('ipad', 0.5628098249435425), ('intel', 0.5457563400268555), ('ibm', 0.5286195278167725), ('google', 0.5282472372055054), ('imac', 0.5072520971298218), ('software', 0.4962984323501587)]

[('king', 0.8065859079360962), ('queen', 0.689616322517395), ('monarch', 0.5575490593910217), ('throne', 0.5565374493598938), ('princess', 0.5518684387207031), ('mother', 0.5142154693603516), ('daughter', 0.5133156776428223), ('kingdom', 0.5025345087051392), ('prince', 0.5017740726470947), ('elizabeth', 0.49080315232276917)]

0.090478964


# Google sg

In [12]:
# load google sg
PATH = "./data/models/"
file = "model_google_sg"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
model_google_sg = word_vectors

NameError: name 'KeyedVectors' is not defined

In [57]:
len(model_google_sg.wv.vocab)

3000000

In [51]:
print (model_google_sg.most_similar('apple'))
vec = model_google_sg['king'] - model_google_sg['man'] +model_google_sg['woman']
print ()
print (model_google_sg.most_similar([vec]))
print()
print(model_google_sg.similarity('apple', 'man'))

[('Apple_AAPL', 0.7456986308097839), ('Apple_Nasdaq_AAPL', 0.7300410270690918), ('Apple_NASDAQ_AAPL', 0.7175089120864868), ('Apple_Computer', 0.7145973443984985), ('iPhone', 0.6924266815185547), ('Apple_NSDQ_AAPL', 0.6868604421615601), ('Steve_Jobs', 0.6758421659469604), ('iPad', 0.6580768823623657), ('Apple_nasdaq_AAPL', 0.6444970369338989), ('AAPL_PriceWatch_Alert', 0.6439753174781799)]

[('king', 0.8449392318725586), ('queen', 0.7300517559051514), ('monarch', 0.6454660892486572), ('princess', 0.6156250834465027), ('crown_prince', 0.5818676352500916), ('prince', 0.577711820602417), ('kings', 0.5613664388656616), ('sultan', 0.5376776456832886), ('Queen_Consort', 0.5344247817993164), ('queens', 0.5289887189865112)]

0.11685416


## fasttext & LTSM

In [59]:
# todo

52

# Combine all the models into one & score each word

In [150]:
# retrieve all similarity words
#tf.idf
n_relevant_words = 20

print ("relevant_words_tfidf", len(relevant_words_tfidf), len(relevant_words_tfidf[list(relevant_words_tfidf.keys())[0]]))
#GloVe
relevant_words_glove = getTopWords(glove_model,n_relevant_words,dict_companies)
print ("relevant_words_glove",len(relevant_words_glove), len(relevant_words_glove[list(relevant_words_glove.keys())[0]]))
#CBOW
relevant_words_cbow = getTopWords(model_cbow,n_relevant_words,dict_companies)
print ("relevant_words_cbow",len(relevant_words_cbow), len(relevant_words_cbow[list(relevant_words_cbow.keys())[0]]))
#SG
relevant_words_sg = getTopWords(model_sg,n_relevant_words,dict_companies)
print ("relevant_words_sg",len(relevant_words_sg), len(relevant_words_sg[list(relevant_words_sg.keys())[0]]))
# google SG
relevant_words_google_sg = getTopWords(model_google_sg,n_relevant_words,dict_companies_unlowered)
print ("relevant_words_google_sg",len(relevant_words_google_sg), len(relevant_words_google_sg[list(relevant_words_google_sg.keys())[0]]))

#print(relevant_words_glove.keys())
#print(relevant_words_sg.keys())
#print(relevant_words_cbow.keys())

print ("relevant_words_tfidf",relevant_words_tfidf["apple"])
print("relevant_words_glove",relevant_words_glove["apple"])
print ("relevant_words_cbow",relevant_words_cbow["apple"])
print ("relevant_words_sg",relevant_words_sg["apple"])
print ("relevant_words_google_sg",relevant_words_google_sg["Apple"])


relevant_words_tfidf 52 40
relevant_words_glove 42 20
relevant_words_cbow 42 20
relevant_words_sg 42 20
relevant_words_google_sg 48 20
relevant_words_tfidf [('apple', 0.19221091642898255), ('quarter', 0.1813757896319136), ('analyst', 0.15118983680084835), ('officer', 0.13039576289958923), ('think', 0.12978398724001325), ('chief', 0.11686635544725837), ('said', 0.11546809934419339), ('million', 0.10470667399352565), ('year', 0.10416846454512013), ('executive', 0.10082235178310875), ('thank', 0.09817567118056518), ('question', 0.09223465868111332), ('business', 0.08953489880929696), ('first', 0.08804188508963462), ('going', 0.08717832601671512), ('us', 0.0832110708925914), ('operator', 0.0831805862027488), ('new', 0.08301255662265287), ('would', 0.08202024828466181), ('financial', 0.07905948282017072), ('like', 0.07856134695086009), ('president', 0.07620409424496975), ('well', 0.07347791690670509), ('covid', 0.07175791753631373), ('call', 0.07142795430315475), ('company', 0.0710529966062

In [126]:
print ("Words in sg but not in glove:")
for relevant_word_sg in relevant_words_sg.keys():
    if relevant_word_sg not in relevant_words_glove.keys():
        print (relevant_word_sg)
print ("Words in glove but not in sg:")
for relevant_word_glove in relevant_words_glove.keys():
    if relevant_word_glove not in relevant_words_sg.keys():
        print (relevant_word_glove)
print("Words in ")

words in sg but not in glove
stericycle
words in glove but not in sg
dish network


In [61]:
print ("relevant_words_sg",relevant_words_google_sg["apple"])

relevant_words_sg [('apples', 0.720359742641449), ('pear', 0.6450697183609009), ('fruit', 0.641014575958252), ('berry', 0.6302294135093689), ('pears', 0.6133961081504822), ('strawberry', 0.6058261394500732), ('peach', 0.6025872230529785), ('potato', 0.5960935354232788), ('grape', 0.5935864448547363), ('blueberry', 0.5866668224334717), ('cherries', 0.5784382224082947), ('mango', 0.5751855373382568), ('apricot', 0.5727777481079102), ('melon', 0.5719985365867615), ('almond', 0.5704830288887024), ('Granny_Smiths', 0.5695333480834961), ('grapes', 0.5692256093025208), ('peaches', 0.5659247040748596), ('pumpkin', 0.5651882886886597), ('apricots', 0.5645568370819092)]


In [200]:
def add_word(dictionary,tup):
    for key in dictionary.keys():
        if tup[0]==key:
            #dictionary[key] = max(tup[1],dictionary[key])
            dictionary[key] +=tup[1]
            
            return
    #l.append(tup)
    dictionary[tup[0]] = tup[1]

def switch_tup(l, tup1, tup2):
    tmp = l[tup1]
    l[tup1] = l[tup2]
    l[tup2] = tmp
    #print ("switch")
def bubble_sort_list_tup(l):
    for i in range (len(l)):
        for j in range (0,len(l)-i-1):
            if l[j][1]<l[j+1][1]:
                switch_tup(l, j, j+1)
    

# 1) Use the words’ similarity scores

In [244]:
# Concat all the words using their similarity scores
# remove two same words and keep highest score OR add both scores
# sort the words
#one_word_companies = relevant_words_glove.keys()
company_names= list(dict_companies.keys())
related_words_concat_1 = {}
for company in company_names: related_words_concat_1[company]= {}
# Creat a unique list of words
for company in related_words_concat_1.keys():
    
    if company in relevant_words_glove.keys():
        for word in relevant_words_glove[company]:
            add_word(related_words_concat_1[company], word)
    
    if company in relevant_words_sg.keys():
        for word in relevant_words_sg[company]:
            add_word(related_words_concat_1[company], word)
    
    if company in relevant_words_cbow.keys():
        for word in relevant_words_cbow[company]:
            add_word(related_words_concat_1[company], word)
    
    if company in relevant_words_tfidf.keys():   
        for word in relevant_words_tfidf[company]:
            add_word(related_words_concat_1[company], word) # convert to tuple
    #Sort the list of words
    related_words_concat_1[company] = {k: v for k, v in sorted(related_words_concat_1[company].items(), key=lambda item: -item[1])}
        
print (related_words_concat_1["apple"])
len(related_words_concat_1.keys())

{'iphone': 2.0682634711265564, 'google': 2.0044981241226196, 'iphones': 1.7639256119728088, 'aapl': 1.40452641248703, 'spotify': 1.3477738499641418, 'android': 1.3440640568733215, 'googles': 1.323325276374817, 'homekit': 1.2814541459083557, 'alphabet': 1.2761602401733398, 'ios': 1.2752171754837036, 'macos': 1.255492925643921, 'airpods': 1.249264419078827, 'microsoft': 1.1840619444847107, 'apples': 1.0988799929618835, '9to5mac': 0.6795322895050049, 'watchos': 0.679144024848938, 'carkey': 0.6720519661903381, 'jailbreak': 0.660087525844574, 'earpods': 0.6569204926490784, 'app': 0.6537714004516602, 'tmsc': 0.65050208568573, 'betwildwood': 0.6501995921134949, 'osx': 0.6468653678894043, 'apps': 0.595978856086731, 'wearables': 0.5949651002883911, 'nvidia': 0.5927913188934326, 'googl': 0.5908902883529663, 'sonos': 0.5848830938339233, 'macintosh': 0.5836331248283386, 'ipod': 0.5761123895645142, 'ipad': 0.5628098249435425, 'intel': 0.5457563400268555, 'ibm': 0.5286195278167725, 'imac': 0.5072520

49

# 2) Score wrt. the number of lists they belong to

In [260]:
# Now lets score the words compared to the number of times they appear in a list

company_names= list(dict_companies.keys())
related_words_concat_2 = {}
for company in company_names: related_words_concat_2[company]= 0

for company in related_words_concat_2.keys():
    #creat list of all the words
    frequency_map = {}
    list_words = list()
    if company in relevant_words_glove.keys():
        list_words = list_words + relevant_words_glove[company]
    if company in relevant_words_sg.keys():
        list_words = list_words +relevant_words_sg[company] 
    if company in relevant_words_cbow.keys():
        list_words = list_words + relevant_words_cbow[company] 
    if company in relevant_words_tfidf.keys():
        list_words = list_words + relevant_words_tfidf[company]

    for word in list_words:
        if word[0] in frequency_map.keys():
            frequency_map[word[0]] +=1
        else:
            frequency_map[word[0]] =1
    frequency_map = {k: v for k, v in sorted(frequency_map.items(), key=lambda item: -item[1])}
    related_words_concat_2[company] = frequency_map   

print (related_words_concat_2["apple"])
len(related_words_concat_2)

{'iphone': 3, 'google': 3, 'iphones': 3, 'microsoft': 2, 'apples': 2, 'aapl': 2, 'android': 2, 'spotify': 2, 'alphabet': 2, 'googles': 2, 'airpods': 2, 'homekit': 2, 'macos': 2, 'ios': 2, 'macintosh': 1, 'ipod': 1, 'ipad': 1, 'intel': 1, 'ibm': 1, 'imac': 1, 'software': 1, 'motorola': 1, 'computer': 1, 'itunes': 1, 'pc': 1, 'mac': 1, 'ipods': 1, 'cherry': 1, 'computers': 1, '9to5mac': 1, 'watchos': 1, 'carkey': 1, 'jailbreak': 1, 'app': 1, 'tmsc': 1, 'betwildwood': 1, 'osx': 1, 'earpods': 1, 'apps': 1, 'wearables': 1, 'nvidia': 1, 'googl': 1, 'sonos': 1, 'apple': 1, 'quarter': 1, 'analyst': 1, 'officer': 1, 'think': 1, 'chief': 1, 'said': 1, 'million': 1, 'year': 1, 'executive': 1, 'thank': 1, 'question': 1, 'business': 1, 'first': 1, 'going': 1, 'us': 1, 'operator': 1, 'new': 1, 'would': 1, 'financial': 1, 'like': 1, 'president': 1, 'well': 1, 'covid': 1, 'call': 1, 'company': 1, 'one': 1, 'customers': 1, 'see': 1, 'also': 1, 'people': 1, 'good': 1, 'market': 1, 'time': 1, 'really': 1

49

# 3) Compute precision score for each word (using the corpus)

In [169]:
# Ranking the words using the "precison" score (with the corpus)
#df_cleaned.tail(2)
#data
company_names= list(dict_companies.keys())
related_words_concat_3 = {}
related_words_concat_3_count = {}
for company in company_names: related_words_concat_3[company]= 0

for company in tqdm(related_words_concat_3.keys()):
    frequency_map = {}
    count_map = {}
    n_artcles = 0
    list_words = list()
    if company in relevant_words_glove.keys():
        list_words = list_words + [word[0] for word in relevant_words_glove[company]]
    if company in relevant_words_sg.keys():
        list_words = list_words +[word[0] for word in relevant_words_sg[company]] 
    if company in relevant_words_cbow.keys():
        list_words = list_words + [word[0] for word in relevant_words_cbow[company]] 
    if company in relevant_words_tfidf.keys():
        list_words = list_words + [word[0] for word in relevant_words_tfidf[company]]
    set_words = set(list_words)
                      
    for index, row in df_cleaned.iterrows():
        if company in row['label']:   # if related to company
            n_artcles +=1
            for word in set_words:
                if word in data[index]:
                    if word in frequency_map.keys():
                        frequency_map[word] +=1
                    else:
                        frequency_map[word] =1
    sorted_frequency_map = {k: v/n_artcles for k, v in sorted(frequency_map.items(), key=lambda item: -item[1])}
    sorted_count_map = {k: v for k, v in sorted(frequency_map.items(), key=lambda item: -item[1])}
    related_words_concat_3[company] = sorted_frequency_map
    related_words_concat_3_count[company] = sorted_count_map

#100%|██████████| 23/23 [13:43<00:00, 35.82s/it] 
#100%|██████████| 43/43 [22:57<00:00, 32.02s/it]
#100%|██████████| 49/49 [29:17<00:00, 35.86s/it]

100%|██████████| 49/49 [29:17<00:00, 35.86s/it]


In [214]:
# load dictionary 
PATH = "./data/"
file = "related_words_precison"
a_file = open(PATH + file +"score"+ ".json", "r")
b_file = open(PATH + file +"count"+ ".json", "r")
related_words_concat_3 = json.load(a_file)
related_words_concat_3_count = json.load(b_file)
#relevant_words_tfidf = dict(relevant_words_tfidf)

In [203]:
print (related_words_concat_3["apple"])
print(related_words_concat_3_count["apple"])
#rwc3_frac = dict(related_words_concat_3)
#rwc3_frac["apple"]

{'said': 0.8869693750288697, 'also': 0.7427132892974271, 'new': 0.727701048547277, 'would': 0.6748117695967482, 'one': 0.6467735230264677, 'first': 0.6188276594761882, 'year': 0.6062173772460622, 'people': 0.6017829922860178, '19': 0.5762390872557623, 'time': 0.5720818513557209, 'covid': 0.5450136264954502, 'like': 0.49577347683495776, 'well': 0.4456556884844566, 'million': 0.41932652778419327, 'company': 0.40722435216407227, 'going': 0.3755369763037554, 'president': 0.3690701649036907, 'see': 0.36514388655365143, 'chief': 0.3454663032934547, 'market': 0.34398817497343986, 'business': 0.32874497667328745, 'apple': 0.32135433507321354, 'us': 0.3015381772830154, 'think': 0.28846597995288464, 'good': 0.2808905723128089, 'financial': 0.2792276779527923, 'executive': 0.2790891034227909, 'quarter': 0.25848768996258487, 'really': 0.23696244630236962, 'officer': 0.23673148875236733, 'call': 0.23405238117234053, 'customers': 0.18952376553189523, 'growth': 0.18841516929188415, 'cash': 0.18795325

# Text Classifier

In [173]:
def clean_plain_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    #[:punct:], ,[^0-9], [^a-z]
    #text = re.sub("[^a-z],[^:punct:],[^0-9]", ' ', text)
    text = re.sub("[^a-z0-9]", ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [294]:
def score_company(plain_text, word_list):
    n_words = len(word_list)
    words_in_text = 0
    #print (word_list)
    for token in plain_text.split(" "):
        for word in word_list:
             if word == token:
                words_in_text +=1
                #print(word)
    return words_in_text

In [295]:
import operator
def label_text(plain_text,related_words,n_sig_words=10, min_score = 0.01):
    label_dict = {}
    #print (sig_words_list)
    for company in related_words.keys():
        #print("Company", company)
        sig_words_list = list(related_words[company].keys())[:n_sig_words] + [company]
        score = score_company(plain_text, sig_words_list)
        #print (score)
        if score>min_score:
            label_dict[company]= score
    # Soft_max
    sum_exp = sum([np.exp(v) for v in label_dict.values()])
    label_dict = {k: np.exp(v)/sum_exp for k, v in sorted(label_dict.items(), key=lambda item: -item[1])}
    # Exponomial
    #max_val = max(label_dict.values())
    #label_dict = {k: v/max_val for k, v in sorted(label_dict.items(), key=lambda item: -item[1])}
    return label_dict

In [298]:
plain_text = """
The New York Times said on Monday that it was exiting its partnership with Apple News, as news organizations struggle to compete with large tech companies for readers’ attention and dollars.

Starting on Monday, Times articles were no longer appearing alongside those from other publications in the curated Apple News feed available on Apple devices.

The Times is one of the first media organizations to pull out of Apple News. The Times, which has made adding new subscribers a key business goal, said Apple had given it little in the way of direct relationships with readers and little control over the business. It said it hoped to instead drive readers directly to its own website and mobile app so that it could “fund quality journalism.”

“Core to a healthy model between The Times and the platforms is a direct path for sending those readers back into our environments, where we control the presentation of our report, the relationships with our readers and the nature of our business rules,” Meredith Kopit Levien, chief operating officer, wrote in a memo to employees. “Our relationship with Apple News does not fit within these parameters.”

An Apple spokesman said that The Times “only offered Apple News a few stories a day,” and that the company would continue to provide readers with trusted information from thousands of publishers.

“We are also committed to supporting quality journalism through the proven business models of advertising, subscriptions and commerce,” he said."
"""
plain_text = clean_plain_text(plain_text)
related_words = related_words_concat_1
n_sig_words= 100
min_score = 10 # nbr of sig words in text
#print (plain_text)
label_dict = label_text(plain_text,related_words, n_sig_words, min_score)
label_dict

{'apple': 0.9982787595210628,
 'alphabet': 0.0009103123974033909,
 'amazon': 0.00033488521604819544,
 'intel': 0.00033488521604819544,
 'microsoft': 0.00012319738613638786,
 '21st century fox': 1.667295314677839e-05,
 'cisco': 3.053757890451772e-07,
 'comcast': 3.053757890451772e-07,
 'starbucks': 3.053757890451772e-07,
 'autodesk': 1.1234147462122804e-07,
 'advanced micro devices': 4.1328118904033145e-08,
 'ebay': 4.1328118904033145e-08,
 'netflix': 4.1328118904033145e-08,
 'nvidia': 4.1328118904033145e-08,
 'universal display': 4.1328118904033145e-08,
 'equinix': 1.5203765287082636e-08,
 'adobe': 5.593152677513733e-09,
 'ca technologies': 5.593152677513733e-09,
 'facebook': 5.593152677513733e-09,
 'mckesson': 5.593152677513733e-09,
 'qualcomm': 5.593152677513733e-09,
 'tesla motors': 5.593152677513733e-09,
 'liberty global': 2.0576058813903088e-09,
 'marriott international': 7.569509017969398e-10,
 'mattel': 2.784666747472775e-10,
 'bed bath & beyond': 1.0244216469089824e-10,
 'disco

# ------------------- Annexe Testing -------------------------------

## Python program to generate word vectors using Word2Vec 

In [57]:
# importing all necessary modules 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
#nltk.download('punkt')
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

import collections

In [62]:
# Apply 2 Word2Vec models to articles   

data = [] 
  
# iterate through each article in the file 
for i in clean_articles: 
    temp = [] 
    # tokenize the article into words 
    for j in word_tokenize(i): 
        temp.append(j.lower()) 
  
    data.append(temp) 

# Create CBOW model 
model1 = gensim.models.Word2Vec(data, min_count = 1,  
                              size = 100, window = 5) 
  
# Print results 
print("Cosine similarity between 'australia' " + 
               "and 'melbourne' - CBOW : ", 
    model1.similarity('melbourne', 'australia')) 

print(model1.wv.most_similar('melbourne'))
    

# Create Skip Gram model 
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100, 
                                             window = 5, sg = 4) 
  
# Print results 
print("Cosine similarity between 'australia' " +
          "and 'melbourne' - Skip Gram : ", 
    model2.similarity('melbourne', 'australia')) 
print(model2.wv.most_similar('melbourne'))


NameError: name 'clean_articles' is not defined

In [59]:
# FOR GENSIN USING CBOW Manipulations

# enumerate data it is trained on
for i, word in enumerate(model1.wv.vocab):
    if i == 5:
        break
    print(word)

NameError: name 'model1' is not defined

In [60]:
len(data)
# Show frequencies
#print("Original List : ",data)
data_flat = []
for line in data:
    for word in line:
        data_flat.append(word)


ctr = collections.Counter(data_flat)
#print("Frequency of the elements in the List : ",ctr)
ctr["the"] # count of word "the"

0

## Using tf.itf

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorizer
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(clean_articles)

#print(vectorizer.get_feature_names()[:10])
#print(X.shape)
#print(vectorizer.get_stop_words())
#print(vectorizer.get_params(deep=True))

n_articles, n_distinct_words = X.shape
print(n_articles, n_distinct_words)

collect_word_importance = []
#place tf-idf values in a pandas data frame 
for tf_idf_vector_id in range(n_articles):
    
    tf_idf_vector=X[tf_idf_vector_id]
    #print (tf_idf_vector.todense().sum())
    #print (tf_idf_vector.T.todense())
    df = pd.DataFrame(tf_idf_vector.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"]) 
    df_word_importance = df.sort_values(by=["tfidf"],ascending=False)
    word_importance_list = np.array(df_word_importance.index)
    collect_word_importance.append(word_importance_list)


In [None]:
# Each line corresponds to the highest scored words in the article of same index.
collect_word_importance = np.array(collect_word_importance)
collect_word_importance

In [None]:
# TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# TfidfTransformer
#TfidfTransformer(*, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)[source]


corpus = ['this is the first document',
           'this document is the second document',
          'and this is the third one',
           'is this the first document']
vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
               'and', 'one']
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
                  ('tfid', TfidfTransformer())]).fit(corpus)
pipe['count'].transform(corpus).toarray()
pipe['tfid'].idf_
pipe.transform(corpus).shape


In [None]:
pipe = Pipeline([('count', CountVectorizer()),
                  ('tfid', TfidfTransformer())]).fit(clean_articles)
pipe['count'].transform(clean_articles).toarray().shape
print (pipe['tfid'].idf_)
Tfidf_res = pipe.transform(clean_articles)
Tfidf_res.shape

In [None]:
Tfidf_res

In [None]:
#### Tutorial

#Dataset and Imports
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
 
# this is a very toy example, do not try this at home unless you want to understand the usage differences 
docs=["the house had a tiny little mouse", 
"the cat saw the mouse", 
"the mouse ran away from the house", 
"the cat finally ate the mouse", 
"the end of the mouse story"
]

In [None]:
# Initialize CountVectorizer
#instantiate CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in your docs 
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape
# 5 texts, 9 distinct words -> gives the count for each word in each text

In [None]:
word_count_vector[0]

In [None]:
#Compute the IDF values
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

In [None]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

In [None]:
# Compute the TFIDF score for your documents
# count matrix 
count_vector=cv.transform(docs) #<==> word_count_vector

# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [None]:
tf_idf_vector

In [None]:
feature_names = cv.get_feature_names() 
 
#get tfidf vector for FFFFFFFFFirst document 
first_document_vector=tf_idf_vector[0] 
 
#print the scores (Tf-idf scores of first document)
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

In [None]:
# Tfidfvectorizer Usage - Compute all at Once

 
# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
#fitted_vectorizer=tfidf_vectorizer.fit(docs)               # This method would work too
#tfidf_vectorizer_vectors=fitted_vectorizer.transform(docs)  
 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

In [None]:
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 
 
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)
