# Article Labeling & Lexical Fields Finding

In [1]:
import pandas as pd
import numpy as np
import sys
import re

## Download Unlabelled articles

In [2]:
import json

raw_json_data = []
with open('./data/20200420_20200714_business_articles.json') as f:
    for line in f:
        raw_json_data.append(json.loads(line))


In [3]:
print ("data type",type (raw_json_data))
print ("json",type (raw_json_data[0]))
print ("keys",raw_json_data[0].keys())
print ("length", len(raw_json_data))
#print (raw_json_data[0])

data type <class 'list'>
json <class 'dict'>
keys dict_keys(['published', 'link', 'message', 'Feed', 'title', '@version', 'author', '@timestamp', 'full-text', 'type'])
length 416307


## Fetching Company Names & Related Names(52 companies)

In [4]:
# fetching company names (52 companies)
df = pd.read_excel (r'./data/comapny_name-related_words.xlsx', header = None)
# Lower
df[0] = [row[0].lower() for index, row in df.iterrows()] 
# Split company name and related names
split = np.array([row[0].split(";") for index, row in df.iterrows()])
df["company_name"] = split[:,0]
df["related_name"] = split[:,1]
df.drop(columns = [0], inplace=True)
# build dictionary of related name of companies
dict_companies = {}
company_names = df["company_name"].unique()
for name in company_names:
    df_tmp = df[df["company_name"] == name]
    dict_companies[name] = list(df_tmp["related_name"])
dict_companies.keys()

dict_keys(['21st century fox', 'activision blizzard', 'adobe ', 'advanced micro devices', 'akamai technologies', 'akamai tecnologies', 'alexion pharmaceuticals', 'amazon', 'american airlines group', 'amgen', 'analog devices', 'apple', 'autodesk', 'automatic data processing', 'baidu', 'bed bath & beyond', 'biogen', 'ca technologies', 'celgene', 'cerner', 'cisco ', 'cognizant', 'comcast', 'discovery communications', 'dish network', 'ebay', 'electronic arts', 'equinix', 'expeditors international', 'facebook', 'alphabet', 'intel', 'liberty global', 'liberty interactive', 'linear technology', 'marriott international', 'mattle', 'mattel', 'mckesson ', 'mckesson', 'microsoft', 'netflix', 'nvidia', 'paypal', 'qualcomm', 'starbucks', 'stericycle', 'tesla motors', 'texas instruments', 'the priceline group', 'universal display ', 'universal display'])

## Extracting url, title & full_text of each article:

In [25]:
urls = list()
plain_texts = list()
titles = list()
labels = list()

min_article_size = 2000
for article in raw_json_data:
    plain_text = article.get('full-text')
    title = article.get('title')
    url = article.get('link')
    if (plain_text and "Article `download()` failed" != plain_text[:27] and "Please enable cookies" != plain_text[:21] and len(plain_text)>min_article_size):
        plain_texts.append(plain_text)
        urls.append(url)
        titles.append(title)
        labels.append(list())
       

## Build DataFrame with extacted data

In [49]:
#Statistics
# 358192 removing "Article `download()` failed" 
# 340987 removing "Article `download()` failed" and "Please enable cookies"
# 215039 removing "Article `download()` failed" and "Please enable cookies" and size<min_article_size = 2000
data = np.array([urls,titles, plain_texts, labels]).T
columns=["url", "title", "plain_text", "label"]
df_articles = pd.DataFrame(data=data, columns=columns)

  """


In [50]:
df_articles.head()

Unnamed: 0,url,title,plain_text,label
0,https://www.dailymail.co.uk/tvshowbiz/article-...,MasterChef's Harry Foster hits back at claims ...,Eliminated MasterChef contestant Harry Foster ...,[]
1,https://www.washingtontimes.com/news/2020/jun/...,"Protest arrests logjam tests NYC legal system,...",NEW YORK (AP) - A wave of arrests in the New Y...,[]
2,https://www.dailymail.co.uk/news/article-83114...,Labour's Anneliese Dodds says she will REFUSE ...,A top shadow minister today said there was not...,[]
3,http://feeds.reuters.com/~r/Reuters/worldNews/...,Civil unrest rages in Minneapolis over raciall...,MINNEAPOLIS (Reuters) - Peaceful rallies gave ...,[]
4,https://www.dailymail.co.uk/news/article-82734...,Australia 'beats the cr*p' out of coronavirus ...,Australia is 'beating the c**p' out of coronav...,[]


## Cleaning full_text of articles

In [19]:
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.tokenize import word_tokenize

In [31]:
# Remove every non-letter/number character
#df_cleaned = df_articles.copy(deep= True)
df_cleaned = df_articles.head(5000).copy(deep= True)
for index, row in df_cleaned.iterrows():
    row["plain_text"] = row["plain_text"].lower()
    row["plain_text"] = re.sub(r'\s+', ' ', row["plain_text"])
    #[:punct:], ,[^0-9], [^a-z]
    #row["plain_text"] = re.sub("[^a-z],[^:punct:],[^0-9]", ' ', row["plain_text"])
    row["plain_text"] = re.sub("[^a-z0-9]", ' ', row["plain_text"])
    #row["plain_text"] = re.sub(r'\s+', ' ', row["plain_text"])
df_cleaned["plain_text"][0]

'eliminated masterchef contestant harry foster has hit back at unfair criticism against judge melissa leong  the show s first female judge  40  has faced a barrage of trolling  with haters taking aim at everything from her behaviour on set to her fashion sense  despite being eliminated on tuesday night s episode  harry had nothing but good things to say about the melbourne based food writer   this could not be further from the truth   eliminated masterchef australia contestant harry foster  pictured  has hit back at unfair criticism against judge melissa leong  she s a queen  i love her   harry told huffpost australia   she is energetic  passionate and really just vibrant   when asked about accusations melissa was rude and biased on the show  he said   this could not be further from the truth   all three judges have received an overwhelmingly positive response from fans  but melissa has copped a backlash from a vocal minority   she s a queen   the show s first female judge  40  has fac

## Find Stop Words & Removing them from plain text

In [32]:
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [33]:
# Remove all stop words from plain text
for index, row in df_cleaned.iterrows():
    for stop_word in stop_words:
        row["plain_text"] = re.sub(' '+stop_word+' ', ' ', row["plain_text"])
    row["plain_text"] = re.sub(r'\s+', ' ', row["plain_text"])
df_cleaned["plain_text"][0]

'eliminated masterchef contestant harry foster hit back unfair criticism judge melissa leong show first female judge 40 faced barrage trolling haters taking aim everything behaviour set fashion sense despite eliminated tuesday night episode harry nothing good things say melbourne based food writer could truth eliminated masterchef australia contestant harry foster pictured hit back unfair criticism judge melissa leong queen love harry told huffpost australia energetic passionate really vibrant asked accusations melissa rude biased show said could truth three judges received overwhelmingly positive response fans melissa copped backlash vocal minority queen show first female judge 40 faced barrage trolling haters taking aim everything behaviour set fashion sense many praised fashion sense positivity others claim waits feedback jock zonfrillo andy allen repeating chance melissa leong original idea masterchef continue wait others tell think dish one viewer tweeted another added new judge m

## Labeling Articles with Company Names 
### Check if Articles Talk of Companies

In [35]:
for index, row in df_cleaned.iterrows(): # initialize labels
    row['label'] = []
    
for index, row in df_cleaned.iterrows():
    for company in company_names:
        if company in row["plain_text"]:
            row['label'].append(company)
        else:
            for related_name in dict_companies[company]:
                if related_name in row["plain_text"]:
                    row['label'].append(company)
                    break
df_cleaned["label"].tail(10)

4990                                 []
4991    [facebook, alphabet, microsoft]
4992                                 []
4993                                 []
4994                        [microsoft]
4995                                 []
4996                                 []
4997                        [microsoft]
4998                                 []
4999                                 []
Name: label, dtype: object

## Count Number of Articles that each Company is Associated to.

In [38]:
# init
dict_count = {}
for company in company_names: dict_count[company]= 0
    
for index, row in df_cleaned.iterrows():
    for company in company_names:
        if company in row["label"]:
            dict_count[company]+=1
dict_count          

companies_w_articles = list()
for company in company_names:
    if dict_count[company]>0:
        companies_w_articles.append(company)
print ("there are %d companies with associated articles over the %d total companies"%(len(companies_w_articles),len(company_names)) )
#dict_count

there are 46 companies with associated articles over the 52 total companies


## tf.idf on Companies that have Associated Articles 

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [47]:


dict_relavant_words = {}
for company in companies_w_articles: # for all companies in companies_w_articles

    tfidf_vectorizer=TfidfVectorizer(stop_words = {'english'},ngram_range = (1,3), binary = True)
    plain_text_list = list()
    company_article = ""
    for index, row in df_cleaned.iterrows():
        if company in row["label"]:
            company_article = company_article+ " "+ row["plain_text"]
            plain_text_list.append(row["plain_text"])
    
    plain_text_list.insert(0,company_article)
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(plain_text_list)

    #Get the tf-idf scores for the words in the company article complication.
    first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 

    # place tf-idf values in a pandas data frame 
    df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"]) 
    df = df.sort_values(by=["tfidf"],ascending=False).head(20)
    dict_relavant_words[company] = list(df.index)
dict_relavant_words

{'21st century fox': ['films genres disney',
  'happen the world',
  'dangers loom top',
  'company ashen faced',
  'company ashen',
  'star trek perhaps',
  'classic green ottawa',
  'green source mlb',
  'green source',
  'green ottawa reuters',
  'green ottawa',
  'classic green source',
  'llc toronto reuters',
  'llc toronto',
  'llc report state',
  'royal tea palace',
  'royal tea ottawa',
  'llc report',
  'llc pop culture',
  'llc pop'],
 'activision blizzard': ['beyond new',
  'morning open montgomery',
  'filming video',
  'llc activision',
  'llc activision blizzard',
  'year beyond new',
  'week going',
  'week going business',
  'resumed filming video',
  'growth charts abiomed',
  'morning open japanese',
  'kellytyko japanese',
  'open japanese',
  'open japanese electronics',
  'call transcripts johnny',
  'next week going',
  'times llc activision',
  'transcripts johnny',
  'transcripts johnny jj',
  'open montgomery'],
 'adobe ': ['technologist left',
  'growing tec

# Annexe Testing

## Python program to generate word vectors using Word2Vec 

In [8]:
# importing all necessary modules 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
#nltk.download('punkt')
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

import collections

In [9]:
# Apply 2 Word2Vec models to articles   

data = [] 
  
# iterate through each article in the file 
for i in clean_articles: 
    temp = [] 
    # tokenize the article into words 
    for j in word_tokenize(i): 
        temp.append(j.lower()) 
  
    data.append(temp) 

# Create CBOW model 
model1 = gensim.models.Word2Vec(data, min_count = 1,  
                              size = 100, window = 5) 
  
# Print results 
print("Cosine similarity between 'australia' " + 
               "and 'melbourne' - CBOW : ", 
    model1.similarity('melbourne', 'australia')) 

print(model1.wv.most_similar('melbourne'))
    

# Create Skip Gram model 
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100, 
                                             window = 5, sg = 4) 
  
# Print results 
print("Cosine similarity between 'australia' " +
          "and 'melbourne' - Skip Gram : ", 
    model2.similarity('melbourne', 'australia')) 
print(model2.wv.most_similar('melbourne'))


Cosine similarity between 'australia' and 'melbourne' - CBOW :  0.66155946
[('portland', 0.8813670873641968), ('perez', 0.8812724351882935), ('rigel', 0.8715772032737732), ('heights', 0.8652781844139099), ('jaylen', 0.8628085255622864), ('pool', 0.8600395917892456), ('santa', 0.8597794771194458), ('cincinnati', 0.8594452738761902), ('hollywood', 0.8580102920532227), ('charleston', 0.8569580316543579)]
Cosine similarity between 'australia' and 'melbourne' - Skip Gram :  0.5816691
[('hollywood', 0.8261724710464478), ('capitan', 0.8180453181266785), ('dga', 0.8162583112716675), ('monica', 0.8151893615722656), ('gods', 0.809572696685791), ('augusta', 0.805156946182251), ('sands', 0.8041570782661438), ('citywest', 0.8036929368972778), ('lutheran', 0.8020343780517578), ('hangar', 0.800697922706604)]


In [10]:
# FOR GENSIN USING CBOW Manipulations

# enumerate data it is trained on
for i, word in enumerate(model1.wv.vocab):
    if i == 5:
        break
    print(word)

eliminated
masterchef
contestant
harry
foster


In [11]:
len(data)
# Show frequencies
#print("Original List : ",data)
data_flat = []
for line in data:
    for word in line:
        data_flat.append(word)


ctr = collections.Counter(data_flat)
#print("Frequency of the elements in the List : ",ctr)
ctr["the"] # count of word "the"

84611

## Using tf.itf

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [235]:
# Vectorizer
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(clean_articles)

#print(vectorizer.get_feature_names()[:10])
#print(X.shape)
#print(vectorizer.get_stop_words())
#print(vectorizer.get_params(deep=True))

n_articles, n_distinct_words = X.shape
print(n_articles, n_distinct_words)

collect_word_importance = []
#place tf-idf values in a pandas data frame 
for tf_idf_vector_id in range(n_articles):
    
    tf_idf_vector=X[tf_idf_vector_id]
    #print (tf_idf_vector.todense().sum())
    #print (tf_idf_vector.T.todense())
    df = pd.DataFrame(tf_idf_vector.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"]) 
    df_word_importance = df.sort_values(by=["tfidf"],ascending=False)
    word_importance_list = np.array(df_word_importance.index)
    collect_word_importance.append(word_importance_list)


2000 29466


KeyboardInterrupt: 

In [14]:
# Each line corresponds to the highest scored words in the article of same index.
collect_word_importance = np.array(collect_word_importance)
collect_word_importance

array([['melissa', 'masterchef', 'leong', ..., 'findlay', 'findings',
        'zuocheng'],
       ['the', 'burglary', 'bail', ..., 'firmware', 'firms', 'zuocheng'],
       ['the', 'to', 'children', ..., 'flaring', 'flareups', 'zuocheng'],
       ...,
       ['the', 'to', 'in', ..., 'firestorm', 'fires', 'zuocheng'],
       ['the', 'to', 'in', ..., 'firestorm', 'fires', 'zuocheng'],
       ['the', 'to', 'in', ..., 'firestorm', 'fires', 'zuocheng']],
      dtype=object)

In [15]:
# TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

In [16]:
# TfidfTransformer
#TfidfTransformer(*, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)[source]


corpus = ['this is the first document',
           'this document is the second document',
          'and this is the third one',
           'is this the first document']
vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
               'and', 'one']
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
                  ('tfid', TfidfTransformer())]).fit(corpus)
pipe['count'].transform(corpus).toarray()
pipe['tfid'].idf_
pipe.transform(corpus).shape


(4, 8)

In [17]:
pipe = Pipeline([('count', CountVectorizer()),
                  ('tfid', TfidfTransformer())]).fit(clean_articles)
pipe['count'].transform(clean_articles).toarray().shape
print (pipe['tfid'].idf_)
Tfidf_res = pipe.transform(clean_articles)
Tfidf_res.shape

[7.21510797 7.90825515 7.90825515 ... 7.90825515 6.99196442 7.21510797]


(2000, 29466)

In [18]:
Tfidf_res

<2000x29466 sparse matrix of type '<class 'numpy.float64'>'
	with 666150 stored elements in Compressed Sparse Row format>

In [240]:
#### Tutorial

#Dataset and Imports
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
 
# this is a very toy example, do not try this at home unless you want to understand the usage differences 
docs=["the house had a tiny little mouse", 
"the cat saw the mouse", 
"the mouse ran away from the house", 
"the cat finally ate the mouse", 
"the end of the mouse story"
]

In [241]:
# Initialize CountVectorizer
#instantiate CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in your docs 
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape
# 5 texts, 9 distinct words -> gives the count for each word in each text

(5, 16)

In [243]:
word_count_vector[0]

<1x16 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [21]:
#Compute the IDF values
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [22]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
mouse,1.0
the,1.0
cat,1.693147
house,1.693147
ate,2.098612
away,2.098612
end,2.098612
finally,2.098612
from,2.098612
had,2.098612


In [23]:
# Compute the TFIDF score for your documents
# count matrix 
count_vector=cv.transform(docs) #<==> word_count_vector

# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [24]:
tf_idf_vector

<5x16 sparse matrix of type '<class 'numpy.float64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [25]:
feature_names = cv.get_feature_names() 
 
#get tfidf vector for FFFFFFFFFirst document 
first_document_vector=tf_idf_vector[0] 
 
#print the scores (Tf-idf scores of first document)
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


In [26]:
# Tfidfvectorizer Usage - Compute all at Once

 
# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
#fitted_vectorizer=tfidf_vectorizer.fit(docs)               # This method would work too
#tfidf_vectorizer_vectors=fitted_vectorizer.transform(docs)  
 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

In [27]:
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 
 
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)


Unnamed: 0,tfidf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0
