In [31]:
# Imports
import pandas as pd
import json
import re
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/clement_turcan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/clement_turcan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Remove bad articles

In [3]:
# Read the json file line by line and append to the dict
data = []
with open('./data/20200420_20200714_business_articles.json') as f:
    for line in f:
        data.append(json.loads(line))


In [4]:
len(data)

416307

In [5]:
# extracting full_text of each article:
articles = []
for line in data:
    articles.append(str(line.get('full-text', None)))

In [13]:
# extracting full_text of each article:
articles = []
min_length = 1000000
min_article_size = 2000
for article in data:
    plain_text = article.get('full-text')
    if (plain_text and "Article `download()` failed" != plain_text[:27] and "Please enable cookies" != plain_text[:21] and len(plain_text)>min_article_size):
        articles.append(plain_text)

In [7]:
len(articles)

215039

## Tokenize the text

In [15]:
# Remove every non-letter character
clean_articles = []
for article in articles[:100]:
    clean_text = article.lower()
    clean_text = re.sub("\r", ' ', clean_text)
    clean_text = re.sub("\n", ' ', clean_text)
    clean_text = re.sub("[^a-z]", ' ', clean_text)
    clean_text = re.sub(r"\s+", ' ', clean_text)
    clean_articles.append(clean_text)
clean_articles[0]

'eliminated masterchef contestant harry foster has hit back at unfair criticism against judge melissa leong the show s first female judge has faced a barrage of trolling with haters taking aim at everything from her behaviour on set to her fashion sense despite being eliminated on tuesday night s episode harry had nothing but good things to say about the melbourne based food writer this could not be further from the truth eliminated masterchef australia contestant harry foster pictured has hit back at unfair criticism against judge melissa leong she s a queen i love her harry told huffpost australia she is energetic passionate and really just vibrant when asked about accusations melissa was rude and biased on the show he said this could not be further from the truth all three judges have received an overwhelmingly positive response from fans but melissa has copped a backlash from a vocal minority she s a queen the show s first female judge has faced a barrage of trolling with haters ta

In [18]:
# Create stopwords list
stopW = stopwords.words('english')
newStopWords = ['r', 't', 'n']
stopW.extend(newStopWords)

tokenized_text = []
for i, article in enumerate(clean_articles):
    token_article = word_tokenize(article)
    token_article_wo_stopwords = [word for word in token_article if word not in stopW]
    tokenized_text.append(token_article_wo_stopwords)

In [19]:
tokenized_text

[['eliminated',
  'masterchef',
  'contestant',
  'harry',
  'foster',
  'hit',
  'back',
  'unfair',
  'criticism',
  'judge',
  'melissa',
  'leong',
  'show',
  'first',
  'female',
  'judge',
  'faced',
  'barrage',
  'trolling',
  'haters',
  'taking',
  'aim',
  'everything',
  'behaviour',
  'set',
  'fashion',
  'sense',
  'despite',
  'eliminated',
  'tuesday',
  'night',
  'episode',
  'harry',
  'nothing',
  'good',
  'things',
  'say',
  'melbourne',
  'based',
  'food',
  'writer',
  'could',
  'truth',
  'eliminated',
  'masterchef',
  'australia',
  'contestant',
  'harry',
  'foster',
  'pictured',
  'hit',
  'back',
  'unfair',
  'criticism',
  'judge',
  'melissa',
  'leong',
  'queen',
  'love',
  'harry',
  'told',
  'huffpost',
  'australia',
  'energetic',
  'passionate',
  'really',
  'vibrant',
  'asked',
  'accusations',
  'melissa',
  'rude',
  'biased',
  'show',
  'said',
  'could',
  'truth',
  'three',
  'judges',
  'received',
  'overwhelmingly',
  'posit

In [22]:
# Create the Word2Vec Model
word2vec = Word2Vec(tokenized_text, min_count=2)
sim_words = word2vec.wv.most_similar('australia')
sim_words

[('said', 0.9930940866470337),
 ('new', 0.9927358031272888),
 ('back', 0.9925839900970459),
 ('u', 0.9925734996795654),
 ('government', 0.9925510287284851),
 ('people', 0.9924513697624207),
 ('lockdown', 0.9923855066299438),
 ('year', 0.9923778176307678),
 ('first', 0.9923532009124756),
 ('law', 0.992314338684082)]

In [24]:
# Build a TfidfVectorizer model
from sklearn.feature_extraction.text import TfidfVectorizer

def identity_tokenizer(text):
  return text

tfidf_vec = TfidfVectorizer(tokenizer=identity_tokenizer ,lowercase=False)
X = tfidf_vec.fit_transform(tokenized_text)

In [36]:
# Print the tfidf values for the first doc (article)
first_article_tfidf = X[0]

# Place tf-idf values in a pandas DataFrame 
df = pd.DataFrame(first_article_tfidf.T.todense(), 
                  index=tfidf_vec.get_feature_names(), 
                  columns=["tfidf"]) 
# Sort the value in ascending order
df = df.sort_values(by=["tfidf"],ascending=False)
# Show top 5 most important words in the article in relation to the corpus
df.head()

Unnamed: 0,tfidf
melissa,0.41675
masterchef,0.312563
harry,0.260469
judge,0.223788
leong,0.208375
