In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy (for lemmatization only)
import spacy

# plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# Prepare Stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use'])

In [4]:
# Import Datataset
df = pd.read_csv(r'C:\Users\Lenovo\forward_school\FNI-project\Task2-Data_analysis_visualization-[EDA]\cleaned_lemmatized_tweets_data.csv')
df

Unnamed: 0.1,Unnamed: 0,id,date,tweet,hashtags,username,place,geo,timezone,parse_tweet
0,0,1353492222344392707,2021-01-25 07:57:47,idk bothering alcohol inflation time high rn,[],MphoKgosidialwa,,,800,idk bother alcohol inflation time high rn
1,1,1353492195827965952,2021-01-25 07:57:40,inflation much money means prices rise eventua...,[],AllenWi92526840,,,800,inflation much money mean price rise eventuall...
2,2,1353492090311720960,2021-01-25 07:57:15,exerts daily fix talk inflation roll deafening...,"['fomc', 'trading']",ChrisWeston_PS,,,800,exert daily fix talk inflation roll deafen exu...
3,3,1353492059370479617,2021-01-25 07:57:08,regards michellins explains recent price rises...,[],morejunkfromu,,,800,regard michellin explain recent price rise inf...
4,4,1353492005008113665,2021-01-25 07:56:55,oh mike since clearly unaware inflation levels...,[],apecapital,,,800,oh mike since clearly unaware inflation level ...
...,...,...,...,...,...,...,...,...,...,...
503,861,1430316314158006274,2021-08-25 07:49:17,dollar day years adding inflation make dollar...,[],1noTon1,,,800,dollar day year add inflation make dollar ye...
504,862,1430316301457764353,2021-08-25 07:49:14,obviously felt inflation gas price increase wh...,[],JeffLaBoda6,,,800,obviously feel inflation gas price increase wh...
505,863,1430316275729903629,2021-08-25 07:49:08,ted sorry never studied history gerald ford wi...,[],TxMelinda,,,800,ted sorry never study history gerald ford with...
506,864,1430316167609020423,2021-08-25 07:48:43,inflation running hot,[],mawfunx2,,,800,inflation run hot


In [5]:
# Convert column 'parse tweet' to list
data = df.parse_tweet.values.tolist()

data

['idk bother alcohol inflation time high rn',
 'inflation much money mean price rise eventually gas gallon',
 'exert daily fix talk inflation roll deafen exuberance equity market sin consensus camp see sign real yield rise amp fed tolerate steep curve point usd fomc trading',
 'regard michellin explain recent price rise inflation aye',
 'oh mike since clearly unaware inflation level cheat sheet',
 'inflation trudeau care finance priority feminist gender gay right abortion progressive leftist political correctness wake climate diversity refugee migrant debt deficit radar',
 'reserve mean anything inflation',
 'macri try volker thing embrace full dollarization despite borrow enough usd peronist world mmt er lower rate assertion true argentina inflation fall cut rate',
 'people low skill welfare food stamp raise minimum every state every state need cost living either cause inflation well',
 'oh yeah buy eth hedge inflation bitcoin oh yeah eth hedge btc uselessness',
 'hey answer question 

In [6]:
pprint(data[:1])

['idk bother alcohol inflation time high rn']


In [7]:
# Tokenize words and cleanup text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations (deaccent)

data_words = list(sent_to_words(data))
print(data_words[:1])


[['idk', 'bother', 'alcohol', 'inflation', 'time', 'high', 'rn']]


In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [9]:
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['idk', 'bother', 'alcohol', 'inflation', 'time', 'high', 'rn']


In [10]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[doc] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
# Remove Stop Words
data_words_nostopwords = remove_stopwords(data_words)

# Create Bigrams
data_words_bigrams = make_bigrams(data_words_nostopwords)

In [12]:
# initialize spacy 'en_core_web_sm' model and only use tagger to preserve time/memory
nlp = spacy.load('en_core_web_sm',disable=['parser','ner'])

In [13]:
data_words_bigrams[:1]

[['idk', 'bother', 'alcohol', 'inflation', 'time', 'high', 'rn']]

In [14]:
# Do lemmatization
data_lemmatized = lemmatization(data_words_bigrams,allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [15]:
print(data_lemmatized[:1])

[['bother', 'alcohol', 'inflation', 'time', 'high']]


In [16]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Create Term Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [17]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]]

[[('alcohol', 1), ('bother', 1), ('high', 1), ('inflation', 1), ('time', 1)],
 [('inflation', 1),
  ('eventually', 1),
  ('gallon', 1),
  ('gas', 1),
  ('mean', 1),
  ('money', 1),
  ('much', 1),
  ('price', 1),
  ('rise', 1)],
 [('inflation', 1),
  ('rise', 1),
  ('amp', 1),
  ('camp', 1),
  ('consensus', 1),
  ('curve', 1),
  ('daily', 1),
  ('equity', 1),
  ('exert', 1),
  ('exuberance', 1),
  ('fix', 1),
  ('fomc', 1),
  ('market', 1),
  ('point', 1),
  ('real', 1),
  ('roll', 1),
  ('see', 1),
  ('sign', 1),
  ('sin', 1),
  ('steep', 1),
  ('talk', 1),
  ('tolerate', 1),
  ('trading', 1),
  ('usd', 1),
  ('yield', 1)],
 [('inflation', 1),
  ('price', 1),
  ('rise', 1),
  ('aye', 1),
  ('explain', 1),
  ('michellin', 1),
  ('recent', 1),
  ('regard', 1)],
 [('inflation', 1),
  ('cheat', 1),
  ('clearly', 1),
  ('level', 1),
  ('sheet', 1),
  ('unaware', 1)],
 [('inflation', 1),
  ('abortion', 1),
  ('care', 1),
  ('climate', 1),
  ('correctness', 1),
  ('debt', 1),
  ('deficit', 1)

In [18]:
# Building the Topic Model using LDA - fit the model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, random_state=100, update_every=1, chunksize=100,passes=10, alpha='auto',per_word_topics=True)

In [19]:
pprint(lda_model.print_topics())

[(0,
  '0.046*"increase" + 0.043*"inflation" + 0.020*"continue" + 0.020*"home" + '
  '0.019*"shoot" + 0.018*"remove" + 0.017*"moon" + 0.016*"price" + 0.009*"see" '
  '+ 0.009*"big"'),
 (1,
  '0.053*"inflation" + 0.042*"dollar" + 0.034*"year" + 0.029*"back" + '
  '0.016*"take" + 0.016*"already" + 0.016*"month" + 0.015*"make" + '
  '0.015*"live" + 0.014*"feed"'),
 (2,
  '0.054*"inflation" + 0.029*"get" + 0.026*"gas" + 0.025*"amp" + 0.020*"price" '
  '+ 0.017*"also" + 0.015*"debt" + 0.015*"people" + 0.015*"government" + '
  '0.014*"obviously"'),
 (3,
  '0.037*"inflation" + 0.022*"hit" + 0.021*"ago" + 0.010*"market" + '
  '0.010*"policy" + 0.008*"bitcoin" + 0.008*"long" + 0.007*"tell" + '
  '0.007*"business" + 0.007*"monetary"'),
 (4,
  '0.110*"inflation" + 0.019*"add" + 0.018*"pay" + 0.014*"low" + 0.014*"food" '
  '+ 0.013*"buy" + 0.013*"stop" + 0.012*"day" + 0.012*"leave" + 0.012*"run"')]


In [20]:
doc_lda = lda_model[corpus] ## transform the model

In [21]:
# Compute Model Perplexity and Coherence Score
# Judge how too the given topic is

# Perplexity Score. The lower, the better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Coherence Score. The higher, the better.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.590123206077279

Coherence Score:  0.4121955896734034


In [22]:
# Visualize topic-keywords

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds')
vis