In [2]:
import pandas as pd
import re
import pkg_resources

In [3]:
tweets_df = pd.read_pickle("tweets.pkl")

In [4]:
tweets_df.head()

Unnamed: 0,tweet
0,RT @AnneKPIX: @CDC has activated its emergency...
1,RT @OurWarOnCancer: Where is our FEDERAL vacci...
2,RT @nytimes: Breaking News: The first U.S. cas...
3,RT @ScottAnthonyUSA: ⚠️ IT SHOULD BE NOTED tha...
4,RT @eugenegu: With the CDC confirming the firs...


In [27]:
def replaceRT(tweet):
    return(tweet.replace("RT", ""))



links_regex = re.compile(r'http\S+')
def replaceLinks(tweet):
    
    return(re.sub(links_regex, '', tweet))


twitter_regex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)')
def replaceHandle(tweet):
    
    return(re.sub(twitter_regex, '', tweet))

In [6]:
tweets_df['tweet'] = tweets_df['tweet'].apply(lambda x: replaceRT(x))

In [8]:
tweets_df['tweet'] = tweets_df['tweet'].apply(lambda x: replaceHandle(x))

In [28]:
tweets_df['tweet'] = tweets_df['tweet'].apply(lambda x: replaceLinks(x))

In [87]:
tweets_sampled = tweets_df.sample(500)
tweets_sampled = tweets_sampled.reset_index(drop=True)

In [10]:
'''
Loading Gensim and nltk libraries
'''
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [11]:
stemmer = PorterStemmer()

In [51]:
# Tokenize, stem and removing stopwords

import nltk
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['coronavirus', 'Koronavirus', 'trump', 'covid-19', 'corona', 'covid', 
                  'covid19', 'covd', 'virus', 'pandemic', 'chinese', 'china', 'wuhan'])


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stopwords:
            result.append(stemmer.stem(token))
    return result

[nltk_data] Downloading package stopwords to C:\Users\Devina
[nltk_data]     Parihar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
from langdetect import detect

processed_tweets = []
indexesToRemove = []

for index, tweet in enumerate(tweets_sampled.tweet.tolist()):
    
    try:
        lang = detect(tweet)
        if lang == 'en':
            processed_tweets.append(preprocess(tweet))
        if lang != 'en':
            removeIndex = index
            indexesToRemove.append(removeIndex)
            
    except:
        removeIndex = index
        indexesToRemove.append(removeIndex)
        language = "error"
        print("This tweet throws an error:", tweet)
        

This tweet throws an error: 🤦🏽‍♂️ 
This tweet throws an error:  :         …


In [89]:
tweets_sampled_new = tweets_sampled.drop(tweets_sampled.index[indexesToRemove])


In [91]:
len(processed_tweets)

365

In [78]:
dictionary = gensim.corpora.Dictionary(processed_tweets)

In [84]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(tweet) for tweet in processed_tweets]

In [92]:
# TESTING BIGRAMS

def bigrams(words, bi_min=15):
    bigram = gensim.models.Phrases(words)
    bigram_model = gensim.models.phrases.Phraser(bigram)
    return bigram_model

bigram_model = bigrams(processed_tweets)
bigram = [bigram_model[tweet] for tweet in processed_tweets]

id2word = gensim.corpora.Dictionary(bigram)
id2word.filter_extremes(no_below=10, no_above=0.35)
id2word.compactify()

corpus = [id2word.doc2bow(text) for text in bigram]

In [93]:
lda_model = gensim.models.LdaMulticore(corpus, num_topics = 5, id2word = id2word, passes = 10)

lda_model.print_topics(5,num_words=10)

[(0,
  '0.335*"peopl" + 0.178*"go" + 0.165*"us" + 0.107*"global" + 0.090*"die" + 0.056*"world" + 0.006*"first" + 0.005*"still" + 0.004*"like" + 0.004*"new"'),
 (1,
  '0.157*"case" + 0.156*"use" + 0.147*"test" + 0.139*"first" + 0.135*"spread" + 0.131*"death" + 0.049*"offici" + 0.027*"get" + 0.005*"one" + 0.003*"us"'),
 (2,
  '0.180*"day" + 0.177*"one" + 0.154*"want" + 0.146*"like" + 0.085*"world" + 0.058*"spread" + 0.056*"countri" + 0.044*"get" + 0.030*"report" + 0.008*"first"'),
 (3,
  '0.157*"new" + 0.148*"health" + 0.142*"outbreak" + 0.113*"break" + 0.081*"offici" + 0.070*"still" + 0.069*"case" + 0.058*"report" + 0.043*"like" + 0.042*"global"'),
 (4,
  '0.163*"mask" + 0.162*"mani" + 0.160*"say" + 0.125*"start" + 0.092*"peopl" + 0.078*"countri" + 0.063*"get" + 0.034*"case" + 0.033*"die" + 0.016*"still"')]

In [110]:

#for every english tweet, get the probability it belongs to any of the topics
#save to a new training set

train_probVectors = []

for i in range(len(tweets_sampled_new)):
    top_topics = lda_model.get_document_topics(corpus[i], minimum_probability=0.0)
    topic_probabilities = [top_topics[i][1] for i in range(5)]
    train_probVectors.append(topic_probabilities)


[0.07012172, 0.06666885, 0.066677675, 0.72986263, 0.06666914]

### TESTING LDA ALGORITHM

In [55]:
# LDA multicore 
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 5, id2word = dictionary, passes = 10)

In [56]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.010*"peopl" + 0.008*"outbreak" + 0.008*"amp" + 0.006*"one" + 0.005*"live" + 0.005*"world" + 0.005*"issu" + 0.005*"care" + 0.004*"call" + 0.004*"case"


Topic: 1 
Words: 0.016*"peopl" + 0.010*"break" + 0.010*"death" + 0.008*"outbreak" + 0.007*"case" + 0.005*"see" + 0.005*"new" + 0.005*"spread" + 0.005*"way" + 0.005*"someon"


Topic: 2 
Words: 0.011*"amp" + 0.010*"patient" + 0.008*"day" + 0.008*"case" + 0.008*"confirm" + 0.008*"peopl" + 0.007*"health" + 0.007*"korea" + 0.006*"citi" + 0.005*"break"


Topic: 3 
Words: 0.008*"report" + 0.008*"death" + 0.008*"case" + 0.006*"first" + 0.006*"us" + 0.006*"today" + 0.006*"like" + 0.005*"know" + 0.005*"wrong" + 0.005*"hubei"


Topic: 4 
Words: 0.008*"work" + 0.007*"spread" + 0.006*"prevent" + 0.006*"peopl" + 0.005*"outbreak" + 0.005*"market" + 0.005*"vaccin" + 0.004*"stop" + 0.004*"new" + 0.004*"could"




In [None]:
import pyLDAvis
import pyLDAvis.gensim
for_viz = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(for_viz)

### TESTING NMF ALGORITHM

In [94]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['coronavirus', 'Koronavirus', 'trump', 'covid-19', 'corona', 'covid', 
                  'covid19', 'covd', 'virus', 'pandemic']) #APPEND WHATEVER "STOPWORDS" NEEDED

tfidf_vectorizer = TfidfVectorizer(stop_words= stopwords)
tfidf = tfidf_vectorizer.fit_transform(tweets_sampled_new['tweet'])
#tfidf_feature_names = tfidf_vectorizer.get_feature_names()

[nltk_data] Downloading package stopwords to C:\Users\Devina
[nltk_data]     Parihar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  'stop_words.' % sorted(inconsistent))


In [95]:
nmf = NMF(n_components= 5, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [97]:
topic_values = nmf.transform(tfidf)
tweets_sampled_new['Topic'] = topic_values.argmax(axis=1)

In [98]:
for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['health', 'province', '150', 'hubei', 'reports', 'breaking', 'cases', 'deaths', 'new', 'china']


Top 10 words for topic #1:
['many', 'little', 'white', 'say', 'bad', 'anyways', 'overheard', 'girl', 'starbucks', 'ppl']


Top 10 words for topic #2:
['tangshan', 'government', 'via', 'people', 'reason', 'proximity', 'illness', 'regardless', 'racialised', 'chinese']


Top 10 words for topic #3:
['flying', 'wednesday', 'gov', 'personnel', 'wait', 'boot', 'iran', 'hiding', 'save', 'us']


Top 10 words for topic #4:
['case', 'fr', 'manila', 'capital', 'approximately', 'mins', 'hospital', 'san', 'lazaro', 'away']


