In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import re
import pkg_resources
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem.porter import *
import numpy as np
import nltk
from langdetect import detect



In [2]:
tweets_df = pd.read_pickle("tweets2.pkl")
tweets_df.head()

Unnamed: 0,tweet
0,Murió una mujer de 99 años por el coronavirus....
1,#GOPIdiots is correct https://t.co/QBrncMW2VL
2,Only took 5 https://t.co/1h0tqHaAZX
3,RT @NewDay: Japan’s infection rate is under sc...
4,RT @MyFavsTrash: This is the most Florida twee...


In [11]:
tweets_df.shape

(252000, 1)

In [3]:
def replaceRT(tweet):
    return(tweet.replace("RT", ""))



links_regex = re.compile(r'http\S+')
def replaceLinks(tweet):
    
    return(re.sub(links_regex, '', tweet))


twitter_regex = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)')
def replaceHandle(tweet):
    
    return(re.sub(twitter_regex, '', tweet))

In [4]:
tweets_df['tweet'] = tweets_df['tweet'].apply(lambda x: replaceRT(x))
tweets_df['tweet'] = tweets_df['tweet'].apply(lambda x: replaceHandle(x))
tweets_df['tweet'] = tweets_df['tweet'].apply(lambda x: replaceLinks(x))
tweets_sampled = tweets_df.sample(500)
tweets_sampled = tweets_sampled.reset_index(drop=True)
tweets_sampled

Unnamed: 0,tweet
0,": This video is ""manipulated"" to make it seem..."
1,": As the coronavirus spreads globally, packed..."
2,: Did u forget to mention Sen Feinstein’s ins...
3,They rubber stamped China's lies
4,: How much more fascist can Johnson get - im...
...,...
495,: Wait sooo... the left considers Greta Thunb...
496,: Luego de ver este video que contradice todo...
497,: Cancelling play day ...What’s up ..You guys...
498,Better seat beside Dusbin instead to sit besid...


In [5]:
stemmer = PorterStemmer()

nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['coronavirus', 'Koronavirus', 'trump', 'covid-19', 'corona', 'covid', 
                  'covid19', 'covd', 'virus', 'pandemic', 'chinese', 'china', 'wuhan', 'ncov'])


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stopwords:
            result.append(stemmer.stem(token))
            
    combined_result = ' '.join(result) ###### this is what i changed from your code, devina ######
    return combined_result

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lcruz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
processed_tweets = []
indexesToRemove = []

for index, tweet in enumerate(tweets_sampled.tweet.tolist()):
    
    try:
        lang = detect(tweet)
        if lang == 'en':
            processed_tweets.append(preprocess(tweet))
        if lang != 'en':
            removeIndex = index
            indexesToRemove.append(removeIndex)
            
    except:
        removeIndex = index
        indexesToRemove.append(removeIndex)
        language = "error"
        print("This tweet throws an error:", tweet)

This tweet throws an error:  : କରୋନା ମୁକାବିଲା : ଏପ୍ରିଲ ୩୦ ଯାଏଁ ବଢିଲା ଲକଡାଉନ ଅବଧି, ସରକାରଙ୍କ ନିଷ୍ପତ୍ତିକୁ ନେଇ ଆପଣଙ୍କ ମତ କଣ? #OdishaFightsCoronaVirus 
#COVID1…
This tweet throws an error:  
This tweet throws an error: 😶
This tweet throws an error:  
This tweet throws an error:  : 
This tweet throws an error: ????! 


In [7]:
tweets_sampled_new = tweets_sampled.drop(tweets_sampled.index[indexesToRemove])
print(len(processed_tweets))

308


In [8]:
# the vectorizer object will be used to transform text to vector form
vectorizer = TfidfVectorizer(max_df=0.95, ngram_range = (1,2), token_pattern='\w+|\$[\d\.]+|\S+')
# apply transformation
tf = vectorizer.fit_transform(processed_tweets) #.toarray()
# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()
tf.shape 

(308, 3913)

In [9]:
number_of_topics = 10
model = NMF(n_components=number_of_topics, random_state=45) # random state for reproducibility
# Fit data to model
nmf_output = model.fit_transform(tf)

In [10]:
# Show top n keywords for each topic
def show_topics(vectorizer, model, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer, model, 15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,test,posit,test posit,tiger,tiger test,british,announc,british member,member parliament,parliament test,break british,parliament,member,first,get
Topic 1,lockdown,day,day lockdown,lockdown day,lockdown itali,itali,soon lockdown,soon,mean lockdown,lockdown mean,mean,day guy,play day,guy still,cancel play
Topic 2,peopl,fuck,got,get,shit,act,keep,call,shit got,got peopl,peopl act,work,everi,back,enemi
Topic 3,home,stay,stay home,want,want stay,home vs,forc stay,vs,vs want,forc,nurs,spread,tri,know,live
Topic 4,case,confirm,case day,go case,death,day,wonder,case wonder,go,case canada,presumpt,presumpt case,confirm presumpt,report,march case
Topic 5,distanc,social,social distanc,time,us,practic social,practic,bad,mean,popul,infect,quarantin,dead,suck,dickstanc mean
Topic 6,like,look,look like,like quarantin,seem like,quarantin,seem,headlin,sound,like headlin,sound like,pass,propos,like pass,propos bill
Topic 7,cancel,everyth except,everyth,cancel everyth,except,weekend,peda weekend,except peda,peda,true,loan,except ptptn,ptptn loan,ptptn,true cancel
Topic 8,say,presid,need,make,american,think,take,anyth,demand,care,crisi,need demand,real,die,health
Topic 9,thread,hous,white hous,white,today,bill,dr,fauci,brief,senat,dr fauci,cnn,republican,break,block
