In [None]:
import pandas as pd
import re
import pkg_resources

In [None]:
# Load the tweets DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load "tweets.pkl"
# if it comes from a trusted source.
tweets_df = pd.read_pickle("tweets.pkl")

In [None]:
# Preview the first rows of the loaded frame.
tweets_df.head()

In [None]:
def replaceRT(tweet):
    """Remove the standalone retweet marker "RT" from a tweet.

    Only "RT" as a whole, upper-case token is removed. The letters "RT"
    inside a longer word (e.g. "SMART", "ARTICLE") are left untouched,
    which a plain substring replace would corrupt.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with every standalone "RT" token deleted. Surrounding
        whitespace is preserved.
    """
    # \b anchors keep the match from firing in the middle of a word.
    return re.sub(r'\bRT\b', '', tweet)

In [None]:
# Strip the retweet marker from every tweet (function passed directly, no lambda wrapper).
tweets_df['tweet'] = tweets_df['tweet'].apply(replaceRT)

In [None]:
# Matches an @mention that is NOT glued to a preceding word character, '.',
# or '-' (so e-mail addresses such as foo@bar.com are left alone).
# Group 1 captures the handle without the '@'.
# The handle may start with a letter, digit or underscore and may be a single
# character long, so mentions like @_name, @9GAG or @a are matched too.
twitter_regex = re.compile(r'(?<![A-Za-z0-9_.\-])@([A-Za-z0-9_][A-Za-z0-9_\-]*)')

def replaceHandle(tweet):
    """Remove Twitter @mentions from a tweet.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet with every matched "@handle" deleted. Surrounding
        whitespace and punctuation are preserved.
    """
    return twitter_regex.sub('', tweet)

In [None]:
# Strip @mentions from every tweet (function passed directly, no lambda wrapper).
tweets_df['tweet'] = tweets_df['tweet'].apply(replaceHandle)

In [None]:
# Draw a random subset of tweets for topic modelling.
# An explicit random_state makes the draw repeatable on a fresh kernel (the
# NumPy seed is only set in a later cell, after this sample is taken).
# The size is capped at the frame length so a frame with fewer rows than
# SAMPLE_SIZE is used in full instead of raising ValueError.
SAMPLE_SIZE = 50000
SAMPLE_SEED = 400
tweets_sampled = tweets_df.sample(
    n=min(SAMPLE_SIZE, len(tweets_df)),
    random_state=SAMPLE_SEED,
)

In [None]:
'''
Loading Gensim and nltk libraries
'''
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# NOTE(review): wildcard import. PorterStemmer (used to build `stemmer`) is
# the only name this notebook visibly takes from it; consider importing it
# explicitly once the rest of the notebook has been checked.
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random state. Only code that runs after this cell and
# draws from that global state is affected.
np.random.seed(400)

In [None]:
# Shared Porter stemmer instance; preprocess() calls stemmer.stem() on each kept token.
stemmer = PorterStemmer()

In [None]:
# Tokenize, drop stopwords, then stem what is left
def preprocess(text):
    """Turn one document into a list of stemmed, non-stopword tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens found in
    gensim's ``STOPWORDS`` set are discarded and the remaining ones are stemmed
    with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Document (tweet) text.

    Returns
    -------
    list
        Stemmed tokens in their original order.
    """
    stopword_set = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [stemmer.stem(word) for word in tokens if word not in stopword_set]

In [None]:
# One token list per sampled tweet, in the sample's row order.
processed_tweets = [preprocess(text) for text in tweets_sampled.tweet.tolist()]

In [None]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenized tweets.
dictionary = gensim.corpora.Dictionary(processed_tweets)

In [None]:
# Bag-of-words corpus: every tokenized tweet is passed through
# dictionary.doc2bow, which reports which dictionary words occur in the tweet
# and how many times. The per-tweet results are collected in 'bow_corpus'.
bow_corpus = list(map(dictionary.doc2bow, processed_tweets))

### TESTING LDA ALGORITHM

In [None]:
# LDA multicore: train a 4-topic model on the bag-of-words corpus
# (10 passes over the data) and keep it in 'lda_model'.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=4,
    passes=10,
)

In [None]:
import pyLDAvis
# The gensim adapter module is called `gensim_models` in newer pyLDAvis
# releases and `gensim` in older ones. Try the new name first and fall back,
# so the cell runs on either version.
try:
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    import pyLDAvis.gensim as gensim_vis

# Prepare the interactive topic visualisation for the trained LDA model and
# render it as the cell output.
for_viz = gensim_vis.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(for_viz)

### TESTING NMF ALGORITHM

In [None]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk

# Fetch the NLTK stopword corpus (needed for the lookup below).
nltk.download('stopwords')

# English stopwords plus corpus-specific terms; extend the trailing list with
# whatever extra "stopwords" are needed.
stopwords = nltk.corpus.stopwords.words('english') + ['coronavirus']

# TF-IDF matrix over the cleaned text of the sampled tweets.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords)
tfidf = tfidf_vectorizer.fit_transform(tweets_sampled['tweet'])

In [None]:
# Fit a 3-component NMF model on the TF-IDF matrix.
# NOTE(review): `alpha` is deprecated in newer scikit-learn releases in favour
# of `alpha_W` / `alpha_H` - confirm against the installed version.
nmf = NMF(
    n_components=3,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

In [None]:
# Per-tweet topic weights from the fitted NMF model (one row per TF-IDF row).
topic_values = nmf.transform(tfidf)
# The TF-IDF matrix was built from tweets_sampled, so the dominant-topic label
# belongs on that frame. `df_tweets` is never defined in this notebook and
# raised NameError.
tweets_sampled['Topic'] = topic_values.argmax(axis=1)

In [None]:
# Resolve the vocabulary once instead of rebuilding it for every printed word.
# Newer scikit-learn releases expose get_feature_names_out(); older ones only
# have get_feature_names(), so use whichever is available.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# For each NMF topic, print the 10 highest-weighted words (ascending weight).
for topic_idx, topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{topic_idx}:')
    print([feature_names[word_idx] for word_idx in topic.argsort()[-10:]])
    print('\n')