## Purpose: Topic Modeling on Tweets

### Installations

In [None]:
!pip install pyLDAvis

### Ignore warnings

In [1]:
import warnings
# warnings.filterwarnings('always', category=DeprecationWarning)
warnings.filterwarnings('ignore')

### Cleaned Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the cleaned tweet sample; the first spreadsheet column becomes the index.
# To analyse the alternate export, point CLEANED_PATH at 'sample_cleaned1.xlsx'.
CLEANED_PATH = 'sample_cleaned.xlsx'
df = pd.read_excel(CLEANED_PATH, index_col=0)
df.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,lat,long,city,country,continent,state,state_code,collected_at,tweet_lang,tweet_new
0,2020-10-15 00:00:02,-2147483648,"#Trump: As a student I used to hear for years,...",2,1,Twitter Web App,8436472,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",...,45.520247,-122.674195,Portland,United States,North America,Oregon,OR,2020-10-21 00:00:00.746,en,"['trump', 'student', 'use', 'hear', 'year', 't..."
1,2020-10-15 00:00:08,-2147483648,You get a tie! And you get a tie! #Trump ‘s ra...,4,3,Twitter for iPhone,47413798,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",...,38.894992,-77.036558,Washington,United States,North America,District of Columbia,DC,2020-10-21 00:00:01.493,en,"['get', 'tie', 'get', 'tie', 'trump', 'ralli',..."
2,2020-10-15 00:00:17,-2147483648,@CLady62 Her 15 minutes were over long time ag...,2,0,Twitter for Android,1138416104,Farris Flagg,FarrisFlagg,#BidenHarris2020 #JoeBiden2020 #KamalaHarrisFo...,...,33.782519,-117.228648,,United States,North America,California,CA,2020-10-21 00:00:01.866,en,"['cladi', 'minut', 'long', 'time', 'ago', 'oma..."
3,2020-10-15 00:00:18,-2147483648,@DeeviousDenise @realDonaldTrump @nypost There...,0,0,Twitter for iPhone,-2147483648,Stacey Gulledge 🇺🇸 Patriot ♥️ KAG 🙏 👮‍♀️♥️,sm_gulledge,"Patriot, Wife, “Shaken not Stirred” Mom of two...",...,40.225357,-82.68814,,United States,North America,Ohio,OH,2020-10-21 00:00:02.613,en,"['deeviousdenis', 'realdonaldtrump', 'nypost',..."
4,2020-10-15 00:00:20,-2147483648,One of the single most effective remedies to e...,0,0,Twitter Web App,540476889,Jamieo,jamieo33,"Don't know what I am. Can lean left and right,...",...,40.969989,-77.727883,,United States,North America,Pennsylvania,PA,2020-10-21 00:00:02.986,en,"['one', 'singl', 'effect', 'remedi', 'erad', '..."


In [4]:
# Keep only the pre-processed token column for topic modeling.
tweets = df.loc[:, 'tweet_new']
tweets.head()

0    ['trump', 'student', 'use', 'hear', 'year', 't...
1    ['get', 'tie', 'get', 'tie', 'trump', 'ralli',...
2    ['cladi', 'minut', 'long', 'time', 'ago', 'oma...
3    ['deeviousdenis', 'realdonaldtrump', 'nypost',...
4    ['one', 'singl', 'effect', 'remedi', 'erad', '...
Name: tweet_new, dtype: object

### Topic Modeling by LDA
https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

#### Prepare data for LDA Analysis

We start by tokenizing the text and removing stopwords.

In [15]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Project-specific additions to NLTK's English list. Several entries are
# stem-like forms ('presid', 'peopl', 'stori') matching the tokens in tweet_new.
EXTRA_STOP_WORDS = [
    'vote', 'get', 'twitter', 'elect', 'amp', 'one', 'go', 'like', 'nypost', 'would', 'know', 'presid',
    'dont', 'said', 'trump', 'say', 'want', 'post', 'peopl', 'potu', 'million', 'care', 'even', 'stori',
    'watch', 'realdonaldtrump', 'donaldtrump', 'biden', 'joebiden',
    'joe', 'day', 'america', 'american', 'need', 'make', 'donald', 'usa', 'year',
]
stop_words = stopwords.words('english') + EXTRA_STOP_WORDS
def sent_to_words(sentences):
    """Lazily tokenise every item of ``sentences``.

    Each item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``; one list of tokens is yielded per item, in order.
    """
    # deacc=True strips accent marks from the tokens
    yield from (simple_preprocess(str(raw), deacc=True) for raw in sentences)
def remove_stopwords(texts):
    """Return ``texts`` with every stop word removed.

    Each document is converted with ``str()`` and re-tokenised by
    ``simple_preprocess`` before filtering, so a document may be a raw string
    or an already-tokenised list (whose ``str()`` form is tokenised again).

    Parameters
    ----------
    texts : iterable
        Documents to filter.

    Returns
    -------
    list of list of str
        One list of kept tokens per input document, in input order.
    """
    # Build the lookup set once per call: membership tests against the
    # module-level list would rescan the whole list for every token.
    banned = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in banned] for doc in texts]
# Tokenise every tweet, then strip the stop words from the token lists.
data = tweets.tolist()
data_words = remove_stopwords(list(sent_to_words(data)))
# Preview: at most 30 tokens of the first tweet.
print(data_words[0][:30])

[nltk_data] Downloading package stopwords to C:\Users\Lu
[nltk_data]     Xinyi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['student', 'use', 'hear', 'ten', 'heard', 'china', 'mani', 'ask', 'mani', 'sir', 'um']


Then, we convert the tokenized object into a corpus and dictionary.

In [16]:
import gensim.corpora as corpora
# Dictionary mapping every distinct token to an integer id
id2word = corpora.Dictionary(data_words)
# Documents to encode: the stop-word-filtered token lists
texts = data_words
# Bag-of-words encoding, one entry per tweet
corpus = list(map(id2word.doc2bow, texts))
# Preview the first tweet's encoding (at most 30 pairs)
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


### LDA model training

In [17]:
from pprint import pprint
# number of topics
num_topics = 10
# Build LDA model.
# random_state fixes the seed so the topics are not re-randomised on every
# run. NOTE(review): with the multicore trainer small run-to-run differences
# may remain because of worker scheduling — confirm if exact reproducibility
# matters.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the keywords of the fitted topics
pprint(lda_model.print_topics())
# Topic distributions for the training corpus (not used by the cells shown in
# this notebook).
doc_lda = lda_model[corpus]

[(0,
  '0.009*"covid" + 0.006*"famili" + 0.005*"maga" + 0.005*"hunter" + '
  '0.004*"see" + 0.004*"new" + 0.004*"via" + 0.004*"poll" + 0.003*"corrupt" + '
  '0.003*"iowa"'),
 (1,
  '0.008*"covid" + 0.007*"son" + 0.006*"hunter" + 0.005*"support" + '
  '0.005*"republican" + 0.005*"time" + 0.004*"win" + 0.004*"coronaviru" + '
  '0.004*"democrat" + 0.003*"lie"'),
 (2,
  '0.008*"democrat" + 0.005*"bidenharri" + 0.005*"hunterbiden" + 0.004*"maga" '
  '+ 0.004*"covid" + 0.004*"right" + 0.003*"uselect" + 0.003*"win" + '
  '0.003*"see" + 0.003*"countri"'),
 (3,
  '0.012*"covid" + 0.009*"bidenharri" + 0.007*"antitrump" + 0.006*"plea" + '
  '0.005*"coronaviru" + 0.004*"dumptrump" + 0.004*"blm" + 0.004*"kamalaharri" '
  '+ 0.004*"democrat" + 0.004*"job"'),
 (4,
  '0.008*"covid" + 0.005*"debat" + 0.005*"win" + 0.004*"harri" + 0.004*"news" '
  '+ 0.004*"bidenharri" + 0.003*"keep" + 0.003*"lie" + 0.003*"kamalaharri" + '
  '0.003*"never"'),
 (5,
  '0.006*"time" + 0.005*"hunterbiden" + 0.005*"medium" +

### Analyzing LDA model results
Refer to the HTML file in the `topic_modeling_results` directory for the visualization.

In [8]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pickle
import os

# Visualize the topics
pyLDAvis.enable_notebook()
RESULTS_DIR = './topic_modeling_results'
# Create the output folder only when it is missing (no separate exists() check).
os.makedirs(RESULTS_DIR, exist_ok=True)
LDAvis_data_filepath = os.path.join(RESULTS_DIR, 'ldavis_prepared_' + str(num_topics))
# Preparing the visualization is a bit time consuming. Keep this True to
# rebuild it from the current model; set it to False to reuse the pickle
# written by an earlier run.
REBUILD_LDAVIS = True
if REBUILD_LDAVIS:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word, R=10)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # pickle.load can execute arbitrary code: only load files written by this
    # notebook, never a pickle obtained from an untrusted source.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared

ModuleNotFoundError: No module named 'pyLDAvis'

### Compare pre- and post-election day tweets

In [8]:
# Load the two samples compared in this section; the first spreadsheet column
# becomes the index of each frame.
pre, post = (
    pd.read_excel(path, index_col=0)
    for path in ('sample_pre.xlsx', 'sample_post.xlsx')
)
pre.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,lat,long,city,country,continent,state,state_code,collected_at,tweet_lang,tweet_new
0,2020-11-02 06:48:23,-2147483648,"When you write Eric Ciaramella on @Facebook, y...",3,2,Twitter for Android,-2147483648,N😈,MN1202FH,#手足互科,...,22.279328,114.162813,Hong Kong Island,China,Asia,Hong Kong,,2020-11-04 11:02:52.899,en,"['write', 'eric', 'ciaramella', 'facebook', 'd..."
1,2020-11-02 17:46:20,-2147483648,#Demonrats I mean #Democrats are at it AGAIN! ...,0,0,Twitter for iPhone,-2147483648,Tyree Deshane,c4n4bisking,28yr old business minded guy who is all about ...,...,39.952724,-75.163526,Philadelphia,United States,North America,Pennsylvania,PA,2020-11-03 14:26:57.742,en,"['demonrat', 'mean', 'democrat', 'tryna', 'fud..."
2,2020-10-29 17:24:18,-2147483648,🇺🇸 #US President Donald #Trump condemns the #N...,1,0,Twitter Web App,-2147483648,Breaking the News 24/7,Breaking24Seven,"Breaking news in markets, economy, business, p...",...,48.208354,16.372504,Vienna,Austria,Europe,,,2020-10-30 14:48:46.260,en,"['', 'u', 'presid', 'donald', 'trump', 'condem..."
3,2020-10-31 17:52:45,-2147483648,President @realDonaldTrump is Fighting against...,0,0,Twitter for iPhone,19069046,Denise,1AmericanGirl,"USA Patriot 4 Freedom & Liberty, Our USConstit...",...,39.78373,-100.445882,,United States,North America,,,2020-11-01 10:53:06.250,en,"['presid', 'realdonaldtrump', 'fight', 'corrup..."
4,2020-10-29 23:22:09,-2147483648,#Trump2020 #MAGA #Blexit #KAG #WalkAway #Trump...,0,0,Twitter for Android,-2147483648,Shaun Ellis💯 🇺🇸👌,ShaunDEllis1981,Born in Stl MO Now live in NH\nConservative wh...,...,34.096676,-117.719779,,United States,North America,California,CA,2020-10-30 14:47:29.027,en,"['trump', 'maga', 'blexit', 'kag', 'walkaway',..."


In [9]:
# First five rows of the second sample
post.head(n=5)

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,lat,long,city,country,continent,state,state_code,collected_at,tweet_lang,tweet_new
0,2020-11-07 20:53:53,-2147483648,B+I+D+E+N+ date +month+Donald (D)\n2+9+4+5+14+...,2,0,Twitter for Android,-2147483648,लॉर्ड एसपी गौतम,gautam_subbu,"jay Bhim,\nupsc aspirant,Ex resonite,\nmy idea...",...,28.613895,77.209006,New Delhi,India,Asia,Delhi,DL,2020-11-08 10:40:48.300,en,"['biden', 'date', 'monthdonald', 'wahmodijiwah..."
1,2020-11-08 03:09:25,-2147483648,@JoeBiden those there from #India send them ba...,0,0,Twitter for Android,238099048,iDictator911🏹,swap18plus,#animallove #naturelove #computerengg.,...,21.149813,79.082056,Nagpur,India,Asia,Maharashtra,MH,2020-11-09 17:55:17.241,en,"['joebiden', 'india', 'send', 'back', 'prais',..."
2,2020-11-07 22:22:55,-2147483648,#Trump calls 306 electoral votes “a massive la...,1,1,Twitter for iPhone,19215349,Willie Chambers,williechambers,🏄🏻‍♂️🎸🎹⚽️SolanaBeachCA2Catalina2CostaRica/Work...,...,32.99056,-117.269132,,United States,North America,California,CA,2020-11-08 10:44:53.877,en,"['trump', 'call', 'elector', 'vote', 'massiv',..."
3,2020-11-06 03:38:52,-2147483648,鉴于港🐛和🐸🐸这么喜爱川建国，我有个不成熟的想法。万一建国不幸输了，可以参考果党可以移民去台...,1,0,Twitter for iPhone,-2147483648,@大文豪方方钦定极左误国七毛🇨🇳🇨🇳（台辣妹本辣）,cmhpye,中国大陆地区土生土长本土人民，支持蔡省长打倒国民党反动派，并呼吁中国共产党应给民进党给予口头...,...,25.040952,121.613177,Taipei,Taiwan,Asia,,,2020-11-07 11:34:37.301,en,"['鉴于港', '和', '这么喜爱川建国', '我有个不成熟的想法', '万一建国不幸输了..."
4,2020-11-05 23:50:14,-2147483648,He ain't leaving is he?\n Wave goodbye! #Donal...,0,0,Twitter for Android,90998306,Jason webb,wordsbquick,Spark you like and Dad of 4!,...,51.507322,-0.127647,London,United Kingdom,Europe,England,ENG,2020-11-06 09:47:33.111,en,"['aint', 'leav', 'wave', 'goodby', 'donaldtrum..."


In [10]:
# tweet_new appears to come back from the spreadsheet as text (a printed token
# list), so both samples are tokenised again and filtered with stop_words.
# NOTE(review): the saved topic outputs further down still contain words such
# as 'america', 'joe' and 'usa' that are in stop_words, so this cell seems to
# have been run before the stop-word list was extended. Re-run it after the
# stop-word cell.
pre_tweets = pre['tweet_new'].to_list()
post_tweets = post['tweet_new'].to_list()

pre_words, post_words = (
    remove_stopwords(list(sent_to_words(batch)))
    for batch in (pre_tweets, post_tweets)
)

**Pre-election day tweets**

In [18]:
# Build the token dictionary and the bag-of-words corpus for the first sample
id2word_pre = corpora.Dictionary(pre_words)
corpus_pre = list(map(id2word_pre.doc2bow, pre_words))
# Preview the first document's encoding (at most 30 pairs)
print(corpus_pre[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


In [19]:
# NOTE(review): this rebinds num_topics and lda_model, which the pyLDAvis cell
# also reads. Re-running that cell after this one would pair this model with
# the full-sample corpus and id2word.
num_topics = 5
# random_state fixes the seed so a re-run does not reshuffle the topics.
lda_model = gensim.models.LdaMulticore(corpus = corpus_pre, 
                                      id2word = id2word_pre, 
                                      num_topics = num_topics,
                                      random_state = 42)

# print keyword in the topics
pprint(lda_model.print_topics())

[(0,
  '0.009*"democrat" + 0.009*"covid" + 0.005*"poll" + 0.004*"lie" + '
  '0.004*"bidenharri" + 0.004*"think" + 0.004*"make" + 0.004*"support" + '
  '0.004*"america" + 0.004*"maga"'),
 (1,
  '0.005*"maga" + 0.005*"covid" + 0.005*"bidenharri" + 0.004*"uselect" + '
  '0.004*"kamalaharri" + 0.004*"america" + 0.004*"joe" + 0.003*"time" + '
  '0.003*"could" + 0.003*"poll"'),
 (2,
  '0.007*"support" + 0.007*"covid" + 0.006*"maga" + 0.005*"usa" + '
  '0.005*"bidenharri" + 0.005*"win" + 0.004*"uselect" + 0.004*"american" + '
  '0.004*"state" + 0.004*"joe"'),
 (3,
  '0.008*"bidenharri" + 0.006*"covid" + 0.005*"maga" + 0.005*"joe" + '
  '0.005*"win" + 0.004*"america" + 0.004*"make" + 0.004*"voter" + '
  '0.004*"support" + 0.003*"think"'),
 (4,
  '0.008*"bidenharri" + 0.007*"covid" + 0.007*"win" + 0.007*"america" + '
  '0.007*"day" + 0.005*"need" + 0.005*"joe" + 0.004*"year" + 0.004*"usa" + '
  '0.003*"let"')]


In [None]:
### Can't really tell the topics apart.
# Should we still do topic modeling?

**Post-election day tweets**

In [20]:
# Build the token dictionary and the bag-of-words corpus for the second sample
id2word_post = corpora.Dictionary(post_words)
corpus_post = list(map(id2word_post.doc2bow, post_words))
# Preview the first document's encoding (at most 30 pairs)
print(corpus_post[0][:30])

[(0, 1), (1, 1), (2, 1)]


In [21]:
# NOTE(review): this rebinds num_topics and lda_model, which the pyLDAvis cell
# also reads. Re-running that cell after this one would pair this model with
# the full-sample corpus and id2word.
num_topics = 5
# random_state fixes the seed so a re-run does not reshuffle the topics.
lda_model = gensim.models.LdaMulticore(corpus = corpus_post, 
                                      id2word = id2word_post, 
                                      num_topics = num_topics,
                                      random_state = 42)

# print keyword in the topics
pprint(lda_model.print_topics())

[(0,
  '0.006*"usa" + 0.006*"american" + 0.006*"maga" + 0.006*"bidenharri" + '
  '0.005*"usaelect" + 0.005*"america" + 0.005*"count" + 0.005*"uselect" + '
  '0.005*"world" + 0.004*"year"'),
 (1,
  '0.017*"win" + 0.013*"electionresult" + 0.010*"kamalaharri" + 0.008*"state" '
  '+ 0.006*"uselect" + 0.006*"bidenharri" + 0.005*"democrat" + 0.005*"need" + '
  '0.005*"good" + 0.004*"republican"'),
 (2,
  '0.008*"america" + 0.008*"uselect" + 0.005*"georgia" + 0.005*"make" + '
  '0.005*"see" + 0.005*"count" + 0.005*"american" + 0.005*"bidenharri" + '
  '0.005*"kamalaharri" + 0.004*"lead"'),
 (3,
  '0.008*"call" + 0.007*"bidenharri" + 0.007*"uselect" + 0.006*"electionday" + '
  '0.006*"america" + 0.005*"usa" + 0.005*"usaelect" + 0.005*"state" + '
  '0.004*"joe" + 0.004*"donald"'),
 (4,
  '0.019*"uselect" + 0.013*"win" + 0.010*"bidenharri" + 0.008*"joe" + '
  '0.006*"kamalaharri" + 0.005*"state" + 0.005*"electionday" + '
  '0.005*"electionresult" + 0.004*"america" + 0.004*"bidenhari"')]


In [None]:
## Can't really tell these topics apart either.
# Might need to resample the dataset.