## Purpose: Topic Modeling on Tweets

### Installations

In [1]:
!pip install pyLDAvis



### Ignore warnings

In [10]:
import warnings
warnings.filterwarnings('always', category=DeprecationWarning)

### Cleaned Dataset

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
df = pd.read_excel('sample_cleaned.xlsx', index_col = 0)
df.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,lat,long,city,country,continent,state,state_code,collected_at,tweet_lang,tweet_new
0,2020-10-15 00:00:02,-2147483648,"#Trump: As a student I used to hear for years,...",2,1,Twitter Web App,8436472,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",...,45.520247,-122.674195,Portland,United States,North America,Oregon,OR,2020-10-21 00:00:00.746,en,"['trump', 'student', 'use', 'hear', 'year', 't..."
1,2020-10-15 00:00:08,-2147483648,You get a tie! And you get a tie! #Trump ‘s ra...,4,3,Twitter for iPhone,47413798,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",...,38.894992,-77.036558,Washington,United States,North America,District of Columbia,DC,2020-10-21 00:00:01.493,en,"['get', 'tie', 'get', 'tie', 'trump', 'ralli',..."
2,2020-10-15 00:00:17,-2147483648,@CLady62 Her 15 minutes were over long time ag...,2,0,Twitter for Android,1138416104,Farris Flagg,FarrisFlagg,#BidenHarris2020 #JoeBiden2020 #KamalaHarrisFo...,...,33.782519,-117.228648,,United States,North America,California,CA,2020-10-21 00:00:01.866,en,"['cladi', 'minut', 'long', 'time', 'ago', 'oma..."
3,2020-10-15 00:00:18,-2147483648,@DeeviousDenise @realDonaldTrump @nypost There...,0,0,Twitter for iPhone,-2147483648,Stacey Gulledge 🇺🇸 Patriot ♥️ KAG 🙏 👮‍♀️♥️,sm_gulledge,"Patriot, Wife, “Shaken not Stirred” Mom of two...",...,40.225357,-82.68814,,United States,North America,Ohio,OH,2020-10-21 00:00:02.613,en,"['deeviousdenis', 'realdonaldtrump', 'nypost',..."
4,2020-10-15 00:00:20,-2147483648,One of the single most effective remedies to e...,0,0,Twitter Web App,540476889,Jamieo,jamieo33,"Don't know what I am. Can lean left and right,...",...,40.969989,-77.727883,,United States,North America,Pennsylvania,PA,2020-10-21 00:00:02.986,en,"['one', 'singl', 'effect', 'remedi', 'erad', '..."


In [5]:
tweets = df['tweet_new']
tweets.head()

0    ['trump', 'student', 'use', 'hear', 'year', 't...
1    ['get', 'tie', 'get', 'tie', 'trump', 'ralli',...
2    ['cladi', 'minut', 'long', 'time', 'ago', 'oma...
3    ['deeviousdenis', 'realdonaldtrump', 'nypost',...
4    ['one', 'singl', 'effect', 'remedi', 'erad', '...
Name: tweet_new, dtype: object

### Topic Modeling by LDA
https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

#### Prepare data for LDA Analysis

We start by tokenizing the text and removing stopwords.

In [6]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['vote', 'get', 'twitter', 'elect', 'amp', 'one', 'go', 'like', 'nypost', 'would', 'know', 'presid', 
                   'dont', 'said', 'trump', 'say', 'want', 'post', 'peopl', 'potu', 'million', 'care', 'even', 'stori',
                   'watch', 'realdonaldtrump', 'donaldtrump', 'biden', 'joebiden'])
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
data = tweets.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['student', 'use', 'hear', 'year', 'ten', 'year', 'heard', 'china', 'mani', 'ask', 'mani', 'sir', 'um']


Then, we convert the tokenized object into a corpus and dictionary.

In [7]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2)]


### LDA model training

In [8]:
from pprint import pprint
# number of topics
num_topics = 10
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.012*"covid" + 0.008*"democrat" + 0.007*"impeach" + 0.006*"bidenharri" + '
  '0.006*"antitrump" + 0.005*"corrupt" + 0.005*"campaign" + 0.004*"joe" + '
  '0.004*"hunterbiden" + 0.004*"son"'),
 (1,
  '0.005*"bidenharri" + 0.005*"year" + 0.005*"lie" + 0.005*"american" + '
  '0.004*"give" + 0.004*"keep" + 0.004*"debat" + 0.004*"support" + '
  '0.004*"corrupt" + 0.004*"time"'),
 (2,
  '0.006*"covid" + 0.005*"news" + 0.004*"year" + 0.004*"support" + 0.004*"nbc" '
  '+ 0.004*"look" + 0.004*"bidenharri" + 0.004*"plea" + 0.004*"never" + '
  '0.003*"coronaviru"'),
 (3,
  '0.012*"covid" + 0.008*"coronaviru" + 0.007*"hunterbiden" + 0.006*"usa" + '
  '0.006*"bidenharri" + 0.005*"rp" + 0.005*"epstein" + 0.005*"hunter" + '
  '0.005*"joe" + 0.005*"ue"'),
 (4,
  '0.008*"hunterbiden" + 0.006*"covid" + 0.005*"lie" + 0.004*"via" + '
  '0.004*"maga" + 0.004*"nbc" + 0.004*"hunter" + 0.004*"china" + 0.004*"see" + '
  '0.004*"corrupt"'),
 (5,
  '0.005*"joe" + 0.005*"corrupt" + 0.005*"day" + 0.005*"to

### Analyzing LDA model results
Refer to the html file in the topic_modeling_results directory for the visualization

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pickle
import os

# Visualize the topics
pyLDAvis.enable_notebook()
if not os.path.exists('./topic_modeling_results/'):
    os.mkdir('./topic_modeling_results')
LDAvis_data_filepath = os.path.join('./topic_modeling_results/ldavis_prepared_'+str(num_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word, R=10)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './topic_modeling_results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared