# Preliminary "Base" Topic Modeling
Citation:
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
from src.db_client import DBClient
import pandas as pd
from gensim import corpora, models
import ast

Here's the client!


In [2]:
db = DBClient(secrets_path = "configs/db_secrets.json")

Connected to political tweets DB


In [3]:
data_query = """
with random_tweets as (
    select tweet_text_clean, Random() from staging.{}
    where tweet_date between '2018-01-01' and '2019-01-01'
    order by Random()
    limit 10000)
select tweet_text_clean 
from random_tweets;
"""

In [4]:
def get_word_counts(bow_corpus):
    counts = {}
    for bow in bow_corpus:
        for word in bow:
            if word[0] not in counts.keys():
                counts[word[0]] = 0
            counts[word[0]] += word[1]
    return [(k, counts[k]) for k in sorted(counts, key=counts.get, reverse=True)]

def print_word_counts(word_counts, num_words):
    for tup in word_counts[0:num_words]:
        print(f"{dem_dict[tup[0]]}, {tup[1]} times")

## Democrats

### Preparation

In [5]:
dem_tweets = pd.DataFrame(db.read(data_query.format("democrat")))
dem_docs = [ast.literal_eval(doc) for doc in  dem_tweets[0].tolist()]
dem_dict = corpora.Dictionary(dem_docs)

In [6]:
# Alter no_above to filter out frequently occuring words
dem_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)
dem_bow_corpus = [dem_dict.doc2bow(doc) for doc in dem_docs]

### Top Word Counts

In [7]:
dem_counts = get_word_counts(dem_bow_corpus)
print_word_counts(dem_counts, 50)

#resist, 3490 times
#resistance, 1308 times
#bluewave, 1208 times
#voteblue, 1190 times
#democrats, 1188 times
#bluewave2018, 1101 times
vote, 993 times
@realdonaldtrump, 979 times
amp, 904 times
trump, 903 times
get, 756 times
go, 580 times
like, 575 times
#theresistance, 550 times
people, 526 times
dont, 473 times
#maga, 473 times
make, 444 times
#votethemout, 431 times
need, 422 times
follow, 419 times
#trump, 409 times
say, 385 times
one, 378 times
want, 373 times
know, 362 times
time, 360 times
#fbr, 341 times
good, 340 times
take, 333 times
let, 328 times
#impeachtrump, 325 times
back, 316 times
right, 308 times
come, 294 times
think, 288 times
see, 286 times
#vote, 280 times
please, 263 times
america, 262 times
democrat, 260 times
day, 259 times
would, 256 times
country, 250 times
lie, 248 times
#trumprussia, 248 times
@gop, 243 times
im, 243 times
election, 224 times
party, 221 times


### Train Models

In [8]:
dem_tfidf = models.TfidfModel(dem_bow_corpus)
dem_corpus_tfidf = dem_tfidf[dem_bow_corpus]

In [9]:
dem_lda_model = models.LdaMulticore(dem_bow_corpus, num_topics=10, id2word=dem_dict, passes=2, workers=2)
dem_lda_model_tfidf = models.LdaMulticore(dem_corpus_tfidf, num_topics=10, id2word=dem_dict, passes=2, workers=4)

### Print Categories

In [10]:
for idx, topic in dem_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.032*"#resist" + 0.024*"#democrats" + 0.023*"#voteblue" + 0.021*"@realdonaldtrump" + 0.019*"#vote" + 0.015*"vote" + 0.013*"#bluewave" + 0.011*"say" + 0.011*"#resistance" + 0.009*"amp"
Topic: 1 
Words: 0.064*"#resist" + 0.044*"#resistance" + 0.027*"#theresistance" + 0.019*"#trumprussia" + 0.015*"#bluewave2018" + 0.014*"@realdonaldtrump" + 0.013*"#democrats" + 0.011*"#impeachtrump" + 0.011*"#maga" + 0.010*"trump"
Topic: 2 
Words: 0.058*"#resist" + 0.029*"@realdonaldtrump" + 0.023*"trump" + 0.019*"#democrats" + 0.012*"#bluewave2018" + 0.007*"vote" + 0.007*"time" + 0.006*"dont" + 0.006*"#bluewave" + 0.006*"lie"
Topic: 3 
Words: 0.039*"#resist" + 0.030*"#resistance" + 0.024*"#bluewave2018" + 0.019*"amp" + 0.014*"follow" + 0.013*"#voteblue" + 0.012*"#democrats" + 0.010*"#fbr" + 0.010*"trump" + 0.009*"#impeachtrump"
Topic: 4 
Words: 0.103*"#resist" + 0.022*"amp" + 0.016*"#maga" + 0.015*"trump" + 0.014*"open" + 0.012*"letter" + 0.012*"#voteblue" + 0.012*"@realdonaldtrump" + 0

In [11]:
for idx, topic in dem_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.011*"#resist" + 0.009*"#democrats" + 0.008*"poll" + 0.006*"#voteblue" + 0.006*"country" + 0.006*"thank" + 0.006*"vote" + 0.006*"trump" + 0.006*"#bluewave" + 0.005*"must"
Topic: 1 
Words: 0.085*"#resist" + 0.013*"#resistbot" + 0.013*"letter" + 0.013*"open" + 0.012*"#bluewave2018" + 0.009*"#bluewave" + 0.008*"@realdonaldtrump" + 0.007*"get" + 0.007*"#resistência" + 0.007*"#resistance"
Topic: 2 
Words: 0.014*"vote" + 0.012*"#resist" + 0.011*"#trumpresign" + 0.010*"#resistance" + 0.009*"#democrats" + 0.008*"#trump" + 0.007*"#bluewave" + 0.007*"@realdonaldtrump" + 0.006*"amp" + 0.006*"#bluewave2018"
Topic: 3 
Words: 0.018*"#democrats" + 0.011*"#resistance" + 0.010*"#bluewave" + 0.010*"#resist" + 0.009*"please" + 0.008*"@realdonaldtrump" + 0.007*"vote" + 0.007*"#voteblue" + 0.007*"#walkaway" + 0.007*"via"
Topic: 4 
Words: 0.017*"#votethemout" + 0.016*"#voteblue" + 0.013*"#resist" + 0.010*"#maga" + 0.010*"@realdonaldtrump" + 0.009*"#resistance" + 0.009*"yes" + 0.009*"#fbr" 

## Republicans

### Preparation

In [12]:
rep_tweets = pd.DataFrame(db.read(data_query.format("republican")))
rep_docs = [ast.literal_eval(doc) for doc in  rep_tweets[0].tolist()]
rep_dict = corpora.Dictionary(dem_docs)

In [13]:
# Adjust no_above to filter frequently occuring words
rep_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)

In [14]:
rep_bow_corpus = [rep_dict.doc2bow(doc) for doc in rep_docs]

### Top Word Counts

In [15]:
rep_counts = get_word_counts(rep_bow_corpus)
print_word_counts(rep_counts, 50)

#maga, 5080 times
#trump, 2742 times
@realdonaldtrump, 1453 times
trump, 1035 times
amp, 925 times
get, 627 times
@potus, 575 times
#gop, 570 times
#qanon, 555 times
go, 530 times
president, 489 times
say, 474 times
people, 450 times
like, 444 times
know, 430 times
vote, 422 times
make, 413 times
follow, 403 times
one, 383 times
dont, 366 times
#kag, 359 times
america, 356 times
time, 353 times
want, 332 times
need, 319 times
back, 316 times
think, 305 times
take, 292 times
would, 292 times
see, 283 times
#trumptrain, 281 times
great, 277 times
good, 276 times
american, 276 times
#wwg1wga, 266 times
right, 259 times
country, 257 times
keep, 242 times
im, 240 times
come, 235 times
let, 230 times
de, 227 times
lie, 222 times
via, 219 times
thank, 215 times
work, 212 times
#walkaway, 211 times
please, 211 times
#tcot, 210 times
well, 209 times


### Train Models

In [16]:
rep_tfidf = models.TfidfModel(rep_bow_corpus)
rep_corpus_tfidf = rep_tfidf[rep_bow_corpus]

In [17]:
rep_lda_model = models.LdaMulticore(rep_bow_corpus, num_topics=10, id2word=rep_dict, passes=2, workers=2)
rep_lda_model_tfidf = models.LdaMulticore(rep_corpus_tfidf, num_topics=10, id2word=rep_dict, passes=2, workers=4)

### Print Categories

In [18]:
for idx, topic in rep_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.137*"#maga" + 0.035*"@realdonaldtrump" + 0.028*"follow" + 0.020*"#qanon" + 0.016*"#votered" + 0.015*"vote" + 0.012*"back" + 0.012*"#kag" + 0.011*"#greatawakening" + 0.010*"patriot"
Topic: 1 
Words: 0.060*"#maga" + 0.024*"amp" + 0.021*"make" + 0.021*"#trump" + 0.016*"@realdonaldtrump" + 0.013*"lie" + 0.010*"trump" + 0.009*"get" + 0.008*"great" + 0.008*"time"
Topic: 2 
Words: 0.066*"#maga" + 0.029*"#trump" + 0.016*"trump" + 0.015*"america" + 0.011*"amp" + 0.011*"come" + 0.010*"keep" + 0.009*"know" + 0.009*"@realdonaldtrump" + 0.008*"would"
Topic: 3 
Words: 0.063*"#maga" + 0.023*"amp" + 0.022*"#trump" + 0.022*"get" + 0.014*"@realdonaldtrump" + 0.013*"want" + 0.010*"let" + 0.010*"follow" + 0.009*"trump" + 0.009*"say"
Topic: 4 
Words: 0.057*"#trump" + 0.028*"#maga" + 0.026*"amp" + 0.017*"trump" + 0.016*"#gop" + 0.015*"people" + 0.014*"take" + 0.014*"get" + 0.011*"one" + 0.008*"news"
Topic: 5 
Words: 0.095*"#trump" + 0.048*"#maga" + 0.029*"@realdonaldtrump" + 0.025*"de" + 

In [19]:
for idx, topic in rep_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.036*"#maga" + 0.014*"#trump" + 0.013*"right" + 0.010*"dont" + 0.009*"trump" + 0.009*"@realdonaldtrump" + 0.009*"via" + 0.008*"im" + 0.007*"#americafirst" + 0.007*"watch"
Topic: 1 
Words: 0.040*"#trump" + 0.028*"#maga" + 0.022*"#kag" + 0.012*"@realdonaldtrump" + 0.009*"#buildthewall" + 0.008*"amp" + 0.008*"#trumptrain" + 0.007*"get" + 0.007*"#draintheswamp" + 0.007*"president"
Topic: 2 
Words: 0.034*"#trump" + 0.023*"de" + 0.012*"trump" + 0.010*"que" + 0.009*"#news" + 0.009*"la" + 0.008*"#maga" + 0.007*"think" + 0.007*"#gop" + 0.006*"one"
Topic: 3 
Words: 0.017*"#maga" + 0.015*"#trump" + 0.014*"@realdonaldtrump" + 0.012*"#wwg1wga" + 0.010*"#walkaway" + 0.010*"la" + 0.009*"#qanon" + 0.009*"#kag" + 0.008*"@potus" + 0.008*"job"
Topic: 4 
Words: 0.027*"#maga" + 0.022*"@realdonaldtrump" + 0.017*"#trumptrain" + 0.011*"#trump" + 0.010*"@foxnews" + 0.010*"trump" + 0.009*"amp" + 0.008*"get" + 0.008*"de" + 0.007*"#redwave"
Topic: 5 
Words: 0.046*"#trump" + 0.012*"#gop" + 0.009*

# House

In [20]:
house_tweets = pd.DataFrame(db.read(data_query.format("house")))
house_docs = [ast.literal_eval(doc) for doc in  house_tweets[0].tolist()]
house_dict = corpora.Dictionary(house_docs)

In [21]:
house_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)
house_bow_corpus = [house_dict.doc2bow(doc) for doc in house_docs]

### Top Word Counts

In [22]:
house_counts = get_word_counts(house_bow_corpus)
print_word_counts(house_counts, 50)

@senategop, 1243 times
win, 874 times
@jimjordan, 822 times
line, 821 times
session, 802 times
@donaldjtrumpjr, 796 times
@realdonaldtrump, 754 times
#trump, 748 times
many, 744 times
mean, 671 times
#trumpcrimefamily, 670 times
let, 646 times
problem, 599 times
new, 585 times
social, 567 times
#winblue, 549 times
create, 539 times
con, 532 times
#impeachtrump, 528 times
old, 513 times
wrong, 499 times
@sensanders, 475 times
resignation, 474 times
job, 464 times
tonight, 436 times
dont, 433 times
#theresistance, 430 times
thing, 425 times
july, 424 times
bring, 422 times
#women, 416 times
reach, 401 times
complicit, 394 times
trump, 394 times
volunteer, 382 times
@gop, 378 times
hope, 377 times
guess, 375 times
#liarinchief, 366 times
#democrats, 359 times
chance, 344 times
trust, 335 times
follow, 332 times
give, 332 times
even, 330 times
@repadamschiff, 324 times
#p2, 318 times
reply, 310 times
really, 310 times
thank, 306 times


### Train Models

In [23]:
house_tfidf = models.TfidfModel(house_bow_corpus)
house_corpus_tfidf = house_tfidf[house_bow_corpus]

In [24]:
house_lda_model = models.LdaMulticore(house_bow_corpus, num_topics=10, id2word=house_dict, passes=2, workers=2)
house_lda_model_tfidf = models.LdaMulticore(house_corpus_tfidf, num_topics=10, id2word=house_dict, passes=2, workers=4)

### Print Categories

In [25]:
for idx, topic in house_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.021*"amp" + 0.014*"work" + 0.013*"today" + 0.009*"good" + 0.009*"make" + 0.008*"time" + 0.008*"one" + 0.008*"great" + 0.008*"tax" + 0.007*"call"
Topic: 1 
Words: 0.022*"amp" + 0.020*"people" + 0.013*"need" + 0.011*"go" + 0.011*"im" + 0.011*"family" + 0.008*"work" + 0.008*"campaign" + 0.008*"must" + 0.008*"continue"
Topic: 2 
Words: 0.019*"need" + 0.014*"get" + 0.011*"time" + 0.011*"day" + 0.011*"election" + 0.009*"today" + 0.009*"trump" + 0.008*"thank" + 0.008*"make" + 0.008*"work"
Topic: 3 
Words: 0.034*"thank" + 0.020*"vote" + 0.011*"help" + 0.010*"today" + 0.010*"get" + 0.009*"day" + 0.009*"support" + 0.008*"take" + 0.008*"go" + 0.008*"time"
Topic: 4 
Words: 0.021*"great" + 0.017*"work" + 0.010*"thanks" + 0.010*"get" + 0.009*"right" + 0.009*"support" + 0.008*"american" + 0.007*"president" + 0.007*"amp" + 0.007*"woman"
Topic: 5 
Words: 0.013*"happy" + 0.010*"day" + 0.008*"make" + 0.008*"vote" + 0.008*"get" + 0.007*"district" + 0.007*"today" + 0.007*"president" + 0.

In [26]:
for idx, topic in house_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.014*"thank" + 0.006*"year" + 0.005*"great" + 0.005*"love" + 0.005*"proud" + 0.005*"morning" + 0.005*"community" + 0.005*"go" + 0.004*"today" + 0.004*"say"
Topic: 1 
Words: 0.008*"get" + 0.007*"make" + 0.007*"vote" + 0.006*"sure" + 0.006*"happy" + 0.005*"new" + 0.005*"right" + 0.005*"go" + 0.005*"thank" + 0.004*"great"
Topic: 2 
Words: 0.016*"thanks" + 0.007*"get" + 0.006*"need" + 0.006*"amp" + 0.005*"president" + 0.005*"help" + 0.005*"great" + 0.005*"family" + 0.005*"never" + 0.004*"state"
Topic: 3 
Words: 0.009*"thank" + 0.006*"get" + 0.006*"support" + 0.006*"people" + 0.006*"like" + 0.006*"let" + 0.005*"much" + 0.005*"join" + 0.005*"work" + 0.005*"woman"
Topic: 4 
Words: 0.007*"@realdonaldtrump" + 0.007*"year" + 0.006*"right" + 0.006*"im" + 0.005*"make" + 0.005*"amp" + 0.005*"vote" + 0.005*"read" + 0.005*"last" + 0.004*"congress"
Topic: 5 
Words: 0.034*"thank" + 0.008*"today" + 0.008*"support" + 0.007*"good" + 0.007*"vote" + 0.006*"family" + 0.006*"work" + 0.005*"a