# Preliminary "Base" Topic Modeling
Citation:
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
from src.db_client import DBClient
import pandas as pd
from gensim import corpora, models
import ast

In [2]:
db = DBClient(secrets_path = "configs/db_secrets.json")

Connected to political tweets DB


In [3]:
data_query = """
with random_tweets as (
    select tweet_text_clean, Random() from staging.{}
    where tweet_date between '2018-01-01' and '2019-01-01'
    order by Random()
    limit 10000)
select tweet_text_clean 
from random_tweets;
"""

In [75]:
tax_query = """
select tweet_text_clean from staging.{}
where tweet_date between '2018-01-01' and '2019-01-01'
and tweet_text_clean like '%health%'
limit 10000  
"""

In [9]:
def get_word_counts(bow_corpus):
    counts = {}
    for bow in bow_corpus:
        for word in bow:
            if word[0] not in counts.keys():
                counts[word[0]] = 0
            counts[word[0]] += word[1]
    return [(k, counts[k]) for k in sorted(counts, key=counts.get, reverse=True)]

def print_word_counts(word_counts, num_words, word_dict):
    for tup in word_counts[0:num_words]:
        print(f"{word_dict[tup[0]]}, {tup[1]} times")

## Democrats

In [76]:
dem_tax_tweets = pd.DataFrame(db.read(tax_query.format("democrat")))

In [77]:
dem_tax_tweets

Unnamed: 0,0
0,"['omg', '#barbaric', 'amp', '#fiendish', '#cor..."
1,"['@housedemocrats', '@gop', 'go', 'stop', 'pot..."
2,"['@tedlieu', '@speakerryan', 'might', 'nice', ..."
3,"['#democraticagenda', 'affordable', 'healthcar..."
4,"['@foxnews', '@gregabbotttx', 'stop', 'take', ..."
5,"['@pelucachick46', '@tomperez', '@nancypelosi'..."
6,"['healthcare', 'immigration', 'tax', 'reform',..."
7,"['@realdonaldtrump', '@momsdemand', '@corapunz..."
8,"['@realdonaldtrump', 'hey', '#oh12', '#gotv', ..."
9,"['im', 'sure', 'im', 'happy', 'result', 'study..."


### Preparation

In [78]:
dem_tweets = pd.DataFrame(db.read(tax_query.format("democrat")))
dem_docs = [ast.literal_eval(doc) for doc in  dem_tweets[0].tolist()]
dem_dict = corpora.Dictionary(dem_docs)

In [100]:
# Alter no_above to filter out frequently occuring words
dem_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)
dem_bow_corpus = [dem_dict.doc2bow(doc) for doc in dem_docs]

### Top Word Counts

In [101]:
dem_counts = get_word_counts(dem_bow_corpus)
print_word_counts(dem_counts, 50)

healthcare, 355 times
#voteblue, 310 times
health, 283 times
vote, 258 times
amp, 256 times
#resist, 248 times
#healthcare, 202 times
care, 183 times
#bluewave, 172 times
#democrats, 135 times
take, 121 times
#bluewave2018, 104 times
right, 99 times
need, 97 times
get, 96 times
#resistance, 94 times
away, 85 times
people, 84 times
want, 83 times
@realdonaldtrump, 81 times
trump, 80 times
like, 74 times
dont, 73 times
@gop, 72 times
go, 69 times
american, 66 times
gop, 61 times
#trump, 57 times
condition, 56 times
make, 55 times
affordable, 54 times
tax, 54 times
pay, 54 times
save, 53 times
education, 51 times
issue, 50 times
#maga, 49 times
support, 49 times
woman, 49 times
republican, 49 times
#vote, 48 times
security, 48 times
social, 48 times
time, 46 times
good, 46 times
lie, 46 times
#gop, 43 times
let, 43 times
child, 43 times
work, 42 times


### Train Models

In [None]:
dem_tfidf = models.TfidfModel(dem_bow_corpus)
dem_corpus_tfidf = dem_tfidf[dem_bow_corpus]

In [None]:
dem_lda_model = models.LdaMulticore(dem_bow_corpus, num_topics=10, id2word=dem_dict, passes=2, workers=2)
dem_lda_model_tfidf = models.LdaMulticore(dem_corpus_tfidf, num_topics=10, id2word=dem_dict, passes=2, workers=4)

### Print Categories

In [104]:
for idx, topic in dem_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.099*"#voteblue" + 0.081*"healthcare" + 0.033*"save" + 0.028*"right" + 0.028*"security" + 0.027*"vote" + 0.027*"medicare" + 0.026*"social" + 0.020*"republican" + 0.020*"want"
Topic: 1 
Words: 0.066*"health" + 0.058*"care" + 0.049*"#voteblue" + 0.029*"american" + 0.025*"healthcare" + 0.025*"#bluewave" + 0.023*"amp" + 0.020*"right" + 0.020*"need" + 0.020*"people"
Topic: 2 
Words: 0.118*"#resist" + 0.062*"#healthcare" + 0.042*"amp" + 0.041*"#resistance" + 0.040*"#trump" + 0.032*"#maga" + 0.030*"#health" + 0.028*"#democrats" + 0.024*"trump" + 0.022*"#foxnews"
Topic: 3 
Words: 0.073*"health" + 0.038*"care" + 0.034*"#voteblue" + 0.029*"woman" + 0.028*"#bluewave2018" + 0.025*"take" + 0.025*"want" + 0.022*"#resist" + 0.022*"amp" + 0.020*"need"
Topic: 4 
Words: 0.067*"healthcare" + 0.051*"#resist" + 0.039*"#voteblue" + 0.032*"gop" + 0.027*"vote" + 0.025*"amp" + 0.024*"must" + 0.022*"dont" + 0.022*"condition" + 0.021*"#bluewave2018"
Topic: 5 
Words: 0.043*"#healthcare" + 0.033*

In [105]:
for idx, topic in dem_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.045*"#voteblue2018" + 0.039*"#bluewave2018" + 0.037*"#healthcarevoter" + 0.025*"#democrats" + 0.023*"#atomicveterans" + 0.023*"#enewetak" + 0.023*"parity" + 0.023*"#gop" + 0.022*"#potus" + 0.021*"#healthcare"
Topic: 1 
Words: 0.024*"healthy" + 0.024*"healthcare" + 0.021*"right" + 0.020*"protect" + 0.019*"people" + 0.019*"#voteblue" + 0.019*"take" + 0.017*"amp" + 0.016*"#resist" + 0.016*"kill"
Topic: 2 
Words: 0.041*"#democrats" + 0.027*"#healthcare" + 0.025*"#voteblue" + 0.025*"healthcare" + 0.024*"#votebluetosaveamerica" + 0.020*"care" + 0.020*"democrat" + 0.019*"need" + 0.018*"must" + 0.016*"like"
Topic: 3 
Words: 0.038*"#voteblue" + 0.030*"healthcare" + 0.019*"woman" + 0.018*"amp" + 0.016*"#womenshealth" + 0.016*"country" + 0.015*"#metoo" + 0.015*"#flipitblue" + 0.015*"#resist" + 0.015*"issue"
Topic: 4 
Words: 0.058*"#resist" + 0.032*"#health" + 0.029*"health" + 0.025*"care" + 0.024*"#bluewave" + 0.022*"#healthcare" + 0.020*"#voteblue" + 0.019*"#maga" + 0.018*"#re

## Republicans

### Preparation

In [106]:
rep_tweets = pd.DataFrame(db.read(tax_query.format("republican")))
rep_docs = [ast.literal_eval(doc) for doc in  rep_tweets[0].tolist()]
rep_dict = corpora.Dictionary(dem_docs)

In [107]:
# Adjust no_above to filter frequently occuring words
rep_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)

In [108]:
rep_bow_corpus = [rep_dict.doc2bow(doc) for doc in rep_docs]

### Top Word Counts

In [109]:
rep_counts = get_word_counts(rep_bow_corpus)
print_word_counts(rep_counts, 50)

#maga, 277 times
health, 243 times
#trump, 201 times
amp, 169 times
healthcare, 129 times
#gop, 118 times
#healthcare, 104 times
care, 85 times
@realdonaldtrump, 84 times
trump, 68 times
mental, 67 times
get, 59 times
people, 58 times
make, 56 times
dont, 53 times
need, 52 times
vote, 47 times
want, 45 times
tax, 45 times
take, 44 times
go, 42 times
gun, 40 times
#health, 39 times
insurance, 38 times
give, 37 times
right, 36 times
like, 36 times
america, 35 times
back, 34 times
good, 34 times
cut, 33 times
pay, 33 times
issue, 32 times
say, 32 times
would, 32 times
@gop, 30 times
healthy, 29 times
american, 29 times
child, 28 times
one, 28 times
let, 28 times
please, 27 times
know, 27 times
work, 26 times
country, 25 times
talk, 24 times
stop, 24 times
job, 24 times
time, 23 times
try, 23 times


### Train Models

In [110]:
rep_tfidf = models.TfidfModel(rep_bow_corpus)
rep_corpus_tfidf = rep_tfidf[rep_bow_corpus]

In [111]:
rep_lda_model = models.LdaMulticore(rep_bow_corpus, num_topics=10, id2word=rep_dict, passes=2, workers=2)
rep_lda_model_tfidf = models.LdaMulticore(rep_corpus_tfidf, num_topics=10, id2word=rep_dict, passes=2, workers=4)

### Print Categories

In [112]:
for idx, topic in rep_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.087*"#trump" + 0.077*"health" + 0.058*"mental" + 0.057*"#maga" + 0.049*"amp" + 0.035*"#health" + 0.033*"gun" + 0.031*"talk" + 0.022*"trump" + 0.019*"well"
Topic: 1 
Words: 0.111*"health" + 0.097*"#maga" + 0.033*"make" + 0.029*"mental" + 0.028*"get" + 0.023*"insurance" + 0.023*"amp" + 0.021*"@realdonaldtrump" + 0.021*"#gop" + 0.021*"dont"
Topic: 2 
Words: 0.187*"#maga" + 0.067*"healthcare" + 0.043*"like" + 0.038*"back" + 0.030*"need" + 0.030*"@realdonaldtrump" + 0.026*"#healthcare" + 0.021*"vote" + 0.019*"make" + 0.018*"trump"
Topic: 3 
Words: 0.065*"amp" + 0.062*"#maga" + 0.057*"health" + 0.041*"#gop" + 0.030*"care" + 0.030*"@realdonaldtrump" + 0.028*"healthcare" + 0.028*"healthy" + 0.021*"tax" + 0.019*"give"
Topic: 4 
Words: 0.079*"amp" + 0.062*"#healthcare" + 0.060*"#trump" + 0.033*"#gop" + 0.023*"take" + 0.023*"#medicare" + 0.022*"vote" + 0.021*"healthcare" + 0.018*"#socialsecurity" + 0.018*"#aca"
Topic: 5 
Words: 0.080*"healthcare" + 0.066*"#trump" + 0.051*"#gop"

In [113]:
for idx, topic in rep_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.102*"#health" + 0.036*"#trump" + 0.032*"#maga" + 0.031*"need" + 0.027*"want" + 0.026*"people" + 0.026*"think" + 0.023*"#education" + 0.022*"#environment" + 0.020*"state"
Topic: 1 
Words: 0.070*"mental" + 0.046*"health" + 0.028*"#trump" + 0.027*"gun" + 0.027*"trump" + 0.023*"say" + 0.022*"tax" + 0.020*"country" + 0.018*"people" + 0.018*"care"
Topic: 2 
Words: 0.044*"#mentalhealth" + 0.031*"#trump" + 0.025*"want" + 0.024*"via" + 0.024*"health" + 0.023*"amp" + 0.022*"#obamacare" + 0.022*"#gop" + 0.021*"talk" + 0.020*"#healthcare"
Topic: 3 
Words: 0.071*"#maga" + 0.057*"#trump" + 0.049*"@realdonaldtrump" + 0.048*"back" + 0.032*"trump" + 0.025*"get" + 0.023*"health" + 0.022*"vote" + 0.022*"#potus" + 0.021*"#healthcare"
Topic: 4 
Words: 0.043*"care" + 0.038*"would" + 0.037*"thanks" + 0.029*"work" + 0.028*"healthcare" + 0.028*"#maga" + 0.026*"control" + 0.026*"make" + 0.025*"health" + 0.023*"people"
Topic: 5 
Words: 0.049*"#healthcare" + 0.034*"amp" + 0.030*"#trump" + 0.027

# House

In [93]:
house_tweets = pd.DataFrame(db.read(tax_query.format("house")))
house_docs = [ast.literal_eval(doc) for doc in  house_tweets[0].tolist()]
house_dict = corpora.Dictionary(house_docs)

In [94]:
house_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)
house_bow_corpus = [house_dict.doc2bow(doc) for doc in house_docs]

### Top Word Counts

In [95]:
house_counts = get_word_counts(house_bow_corpus)
print_word_counts(house_counts, 50)

save, 5470 times
amp, 3821 times
stop, 3043 times
family, 2365 times
#trump, 1363 times
protect, 1330 times
#vote, 1228 times
voter, 1154 times
healthcare, 1122 times
#gotv, 1106 times
#immigration, 1086 times


KeyError: 336

### Train Models

In [96]:
house_tfidf = models.TfidfModel(house_bow_corpus)
house_corpus_tfidf = house_tfidf[house_bow_corpus]

In [97]:
house_lda_model = models.LdaMulticore(house_bow_corpus, num_topics=10, id2word=house_dict, passes=2, workers=2)
house_lda_model_tfidf = models.LdaMulticore(house_corpus_tfidf, num_topics=10, id2word=house_dict, passes=2, workers=4)

### Print Categories

In [98]:
for idx, topic in house_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.033*"healthcare" + 0.032*"amp" + 0.018*"health" + 0.017*"woman" + 0.013*"people" + 0.011*"family" + 0.011*"stand" + 0.010*"right" + 0.009*"care" + 0.009*"insurance"
Topic: 1 
Words: 0.030*"healthcare" + 0.021*"health" + 0.017*"people" + 0.013*"care" + 0.012*"work" + 0.011*"get" + 0.010*"congress" + 0.010*"vote" + 0.008*"take" + 0.008*"want"
Topic: 2 
Words: 0.066*"health" + 0.034*"care" + 0.014*"need" + 0.013*"insurance" + 0.011*"affordable" + 0.011*"american" + 0.009*"healthcare" + 0.009*"trump" + 0.008*"people" + 0.008*"coverage"
Topic: 3 
Words: 0.028*"health" + 0.027*"healthcare" + 0.019*"care" + 0.011*"american" + 0.011*"fight" + 0.009*"family" + 0.009*"amp" + 0.009*"affordable" + 0.008*"job" + 0.008*"make"
Topic: 4 
Words: 0.023*"health" + 0.020*"amp" + 0.020*"care" + 0.017*"healthcare" + 0.016*"vote" + 0.010*"support" + 0.010*"access" + 0.009*"healthy" + 0.009*"take" + 0.009*"help"
Topic: 5 
Words: 0.031*"health" + 0.023*"healthcare" + 0.017*"right" + 0.017*"c

In [99]:
for idx, topic in house_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"healthcare" + 0.006*"right" + 0.006*"health" + 0.006*"care" + 0.005*"woman" + 0.005*"get" + 0.005*"amp" + 0.004*"affordable" + 0.004*"people" + 0.004*"insurance"
Topic: 1 
Words: 0.008*"healthcare" + 0.007*"care" + 0.006*"make" + 0.006*"amp" + 0.006*"health" + 0.006*"vote" + 0.006*"work" + 0.005*"support" + 0.005*"congress" + 0.005*"affordable"
Topic: 2 
Words: 0.007*"care" + 0.007*"health" + 0.006*"amp" + 0.006*"healthcare" + 0.005*"work" + 0.005*"american" + 0.005*"woman" + 0.005*"fight" + 0.004*"need" + 0.004*"vote"
Topic: 3 
Words: 0.008*"healthcare" + 0.007*"care" + 0.006*"health" + 0.006*"amp" + 0.005*"condition" + 0.005*"work" + 0.005*"family" + 0.005*"need" + 0.005*"support" + 0.005*"american"
Topic: 4 
Words: 0.007*"amp" + 0.006*"healthcare" + 0.006*"care" + 0.006*"health" + 0.005*"vote" + 0.005*"insurance" + 0.004*"need" + 0.004*"make" + 0.004*"everyone" + 0.004*"work"
Topic: 5 
Words: 0.006*"amp" + 0.006*"healthcare" + 0.006*"health" + 0.005*"care" + 

## Democrat Cleaning Tests

In [6]:
dem_tweets_select = pd.DataFrame(db.read(data_query.format("democrat_select")))
dem_docs_select = [ast.literal_eval(doc) for doc in  dem_tweets_select[0].tolist()]
dem_dict_select = corpora.Dictionary(dem_docs_select)

In [7]:
# Alter no_above to filter out frequently occuring words
dem_dict_select.filter_extremes(no_below=15, no_above=1, keep_n=10000)
dem_bow_corpus_select = [dem_dict_select.doc2bow(doc) for doc in dem_docs_select]

In [10]:
dem_counts_select = get_word_counts(dem_bow_corpus_select)
print_word_counts(dem_counts_select, 50, dem_dict_select)

@realdonaldtrump, 1057 times
vote, 977 times
trump, 940 times
amp, 915 times
get, 730 times
#theresistance, 602 times
go, 587 times
like, 545 times
people, 505 times
#maga, 494 times
need, 469 times
make, 443 times
#votethemout, 421 times
#trump, 418 times
dont, 411 times
say, 391 times
time, 381 times
follow, 381 times
#fbr, 364 times
know, 360 times
one, 343 times
want, 336 times
take, 327 times
let, 323 times
right, 323 times
#impeachtrump, 318 times
back, 316 times
good, 299 times
america, 292 times
think, 285 times
democrat, 277 times
would, 275 times
im, 269 times
@gop, 268 times
see, 267 times
come, 266 times
#vote, 262 times
country, 262 times
please, 256 times
day, 250 times
republican, 245 times
american, 239 times
via, 237 times
#trumprussia, 236 times
party, 235 times
work, 234 times
must, 233 times
help, 230 times
election, 227 times
support, 227 times


In [14]:
dem_tfidf_select = models.TfidfModel(dem_bow_corpus_select)
dem_corpus_tfidf_select = dem_tfidf_select[dem_bow_corpus_select]
dem_lda_model = models.LdaMulticore(dem_bow_corpus_select, num_topics=10, id2word=dem_dict_select, passes=2, workers=2)
dem_lda_model_tfidf = models.LdaMulticore(dem_corpus_tfidf_select, num_topics=10, id2word=dem_dict_select, passes=2, workers=4)

