# Preliminary "Base" Topic Modeling
Citation:
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
from src.db_client import DBClient
import pandas as pd
from gensim import corpora, models
import ast

  """)


In [2]:
db = DBClient(secrets_path = "configs/db_secrets.json")

Connected to political tweets DB


In [3]:
data_query = """
with random_tweets as (
    select tweet_text_clean, Random() from staging.{}
    where tweet_date between '2018-01-01' and '2019-01-01'
    order by Random()
    limit 10000)
select tweet_text_clean 
from random_tweets;
"""

In [75]:
tax_query = """
select tweet_text_clean from staging.{}
where tweet_date between '2018-01-01' and '2019-01-01'
and tweet_text_clean like '%health%'
limit 10000  
"""

In [3]:
def get_word_counts(bow_corpus):
    counts = {}
    for bow in bow_corpus:
        for word in bow:
            if word[0] not in counts.keys():
                counts[word[0]] = 0
            counts[word[0]] += word[1]
    return [(k, counts[k]) for k in sorted(counts, key=counts.get, reverse=True)]

def print_word_counts(word_counts, num_words):
    for tup in word_counts[0:num_words]:
        print(f"{dem_dict[tup[0]]}, {tup[1]} times")

## Democrats

In [76]:
dem_tax_tweets = pd.DataFrame(db.read(tax_query.format("democrat")))

In [77]:
dem_tax_tweets

Unnamed: 0,0
0,"['omg', '#barbaric', 'amp', '#fiendish', '#cor..."
1,"['@housedemocrats', '@gop', 'go', 'stop', 'pot..."
2,"['@tedlieu', '@speakerryan', 'might', 'nice', ..."
3,"['#democraticagenda', 'affordable', 'healthcar..."
4,"['@foxnews', '@gregabbotttx', 'stop', 'take', ..."
5,"['@pelucachick46', '@tomperez', '@nancypelosi'..."
6,"['healthcare', 'immigration', 'tax', 'reform',..."
7,"['@realdonaldtrump', '@momsdemand', '@corapunz..."
8,"['@realdonaldtrump', 'hey', '#oh12', '#gotv', ..."
9,"['im', 'sure', 'im', 'happy', 'result', 'study..."


### Preparation

In [None]:
dem_tweets = pd.DataFrame(db.read(tax_query.format("democrat")))
dem_docs = [ast.literal_eval(doc) for doc in  dem_tweets[0].tolist()]
dem_dict = corpora.Dictionary(dem_docs)

In [None]:
# Alter no_above to filter out frequently occuring words
dem_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)
dem_bow_corpus = [dem_dict.doc2bow(doc) for doc in dem_docs]

### Top Word Counts

In [None]:
dem_counts = get_word_counts(dem_bow_corpus)
print_word_counts(dem_counts, 50)

### Train Models

In [None]:
dem_tfidf = models.TfidfModel(dem_bow_corpus)
dem_corpus_tfidf = dem_tfidf[dem_bow_corpus]

In [None]:
dem_lda_model = models.LdaMulticore(dem_bow_corpus, num_topics=10, id2word=dem_dict, passes=2, workers=2)
dem_lda_model_tfidf = models.LdaMulticore(dem_corpus_tfidf, num_topics=10, id2word=dem_dict, passes=2, workers=4)

### Print Categories

In [None]:
for idx, topic in dem_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [105]:
for idx, topic in dem_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.045*"#voteblue2018" + 0.039*"#bluewave2018" + 0.037*"#healthcarevoter" + 0.025*"#democrats" + 0.023*"#atomicveterans" + 0.023*"#enewetak" + 0.023*"parity" + 0.023*"#gop" + 0.022*"#potus" + 0.021*"#healthcare"
Topic: 1 
Words: 0.024*"healthy" + 0.024*"healthcare" + 0.021*"right" + 0.020*"protect" + 0.019*"people" + 0.019*"#voteblue" + 0.019*"take" + 0.017*"amp" + 0.016*"#resist" + 0.016*"kill"
Topic: 2 
Words: 0.041*"#democrats" + 0.027*"#healthcare" + 0.025*"#voteblue" + 0.025*"healthcare" + 0.024*"#votebluetosaveamerica" + 0.020*"care" + 0.020*"democrat" + 0.019*"need" + 0.018*"must" + 0.016*"like"
Topic: 3 
Words: 0.038*"#voteblue" + 0.030*"healthcare" + 0.019*"woman" + 0.018*"amp" + 0.016*"#womenshealth" + 0.016*"country" + 0.015*"#metoo" + 0.015*"#flipitblue" + 0.015*"#resist" + 0.015*"issue"
Topic: 4 
Words: 0.058*"#resist" + 0.032*"#health" + 0.029*"health" + 0.025*"care" + 0.024*"#bluewave" + 0.022*"#healthcare" + 0.020*"#voteblue" + 0.019*"#maga" + 0.018*"#re

## Republicans

### Preparation

In [106]:
rep_tweets = pd.DataFrame(db.read(tax_query.format("republican")))
rep_docs = [ast.literal_eval(doc) for doc in  rep_tweets[0].tolist()]
rep_dict = corpora.Dictionary(dem_docs)

In [107]:
# Adjust no_above to filter frequently occuring words
rep_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)

In [108]:
rep_bow_corpus = [rep_dict.doc2bow(doc) for doc in rep_docs]

### Top Word Counts

In [109]:
rep_counts = get_word_counts(rep_bow_corpus)
print_word_counts(rep_counts, 50)

#maga, 277 times
health, 243 times
#trump, 201 times
amp, 169 times
healthcare, 129 times
#gop, 118 times
#healthcare, 104 times
care, 85 times
@realdonaldtrump, 84 times
trump, 68 times
mental, 67 times
get, 59 times
people, 58 times
make, 56 times
dont, 53 times
need, 52 times
vote, 47 times
want, 45 times
tax, 45 times
take, 44 times
go, 42 times
gun, 40 times
#health, 39 times
insurance, 38 times
give, 37 times
right, 36 times
like, 36 times
america, 35 times
back, 34 times
good, 34 times
cut, 33 times
pay, 33 times
issue, 32 times
say, 32 times
would, 32 times
@gop, 30 times
healthy, 29 times
american, 29 times
child, 28 times
one, 28 times
let, 28 times
please, 27 times
know, 27 times
work, 26 times
country, 25 times
talk, 24 times
stop, 24 times
job, 24 times
time, 23 times
try, 23 times


### Train Models

In [110]:
rep_tfidf = models.TfidfModel(rep_bow_corpus)
rep_corpus_tfidf = rep_tfidf[rep_bow_corpus]

In [111]:
rep_lda_model = models.LdaMulticore(rep_bow_corpus, num_topics=10, id2word=rep_dict, passes=2, workers=2)
rep_lda_model_tfidf = models.LdaMulticore(rep_corpus_tfidf, num_topics=10, id2word=rep_dict, passes=2, workers=4)

### Print Categories

In [112]:
for idx, topic in rep_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.087*"#trump" + 0.077*"health" + 0.058*"mental" + 0.057*"#maga" + 0.049*"amp" + 0.035*"#health" + 0.033*"gun" + 0.031*"talk" + 0.022*"trump" + 0.019*"well"
Topic: 1 
Words: 0.111*"health" + 0.097*"#maga" + 0.033*"make" + 0.029*"mental" + 0.028*"get" + 0.023*"insurance" + 0.023*"amp" + 0.021*"@realdonaldtrump" + 0.021*"#gop" + 0.021*"dont"
Topic: 2 
Words: 0.187*"#maga" + 0.067*"healthcare" + 0.043*"like" + 0.038*"back" + 0.030*"need" + 0.030*"@realdonaldtrump" + 0.026*"#healthcare" + 0.021*"vote" + 0.019*"make" + 0.018*"trump"
Topic: 3 
Words: 0.065*"amp" + 0.062*"#maga" + 0.057*"health" + 0.041*"#gop" + 0.030*"care" + 0.030*"@realdonaldtrump" + 0.028*"healthcare" + 0.028*"healthy" + 0.021*"tax" + 0.019*"give"
Topic: 4 
Words: 0.079*"amp" + 0.062*"#healthcare" + 0.060*"#trump" + 0.033*"#gop" + 0.023*"take" + 0.023*"#medicare" + 0.022*"vote" + 0.021*"healthcare" + 0.018*"#socialsecurity" + 0.018*"#aca"
Topic: 5 
Words: 0.080*"healthcare" + 0.066*"#trump" + 0.051*"#gop"

In [113]:
for idx, topic in rep_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.102*"#health" + 0.036*"#trump" + 0.032*"#maga" + 0.031*"need" + 0.027*"want" + 0.026*"people" + 0.026*"think" + 0.023*"#education" + 0.022*"#environment" + 0.020*"state"
Topic: 1 
Words: 0.070*"mental" + 0.046*"health" + 0.028*"#trump" + 0.027*"gun" + 0.027*"trump" + 0.023*"say" + 0.022*"tax" + 0.020*"country" + 0.018*"people" + 0.018*"care"
Topic: 2 
Words: 0.044*"#mentalhealth" + 0.031*"#trump" + 0.025*"want" + 0.024*"via" + 0.024*"health" + 0.023*"amp" + 0.022*"#obamacare" + 0.022*"#gop" + 0.021*"talk" + 0.020*"#healthcare"
Topic: 3 
Words: 0.071*"#maga" + 0.057*"#trump" + 0.049*"@realdonaldtrump" + 0.048*"back" + 0.032*"trump" + 0.025*"get" + 0.023*"health" + 0.022*"vote" + 0.022*"#potus" + 0.021*"#healthcare"
Topic: 4 
Words: 0.043*"care" + 0.038*"would" + 0.037*"thanks" + 0.029*"work" + 0.028*"healthcare" + 0.028*"#maga" + 0.026*"control" + 0.026*"make" + 0.025*"health" + 0.023*"people"
Topic: 5 
Words: 0.049*"#healthcare" + 0.034*"amp" + 0.030*"#trump" + 0.027

# House

In [93]:
house_tweets = pd.DataFrame(db.read(tax_query.format("house")))
house_docs = [ast.literal_eval(doc) for doc in  house_tweets[0].tolist()]
house_dict = corpora.Dictionary(house_docs)

In [94]:
house_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)
house_bow_corpus = [house_dict.doc2bow(doc) for doc in house_docs]

### Top Word Counts

In [95]:
house_counts = get_word_counts(house_bow_corpus)
print_word_counts(house_counts, 50)

save, 5470 times
amp, 3821 times
stop, 3043 times
family, 2365 times
#trump, 1363 times
protect, 1330 times
#vote, 1228 times
voter, 1154 times
healthcare, 1122 times
#gotv, 1106 times
#immigration, 1086 times


KeyError: 336

### Train Models

In [96]:
house_tfidf = models.TfidfModel(house_bow_corpus)
house_corpus_tfidf = house_tfidf[house_bow_corpus]

In [97]:
house_lda_model = models.LdaMulticore(house_bow_corpus, num_topics=10, id2word=house_dict, passes=2, workers=2)
house_lda_model_tfidf = models.LdaMulticore(house_corpus_tfidf, num_topics=10, id2word=house_dict, passes=2, workers=4)

### Print Categories

In [98]:
for idx, topic in house_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.033*"healthcare" + 0.032*"amp" + 0.018*"health" + 0.017*"woman" + 0.013*"people" + 0.011*"family" + 0.011*"stand" + 0.010*"right" + 0.009*"care" + 0.009*"insurance"
Topic: 1 
Words: 0.030*"healthcare" + 0.021*"health" + 0.017*"people" + 0.013*"care" + 0.012*"work" + 0.011*"get" + 0.010*"congress" + 0.010*"vote" + 0.008*"take" + 0.008*"want"
Topic: 2 
Words: 0.066*"health" + 0.034*"care" + 0.014*"need" + 0.013*"insurance" + 0.011*"affordable" + 0.011*"american" + 0.009*"healthcare" + 0.009*"trump" + 0.008*"people" + 0.008*"coverage"
Topic: 3 
Words: 0.028*"health" + 0.027*"healthcare" + 0.019*"care" + 0.011*"american" + 0.011*"fight" + 0.009*"family" + 0.009*"amp" + 0.009*"affordable" + 0.008*"job" + 0.008*"make"
Topic: 4 
Words: 0.023*"health" + 0.020*"amp" + 0.020*"care" + 0.017*"healthcare" + 0.016*"vote" + 0.010*"support" + 0.010*"access" + 0.009*"healthy" + 0.009*"take" + 0.009*"help"
Topic: 5 
Words: 0.031*"health" + 0.023*"healthcare" + 0.017*"right" + 0.017*"c

In [99]:
for idx, topic in house_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"healthcare" + 0.006*"right" + 0.006*"health" + 0.006*"care" + 0.005*"woman" + 0.005*"get" + 0.005*"amp" + 0.004*"affordable" + 0.004*"people" + 0.004*"insurance"
Topic: 1 
Words: 0.008*"healthcare" + 0.007*"care" + 0.006*"make" + 0.006*"amp" + 0.006*"health" + 0.006*"vote" + 0.006*"work" + 0.005*"support" + 0.005*"congress" + 0.005*"affordable"
Topic: 2 
Words: 0.007*"care" + 0.007*"health" + 0.006*"amp" + 0.006*"healthcare" + 0.005*"work" + 0.005*"american" + 0.005*"woman" + 0.005*"fight" + 0.004*"need" + 0.004*"vote"
Topic: 3 
Words: 0.008*"healthcare" + 0.007*"care" + 0.006*"health" + 0.006*"amp" + 0.005*"condition" + 0.005*"work" + 0.005*"family" + 0.005*"need" + 0.005*"support" + 0.005*"american"
Topic: 4 
Words: 0.007*"amp" + 0.006*"healthcare" + 0.006*"care" + 0.006*"health" + 0.005*"vote" + 0.005*"insurance" + 0.004*"need" + 0.004*"make" + 0.004*"everyone" + 0.004*"work"
Topic: 5 
Words: 0.006*"amp" + 0.006*"healthcare" + 0.006*"health" + 0.005*"care" + 

### Remove Selection Hashtags

In [4]:
data_query = """
with random_tweets as (
    select tweet_text_clean, Random() from staging.{}
    where tweet_date between '2018-01-01' and '2019-01-01'
    order by Random()
    limit 10000)
select tweet_text_clean 
from random_tweets;
"""

In [6]:
dem_tweets = pd.DataFrame(db.read(data_query.format("democrat_select")))
dem_docs = [ast.literal_eval(doc) for doc in  dem_tweets[0].tolist()]
dem_dict = corpora.Dictionary(dem_docs)

In [9]:
# Alter no_above to filter out frequently occuring words
dem_dict.filter_extremes(no_below=15, no_above=.8, keep_n=10000)
dem_bow_corpus = [dem_dict.doc2bow(doc) for doc in dem_docs]

### Top Word Counts

In [10]:
dem_counts = get_word_counts(dem_bow_corpus)
print_word_counts(dem_counts, 50)

vote, 1033 times
amp, 998 times
@realdonaldtrump, 955 times
trump, 899 times
get, 719 times
#theresistance, 585 times
like, 565 times
go, 556 times
#maga, 535 times
people, 489 times
dont, 451 times
make, 422 times
#votethemout, 419 times
follow, 409 times
#trump, 408 times
need, 401 times
know, 396 times
one, 381 times
say, 361 times
time, 359 times
#fbr, 353 times
take, 352 times
let, 343 times
want, 339 times
#impeachtrump, 321 times
back, 302 times
right, 299 times
please, 289 times
see, 288 times
think, 287 times
im, 282 times
come, 279 times
good, 277 times
america, 267 times
@gop, 264 times
day, 262 times
would, 246 times
democrat, 245 times
lie, 243 times
#vote, 242 times
country, 241 times
via, 240 times
must, 234 times
#trumprussia, 234 times
work, 229 times
president, 226 times
gop, 224 times
american, 222 times
republican, 218 times
stop, 211 times


### Train Models

In [11]:
dem_tfidf = models.TfidfModel(dem_bow_corpus)
dem_corpus_tfidf = dem_tfidf[dem_bow_corpus]

In [12]:
dem_lda_model = models.LdaMulticore(dem_bow_corpus, num_topics=10, id2word=dem_dict, passes=2, workers=2)
dem_lda_model_tfidf = models.LdaMulticore(dem_corpus_tfidf, num_topics=10, id2word=dem_dict, passes=2, workers=4)

In [15]:
dem_lda_model_20 = models.LdaMulticore(dem_bow_corpus, num_topics=20, id2word=dem_dict, passes=20, workers=2)
dem_lda_model_tfidf_20 = models.LdaMulticore(dem_corpus_tfidf, num_topics=20, id2word=dem_dict, passes=20, workers=4)

In [18]:
dem_lda_model_20 = models.LdaMulticore(dem_bow_corpus, num_topics=20, id2word=dem_dict, passes=40, workers=2)
dem_lda_model_tfidf_20 = models.LdaMulticore(dem_corpus_tfidf, num_topics=20, id2word=dem_dict, passes=40, workers=4)

In [13]:
for idx, topic in dem_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.012*"blue" + 0.010*"@realdonaldtrump" + 0.009*"#electionday" + 0.009*"vote" + 0.009*"#trump" + 0.008*"#votedem" + 0.007*"wave" + 0.007*"#maga" + 0.006*"go" + 0.006*"need"
Topic: 1 
Words: 0.011*"please" + 0.009*"trump" + 0.007*"right" + 0.007*"amp" + 0.007*"vote" + 0.007*"like" + 0.006*"im" + 0.006*"retweet" + 0.006*"know" + 0.006*"#stopkavanaugh"
Topic: 2 
Words: 0.020*"@realdonaldtrump" + 0.011*"like" + 0.010*"trump" + 0.007*"go" + 0.006*"amp" + 0.006*"let" + 0.005*"#fbrparty" + 0.005*"think" + 0.005*"send" + 0.005*"already"
Topic: 3 
Words: 0.010*"go" + 0.008*"trump" + 0.007*"#bluetsunami" + 0.007*"dont" + 0.007*"gop" + 0.007*"#theresistance" + 0.006*"party" + 0.006*"tweet" + 0.005*"@realdonaldtrump" + 0.005*"get"
Topic: 4 
Words: 0.026*"#theresistance" + 0.009*"good" + 0.009*"#maga" + 0.007*"#resisters" + 0.007*"#trumprussia" + 0.007*"#trump" + 0.007*"#impeachtrump" + 0.006*"amp" + 0.006*"vote" + 0.006*"day"
Topic: 5 
Words: 0.026*"letter" + 0.026*"#resistbot" + 

In [14]:
for idx, topic in dem_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"@realdonaldtrump" + 0.016*"let" + 0.015*"via" + 0.015*"trump" + 0.015*"must" + 0.012*"know" + 0.011*"@credomobile" + 0.009*"go" + 0.008*"amp" + 0.008*"#maga"
Topic: 1 
Words: 0.032*"@realdonaldtrump" + 0.021*"#theresistance" + 0.021*"open" + 0.020*"letter" + 0.019*"#resistbot" + 0.018*"amp" + 0.015*"@gop" + 0.010*"#maga" + 0.010*"say" + 0.010*"de"
Topic: 2 
Words: 0.017*"#maga" + 0.015*"trump" + 0.013*"#trumpresign" + 0.013*"#theresistance" + 0.012*"another" + 0.010*"get" + 0.009*"good" + 0.008*"like" + 0.008*"need" + 0.008*"party"
Topic: 3 
Words: 0.018*"get" + 0.017*"#theresistance" + 0.011*"go" + 0.010*"#vote" + 0.010*"@realdonaldtrump" + 0.009*"take" + 0.008*"#maga" + 0.008*"make" + 0.008*"vote" + 0.007*"man"
Topic: 4 
Words: 0.019*"follow" + 0.016*"#fbr" + 0.016*"trump" + 0.012*"@realdonaldtrump" + 0.012*"make" + 0.011*"time" + 0.011*"let" + 0.010*"like" + 0.009*"dont" + 0.009*"back"
Topic: 5 
Words: 0.021*"go" + 0.019*"want" + 0.017*"@realdonaldtrump" + 0.

In [19]:
for idx, topic in dem_lda_model_tfidf_20.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.029*"#vote" + 0.016*"block" + 0.014*"voter" + 0.013*"talk" + 0.013*"fl" + 0.012*"vote" + 0.012*"#registertovote" + 0.012*"racist" + 0.011*"white" + 0.010*"murder"
Topic: 1 
Words: 0.018*"thank" + 0.016*"much" + 0.015*"trump" + 0.013*"mean" + 0.013*"ready" + 0.012*"vote" + 0.011*"get" + 0.010*"count" + 0.010*"go" + 0.009*"god"
Topic: 2 
Words: 0.049*"#votethemout" + 0.039*"#fbr" + 0.039*"#fbrparty" + 0.023*"#bluetsunami" + 0.021*"#neveragain" + 0.019*"#protectmueller" + 0.017*"#marchforourlives" + 0.014*"great" + 0.012*"fellow" + 0.011*"name"
Topic: 3 
Words: 0.064*"#theresistance" + 0.026*"#trumprussia" + 0.022*"#impeachtrump" + 0.016*"#trumptreason" + 0.016*"love" + 0.015*"#traitortrump" + 0.014*"#theresistance2018" + 0.014*"#muellertime" + 0.013*"#resisters" + 0.012*"#trumpcrimefamily"
Topic: 4 
Words: 0.042*"@realdonaldtrump" + 0.017*"late" + 0.013*"want" + 0.013*"youre" + 0.013*"true" + 0.012*"wait" + 0.012*"money" + 0.011*"speak" + 0.011*"#familiesbelongtogether

In [17]:
for idx, topic in dem_lda_model_20.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.034*"trump" + 0.032*"great" + 0.018*"house" + 0.017*"#electionday" + 0.017*"amp" + 0.015*"time" + 0.014*"senate" + 0.013*"investigation" + 0.012*"get" + 0.011*"thats"
Topic: 1 
Words: 0.033*"#trump" + 0.026*"#maga" + 0.026*"watch" + 0.026*"@cnn" + 0.022*"#midterms2018" + 0.022*"#qanon" + 0.022*"#impeachtrump" + 0.017*"know" + 0.016*"speak" + 0.015*"trump"
Topic: 2 
Words: 0.146*"@realdonaldtrump" + 0.070*"@gop" + 0.034*"@potus" + 0.027*"president" + 0.020*"@foxnews" + 0.019*"@senategop" + 0.019*"@housegop" + 0.016*"welcome" + 0.015*"@speakerryan" + 0.015*"@senatemajldr"
Topic: 3 
Words: 0.119*"#maga" + 0.049*"#trump" + 0.047*"#gop" + 0.024*"@realdonaldtrump" + 0.022*"#republicans" + 0.022*"#trumprussia" + 0.017*"#mueller" + 0.016*"#tcot" + 0.015*"#americafirst" + 0.014*"#liberals"
Topic: 4 
Words: 0.040*"trump" + 0.039*"party" + 0.027*"state" + 0.025*"#theresistance" + 0.023*"republican" + 0.023*"gop" + 0.019*"run" + 0.016*"thanks" + 0.015*"candidate" + 0.014*"turn"


### Remove all hashtags

In [20]:
dem_tweets = pd.DataFrame(db.read(data_query.format("democrat_all")))
dem_docs = [ast.literal_eval(doc) for doc in  dem_tweets[0].tolist()]
dem_dict = corpora.Dictionary(dem_docs)

In [21]:
# Alter no_above to filter out frequently occuring words
dem_dict.filter_extremes(no_below=15, no_above=.8, keep_n=10000)
dem_bow_corpus = [dem_dict.doc2bow(doc) for doc in dem_docs]

### Top Word Counts

In [22]:
dem_counts = get_word_counts(dem_bow_corpus)
print_word_counts(dem_counts, 50)

@realdonaldtrump, 1037 times
vote, 946 times
trump, 891 times
get, 769 times
like, 563 times
go, 545 times
people, 501 times
make, 457 times
dont, 438 times
follow, 434 times
need, 427 times
say, 409 times
time, 407 times
know, 398 times
one, 373 times
take, 370 times
want, 349 times
good, 319 times
america, 315 times
let, 313 times
think, 301 times
back, 292 times
please, 287 times
see, 281 times
day, 270 times
im, 269 times
country, 268 times
american, 267 times
right, 266 times
come, 262 times
@gop, 261 times
democrat, 246 times
republican, 245 times
would, 243 times
must, 236 times
president, 235 times
house, 234 times
election, 234 times
work, 234 times
state, 221 times
via, 219 times
lie, 217 times
every, 212 times
keep, 211 times
win, 210 times
help, 208 times
call, 206 times
gop, 202 times
party, 201 times
stop, 199 times


### Train Models

In [23]:
dem_tfidf = models.TfidfModel(dem_bow_corpus)
dem_corpus_tfidf = dem_tfidf[dem_bow_corpus]

In [24]:
dem_lda_model_40_all = models.LdaMulticore(dem_bow_corpus, num_topics=20, id2word=dem_dict, passes=40, workers=2)
dem_lda_model_tfidf_40_all = models.LdaMulticore(dem_corpus_tfidf, num_topics=20, id2word=dem_dict, passes=40, workers=4)

In [25]:
for idx, topic in dem_lda_model_40_all.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.039*"youre" + 0.039*"november" + 0.028*"republican" + 0.026*"wait" + 0.024*"rt" + 0.022*"true" + 0.020*"cant" + 0.019*"candidate" + 0.017*"man" + 0.016*"talk"
Topic: 1 
Words: 0.083*"let" + 0.063*"open" + 0.050*"letter" + 0.037*"yes" + 0.032*"2018" + 0.024*"go" + 0.020*"wall" + 0.017*"hear" + 0.014*"others" + 0.013*"forget"
Topic: 2 
Words: 0.066*"love" + 0.035*"call" + 0.033*"hope" + 0.025*"would" + 0.021*"check" + 0.018*"like" + 0.018*"hold" + 0.018*"ever" + 0.017*"also" + 0.017*"best"
Topic: 3 
Words: 0.044*"trump" + 0.043*"house" + 0.042*"know" + 0.039*"white" + 0.036*"dont" + 0.030*"take" + 0.019*"would" + 0.017*"think" + 0.017*"nothing" + 0.015*"believe"
Topic: 4 
Words: 0.243*"vote" + 0.043*"please" + 0.032*"poll" + 0.026*"get" + 0.021*"dont" + 0.018*"part" + 0.018*"matter" + 0.018*"today" + 0.017*"every" + 0.017*"count"
Topic: 5 
Words: 0.050*"see" + 0.049*"people" + 0.044*"american" + 0.036*"come" + 0.031*"new" + 0.024*"year" + 0.019*"stand" + 0.016*"one" + 

In [26]:
for idx, topic in dem_lda_model_tfidf_40_all.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.147*"@realdonaldtrump" + 0.047*"@potus" + 0.038*"@gop" + 0.023*"rt" + 0.023*"@senatemajldr" + 0.023*"hell" + 0.023*"@foxnews" + 0.019*"hand" + 0.019*"@housegop" + 0.018*"@senategop"
Topic: 1 
Words: 0.049*"thank" + 0.017*"already" + 0.016*"win" + 0.015*"another" + 0.015*"sad" + 0.015*"ok" + 0.014*"blame" + 0.014*"stay" + 0.013*"feed" + 0.013*"@politico"
Topic: 2 
Words: 0.105*"follow" + 0.042*"please" + 0.030*"tweet" + 0.026*"back" + 0.022*"retweet" + 0.021*"like" + 0.019*"hey" + 0.019*"follower" + 0.019*"week" + 0.018*"twitter"
Topic: 3 
Words: 0.017*"want" + 0.016*"law" + 0.016*"sure" + 0.014*"im" + 0.014*"@thedemocrats" + 0.013*"vote" + 0.012*"middle" + 0.012*"@washingtonpost" + 0.011*"liberal" + 0.011*"criminal"
Topic: 4 
Words: 0.042*"good" + 0.022*"fire" + 0.022*"fuck" + 0.019*"would" + 0.018*"lol" + 0.017*"happy" + 0.016*"agree" + 0.016*"read" + 0.016*"presidency" + 0.015*"congratulation"
Topic: 5 
Words: 0.025*"work" + 0.024*"hear" + 0.019*"thread" + 0.017*"r

In [27]:
dem_lda_model_80_all = models.LdaMulticore(dem_bow_corpus, num_topics=40, id2word=dem_dict, passes=80, workers=2)
dem_lda_model_tfidf_80_all = models.LdaMulticore(dem_corpus_tfidf, num_topics=40, id2word=dem_dict, passes=80, workers=4)

In [28]:
for idx, topic in dem_lda_model_tfidf_80_all.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.053*"well" + 0.038*"know" + 0.035*"voter" + 0.029*"could" + 0.027*"immigrant" + 0.026*"fire" + 0.025*"racist" + 0.025*"list" + 0.021*"illegal" + 0.021*"top"
Topic: 1 
Words: 0.050*"real" + 0.046*"hell" + 0.037*"liar" + 0.035*"support" + 0.035*"@betoorourke" + 0.033*"act" + 0.028*"@politico" + 0.026*"wow" + 0.025*"medium" + 0.022*"young"
Topic: 2 
Words: 0.060*"rt" + 0.055*"hear" + 0.037*"@dnc" + 0.036*"@gop" + 0.036*"@donaldjtrumpjr" + 0.034*"continue" + 0.032*"@nra" + 0.028*"chance" + 0.023*"send" + 0.023*"worth"
Topic: 3 
Words: 0.041*"wait" + 0.036*"god" + 0.031*"wave" + 0.031*"obama" + 0.025*"reason" + 0.025*"lose" + 0.025*"red" + 0.024*"gun" + 0.023*"run" + 0.022*"sad"
Topic: 4 
Words: 0.035*"lie" + 0.030*"stand" + 0.027*"shit" + 0.025*"buy" + 0.024*"@funder" + 0.024*"mean" + 0.022*"live" + 0.021*"want" + 0.021*"write" + 0.021*"ask"
Topic: 5 
Words: 0.057*"work" + 0.042*"nothing" + 0.038*"hard" + 0.036*"men" + 0.036*"word" + 0.036*"speak" + 0.029*"push" + 0.024*