# Preliminary "Base" Topic Modeling
Citation:
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [1]:
from utils.db_client import DBClient
import pandas as pd
from gensim import corpora, models
import ast

In [3]:
db = DBClient(secrets_path = "../configs/db_secrets.json")

I'm trying
Connected to political tweets DB


In [4]:
data_query = """
with random_tweets as (
    select tweet_text_clean, Random() from staging.{}
    where tweet_date between '2018-01-01' and '2019-01-01'
    order by Random()
    limit 10000)
select tweet_text_clean 
from random_tweets;
"""

In [5]:
tax_query = """
select tweet_text_clean from staging.{}
where tweet_date between '2018-01-01' and '2019-01-01'
and tweet_text_clean like '%health%'
limit 10000  
"""

In [6]:
def get_word_counts(bow_corpus):
    counts = {}
    for bow in bow_corpus:
        for word in bow:
            if word[0] not in counts.keys():
                counts[word[0]] = 0
            counts[word[0]] += word[1]
    return [(k, counts[k]) for k in sorted(counts, key=counts.get, reverse=True)]

def print_word_counts(word_counts, num_words, word_dict):
    for tup in word_counts[0:num_words]:
        print(f"{word_dict[tup[0]]}, {tup[1]} times")

## Democrats

In [7]:
dem_tax_tweets = pd.DataFrame(db.read(tax_query.format("democrat")))

In [8]:
dem_tax_tweets

Unnamed: 0,0
0,"['omg', '#barbaric', 'amp', '#fiendish', '#cor..."
1,"['@housedemocrats', '@gop', 'go', 'stop', 'pot..."
2,"['@tedlieu', '@speakerryan', 'might', 'nice', ..."
3,"['#democraticagenda', 'affordable', 'healthcar..."
4,"['@foxnews', '@gregabbotttx', 'stop', 'take', ..."
5,"['@pelucachick46', '@tomperez', '@nancypelosi'..."
6,"['healthcare', 'immigration', 'tax', 'reform',..."
7,"['@realdonaldtrump', '@momsdemand', '@corapunz..."
8,"['@realdonaldtrump', 'hey', '#oh12', '#gotv', ..."
9,"['im', 'sure', 'im', 'happy', 'result', 'study..."


### Preparation

In [9]:
dem_tweets = pd.DataFrame(db.read(tax_query.format("democrat")))
dem_docs = [ast.literal_eval(doc) for doc in  dem_tweets[0].tolist()]
dem_dict = corpora.Dictionary(dem_docs)

In [10]:
# Alter no_above to filter out frequently occuring words
dem_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)
dem_bow_corpus = [dem_dict.doc2bow(doc) for doc in dem_docs]

### Top Word Counts

In [12]:
dem_counts = get_word_counts(dem_bow_corpus)
print_word_counts(dem_counts, 50, dem_dict)

healthcare, 355 times
#voteblue, 310 times
health, 283 times
vote, 258 times
amp, 256 times
#resist, 248 times
#healthcare, 202 times
care, 183 times
#bluewave, 172 times
#democrats, 135 times
take, 121 times
#bluewave2018, 104 times
right, 99 times
need, 97 times
get, 96 times
#resistance, 94 times
away, 85 times
people, 84 times
want, 83 times
@realdonaldtrump, 81 times
trump, 80 times
like, 74 times
dont, 73 times
@gop, 72 times
go, 69 times
american, 66 times
gop, 61 times
#trump, 57 times
condition, 56 times
make, 55 times
affordable, 54 times
tax, 54 times
pay, 54 times
save, 53 times
education, 51 times
issue, 50 times
#maga, 49 times
support, 49 times
woman, 49 times
republican, 49 times
#vote, 48 times
security, 48 times
social, 48 times
time, 46 times
good, 46 times
lie, 46 times
#gop, 43 times
let, 43 times
child, 43 times
work, 42 times


### Train Models

In [13]:
dem_tfidf = models.TfidfModel(dem_bow_corpus)
dem_corpus_tfidf = dem_tfidf[dem_bow_corpus]

In [14]:
dem_lda_model = models.LdaMulticore(dem_bow_corpus, num_topics=10, id2word=dem_dict, passes=2, workers=2)
dem_lda_model_tfidf = models.LdaMulticore(dem_corpus_tfidf, num_topics=10, id2word=dem_dict, passes=2, workers=4)

### Print Categories

In [15]:
for idx, topic in dem_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.086*"care" + 0.053*"right" + 0.045*"health" + 0.043*"#vote" + 0.037*"#bluewave" + 0.037*"#voteblue" + 0.034*"#healthcare" + 0.025*"amp" + 0.024*"issue" + 0.023*"vote"
Topic: 1 
Words: 0.086*"#voteblue" + 0.068*"healthcare" + 0.034*"tax" + 0.032*"#resist" + 0.029*"vote" + 0.021*"amp" + 0.020*"medicare" + 0.019*"health" + 0.018*"@gop" + 0.017*"let"
Topic: 2 
Words: 0.085*"healthcare" + 0.058*"#voteblue" + 0.040*"vote" + 0.022*"health" + 0.020*"#healthcare" + 0.020*"gop" + 0.020*"#resist" + 0.018*"get" + 0.018*"take" + 0.018*"care"
Topic: 3 
Words: 0.067*"amp" + 0.066*"#bluewave" + 0.065*"#healthcare" + 0.027*"#trump" + 0.023*"#womensrights" + 0.023*"#democrats" + 0.023*"#education" + 0.023*"stay" + 0.022*"#environment" + 0.022*"@gop"
Topic: 4 
Words: 0.068*"health" + 0.038*"amp" + 0.035*"take" + 0.030*"care" + 0.027*"#democrats" + 0.026*"away" + 0.024*"healthcare" + 0.023*"#bluewave" + 0.022*"#voteblue" + 0.021*"child"
Topic: 5 
Words: 0.054*"amp" + 0.047*"healthcare" 

In [16]:
for idx, topic in dem_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.038*"healthcare" + 0.022*"#voteblue" + 0.022*"take" + 0.022*"want" + 0.021*"must" + 0.021*"#bluewave2018" + 0.020*"away" + 0.020*"#resist" + 0.018*"health" + 0.016*"trump"
Topic: 1 
Words: 0.045*"#bluewave2018" + 0.031*"#votebluetosaveamerica" + 0.030*"#bluewave" + 0.025*"#resistance" + 0.020*"amp" + 0.019*"know" + 0.019*"child" + 0.018*"people" + 0.016*"strong" + 0.016*"use"
Topic: 2 
Words: 0.043*"vote" + 0.023*"right" + 0.023*"amp" + 0.023*"#vote" + 0.022*"#resistance" + 0.021*"get" + 0.021*"#resist" + 0.020*"trump" + 0.018*"health" + 0.018*"good"
Topic: 3 
Words: 0.022*"#womenshealth" + 0.022*"#healthcare" + 0.021*"health" + 0.020*"gun" + 0.019*"vote" + 0.018*"#gop" + 0.018*"#democrats" + 0.018*"public" + 0.016*"issue" + 0.016*"#voteblue"
Topic: 4 
Words: 0.025*"lie" + 0.025*"#voteblue" + 0.024*"#resist" + 0.024*"care" + 0.022*"access" + 0.022*"need" + 0.021*"vote" + 0.021*"health" + 0.020*"cut" + 0.020*"go"
Topic: 5 
Words: 0.045*"#healthcare" + 0.029*"#democrat

## Republicans

### Preparation

In [17]:
rep_tweets = pd.DataFrame(db.read(tax_query.format("republican")))
rep_docs = [ast.literal_eval(doc) for doc in  rep_tweets[0].tolist()]
rep_dict = corpora.Dictionary(rep_docs)

In [18]:
# Adjust no_above to filter frequently occuring words
rep_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)

In [19]:
rep_bow_corpus = [rep_dict.doc2bow(doc) for doc in rep_docs]

### Top Word Counts

In [20]:
rep_counts = get_word_counts(rep_bow_corpus)
print_word_counts(rep_counts, 50, rep_dict)

#maga, 277 times
health, 243 times
#trump, 201 times
healthcare, 129 times
#gop, 118 times
#healthcare, 104 times
care, 85 times
@realdonaldtrump, 84 times
trump, 68 times
mental, 67 times
get, 59 times
people, 58 times
make, 56 times
dont, 53 times
need, 52 times
vote, 47 times
want, 45 times
tax, 45 times
take, 44 times
go, 42 times
gun, 40 times
#health, 39 times
follow, 38 times
insurance, 38 times
give, 37 times
right, 36 times
like, 36 times
america, 35 times
back, 34 times
good, 34 times
cut, 33 times
pay, 33 times
issue, 32 times
say, 32 times
would, 32 times
@gop, 30 times
@potus, 29 times
healthy, 29 times
american, 29 times
child, 28 times
one, 28 times
let, 28 times
please, 27 times
school, 27 times
know, 27 times
president, 26 times
work, 26 times
country, 25 times
talk, 24 times
stop, 24 times


### Train Models

In [21]:
rep_tfidf = models.TfidfModel(rep_bow_corpus)
rep_corpus_tfidf = rep_tfidf[rep_bow_corpus]

In [22]:
rep_lda_model = models.LdaMulticore(rep_bow_corpus, num_topics=10, id2word=rep_dict, passes=2, workers=2)
rep_lda_model_tfidf = models.LdaMulticore(rep_corpus_tfidf, num_topics=10, id2word=rep_dict, passes=2, workers=4)

### Print Categories

In [23]:
for idx, topic in rep_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.219*"#maga" + 0.083*"health" + 0.036*"follow" + 0.030*"mental" + 0.023*"get" + 0.023*"back" + 0.022*"gun" + 0.022*"thanks" + 0.022*"@realdonaldtrump" + 0.021*"school"
Topic: 1 
Words: 0.119*"health" + 0.061*"#trump" + 0.057*"need" + 0.047*"#gop" + 0.046*"insurance" + 0.040*"@realdonaldtrump" + 0.036*"#maga" + 0.032*"go" + 0.028*"people" + 0.026*"care"
Topic: 2 
Words: 0.079*"healthcare" + 0.055*"health" + 0.045*"child" + 0.040*"vote" + 0.039*"#maga" + 0.037*"good" + 0.033*"#gop" + 0.031*"one" + 0.029*"@gop" + 0.026*"dont"
Topic: 3 
Words: 0.155*"#trump" + 0.118*"#healthcare" + 0.049*"#gop" + 0.040*"action" + 0.036*"#environment" + 0.028*"@gop" + 0.024*"#health" + 0.022*"healthcare" + 0.021*"vote" + 0.021*"make"
Topic: 4 
Words: 0.094*"#trump" + 0.066*"#healthcare" + 0.064*"try" + 0.059*"get" + 0.048*"people" + 0.047*"go" + 0.043*"take" + 0.041*"cut" + 0.035*"healthcare" + 0.034*"let"
Topic: 5 
Words: 0.096*"health" + 0.087*"#gop" + 0.077*"mental" + 0.060*"#trump" + 0

In [24]:
for idx, topic in rep_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.075*"#mentalhealth" + 0.058*"#trump" + 0.047*"want" + 0.042*"gun" + 0.038*"health" + 0.038*"one" + 0.029*"school" + 0.025*"#gop" + 0.025*"mental" + 0.024*"follow"
Topic: 1 
Words: 0.051*"insurance" + 0.050*"need" + 0.039*"try" + 0.035*"right" + 0.033*"make" + 0.031*"people" + 0.031*"#trump" + 0.030*"health" + 0.030*"@realdonaldtrump" + 0.029*"healthy"
Topic: 2 
Words: 0.038*"back" + 0.036*"make" + 0.036*"#maga" + 0.028*"keep" + 0.027*"health" + 0.026*"need" + 0.026*"america" + 0.024*"see" + 0.024*"great" + 0.024*"@realdonaldtrump"
Topic: 3 
Words: 0.077*"#healthcare" + 0.051*"#trump" + 0.041*"#gop" + 0.038*"would" + 0.037*"#maga" + 0.033*"issue" + 0.033*"vote" + 0.032*"please" + 0.029*"follow" + 0.027*"people"
Topic: 4 
Words: 0.055*"get" + 0.054*"every" + 0.049*"healthcare" + 0.039*"like" + 0.039*"even" + 0.036*"life" + 0.032*"#maga" + 0.029*"say" + 0.027*"stop" + 0.027*"insurance"
Topic: 5 
Words: 0.121*"#maga" + 0.101*"health" + 0.060*"mental" + 0.047*"#trump" + 0

# House

In [25]:
house_tweets = pd.DataFrame(db.read(tax_query.format("house")))
house_docs = [ast.literal_eval(doc) for doc in  house_tweets[0].tolist()]
house_dict = corpora.Dictionary(house_docs)

In [26]:
house_dict.filter_extremes(no_below=15, no_above=1, keep_n=10000)
house_bow_corpus = [house_dict.doc2bow(doc) for doc in house_docs]

### Top Word Counts

In [27]:
house_counts = get_word_counts(house_bow_corpus)
print_word_counts(house_counts, 50, house_dict)

health, 5470 times
healthcare, 3821 times
care, 3043 times
amp, 2365 times
need, 1363 times
people, 1330 times
work, 1228 times
fight, 1154 times
american, 1122 times
affordable, 1106 times
make, 1086 times
vote, 1073 times
family, 1045 times
access, 997 times
right, 867 times
congress, 863 times
insurance, 827 times
get, 819 times
woman, 797 times
support, 794 times
protect, 794 times
im, 755 times
today, 715 times
condition, 708 times
take, 698 times
community, 681 times
cost, 675 times
time, 653 times
help, 645 times
public, 618 times
year, 608 times
good, 601 times
education, 596 times
healthy, 587 times
million, 584 times
issue, 578 times
must, 567 times
coverage, 551 times
system, 548 times
one, 547 times
like, 545 times
trump, 543 times
plan, 532 times
job, 532 times
new, 520 times
quality, 513 times
would, 502 times
want, 490 times
mental, 474 times
tax, 472 times


### Train Models

In [28]:
house_tfidf = models.TfidfModel(house_bow_corpus)
house_corpus_tfidf = house_tfidf[house_bow_corpus]

In [29]:
house_lda_model = models.LdaMulticore(house_bow_corpus, num_topics=10, id2word=house_dict, passes=2, workers=2)
house_lda_model_tfidf = models.LdaMulticore(house_corpus_tfidf, num_topics=10, id2word=house_dict, passes=2, workers=4)

### Print Categories

In [30]:
for idx, topic in house_lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.043*"healthcare" + 0.013*"people" + 0.012*"amp" + 0.012*"health" + 0.010*"run" + 0.009*"congress" + 0.009*"security" + 0.009*"support" + 0.009*"great" + 0.008*"community"
Topic: 1 
Words: 0.031*"healthcare" + 0.025*"health" + 0.013*"amp" + 0.012*"today" + 0.010*"year" + 0.010*"community" + 0.009*"work" + 0.009*"healthy" + 0.009*"need" + 0.008*"gun"
Topic: 2 
Words: 0.034*"health" + 0.027*"amp" + 0.024*"care" + 0.023*"right" + 0.018*"woman" + 0.017*"healthcare" + 0.014*"vote" + 0.010*"get" + 0.009*"make" + 0.008*"today"
Topic: 3 
Words: 0.044*"health" + 0.025*"amp" + 0.012*"healthcare" + 0.012*"care" + 0.011*"public" + 0.011*"work" + 0.010*"community" + 0.009*"need" + 0.009*"family" + 0.008*"support"
Topic: 4 
Words: 0.039*"healthcare" + 0.022*"health" + 0.020*"access" + 0.019*"affordable" + 0.017*"need" + 0.015*"fight" + 0.015*"care" + 0.014*"amp" + 0.013*"family" + 0.011*"work"
Topic: 5 
Words: 0.029*"healthcare" + 0.028*"health" + 0.015*"work" + 0.012*"american" + 

In [31]:
for idx, topic in house_lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"healthcare" + 0.007*"amp" + 0.006*"care" + 0.006*"health" + 0.006*"condition" + 0.005*"community" + 0.005*"people" + 0.005*"need" + 0.004*"million" + 0.004*"american"
Topic: 1 
Words: 0.007*"care" + 0.007*"healthcare" + 0.006*"health" + 0.005*"amp" + 0.005*"need" + 0.005*"insurance" + 0.005*"coverage" + 0.005*"work" + 0.005*"plan" + 0.004*"get"
Topic: 2 
Words: 0.008*"amp" + 0.006*"healthcare" + 0.006*"care" + 0.006*"health" + 0.006*"fight" + 0.005*"family" + 0.005*"work" + 0.005*"need" + 0.005*"issue" + 0.004*"education"
Topic: 3 
Words: 0.008*"healthcare" + 0.007*"care" + 0.007*"amp" + 0.006*"health" + 0.006*"vote" + 0.005*"need" + 0.005*"access" + 0.004*"affordable" + 0.004*"right" + 0.004*"fight"
Topic: 4 
Words: 0.007*"healthcare" + 0.006*"care" + 0.006*"health" + 0.006*"amp" + 0.006*"affordable" + 0.005*"work" + 0.005*"insurance" + 0.005*"system" + 0.005*"thank" + 0.004*"fight"
Topic: 5 
Words: 0.007*"healthcare" + 0.007*"care" + 0.006*"health" + 0.006*"am

## Democrat Cleaning Tests

In [32]:
dem_tweets_select = pd.DataFrame(db.read(data_query.format("democrat_select")))
dem_docs_select = [ast.literal_eval(doc) for doc in  dem_tweets_select[0].tolist()]
dem_dict_select = corpora.Dictionary(dem_docs_select)

In [33]:
# Alter no_above to filter out frequently occuring words
dem_dict_select.filter_extremes(no_below=15, no_above=1, keep_n=10000)
dem_bow_corpus_select = [dem_dict_select.doc2bow(doc) for doc in dem_docs_select]

In [34]:
dem_counts_select = get_word_counts(dem_bow_corpus_select)
print_word_counts(dem_counts_select, 50, dem_dict_select)

vote, 1020 times
@realdonaldtrump, 940 times
trump, 935 times
amp, 907 times
get, 744 times
go, 556 times
#theresistance, 556 times
like, 524 times
people, 512 times
#maga, 484 times
dont, 480 times
make, 464 times
need, 454 times
say, 394 times
follow, 387 times
#trump, 373 times
time, 371 times
#votethemout, 368 times
take, 362 times
know, 360 times
one, 355 times
want, 347 times
let, 335 times
#fbr, 310 times
right, 305 times
please, 297 times
see, 296 times
#impeachtrump, 293 times
good, 289 times
come, 287 times
day, 284 times
back, 278 times
democrat, 277 times
america, 273 times
republican, 272 times
@gop, 262 times
im, 261 times
would, 252 times
think, 250 times
work, 250 times
country, 239 times
american, 238 times
#vote, 235 times
must, 234 times
election, 233 times
president, 228 times
lie, 227 times
via, 221 times
party, 220 times
gop, 217 times


In [35]:
dem_tfidf_select = models.TfidfModel(dem_bow_corpus_select)
dem_corpus_tfidf_select = dem_tfidf_select[dem_bow_corpus_select]
dem_lda_model = models.LdaMulticore(dem_bow_corpus_select, num_topics=10, id2word=dem_dict_select, passes=2, workers=2)
dem_lda_model_tfidf = models.LdaMulticore(dem_corpus_tfidf_select, num_topics=10, id2word=dem_dict_select, passes=2, workers=4)

