In [52]:
from db_client import DBClient
from analysis.data_clean import DataClean
import pandas as pd
import ast
from nltk.util import ngrams
from gensim import corpora, models
import csv

In [3]:
def get_word_counts(bow_corpus):
    """Sum per-document counts into corpus-wide totals.

    Args:
        bow_corpus: Iterable of documents, each an iterable of entries whose
            first item is a token id and whose second item is that token's
            count in the document.

    Returns:
        A list of ``(token_id, total_count)`` tuples ordered from the largest
        total to the smallest. Tokens with equal totals keep the order in
        which they were first encountered.
    """
    totals = {}
    for document in bow_corpus:
        for entry in document:
            token_id, freq = entry[0], entry[1]
            totals[token_id] = totals.get(token_id, 0) + freq
    # sorted() is stable, so ties stay in first-seen (dict insertion) order.
    return sorted(totals.items(), key=lambda item: item[1], reverse=True)

def print_word_counts(word_counts, num_words, word_dict):
    """Print the leading entries of a word-count list, one per line.

    Args:
        word_counts: Sequence of ``(token_id, count)`` entries, already in the
            order they should be shown.
        num_words: How many entries from the front of ``word_counts`` to print.
        word_dict: Lookup (indexed with ``[]``) from token id to display text.
    """
    for entry in word_counts[:num_words]:
        label = word_dict[entry[0]]
        total = entry[1]
        print(f"{label}, {total} times")

In [4]:
# create db client and set seed
db = DBClient()
# Sends `SELECT setseed(0.5)` through the client's write path.
# NOTE(review): presumably this makes the Random()-based sampling in the data
# query repeatable; that only holds if later db.read calls reuse the same
# database session -- confirm against DBClient.
db.write(["SELECT setseed(0.5)"])

Connected to political tweets DB
db write committed


In [12]:
# Hashtags used as Democratic-leaning markers.
select_hash_dem = [
    "#bluewave2018",
    "#bluewave18",
    "#bluewave",
    "#democrats",
    "#resist",
    "#resistance",
    "#voteblue",
    "#Votethemout",
    "#WinBlue",
    "#YesWeCan",
    "#FlipItBlue",
]

# Hashtags used as Republican-leaning markers.
select_hash_rep = [
    "#trump",
    "#maga",
    "#gop",
    "#republican",
    "#trumptrain",
    "#kag",
    "#LeadRight",
    "#VoteRed",
    "#RedWave",
    "#VoteDemsOut",
]

# Combined list (Democratic tags first) handed to the cleaner.
# NOTE(review): how DataClean uses these tags is not visible here.
select_hash = [*select_hash_dem, *select_hash_rep]
dc = DataClean(select_hash)

In [5]:
# SQL template for drawing a random sample of cleaned tweets from one table.
# The `{}` placeholder is filled with a table name in the `staging` schema via
# str.format. The CTE keeps rows whose tweet_date lies between 2018-01-01 and
# 2019-01-01 (SQL BETWEEN includes both endpoints), orders them randomly and
# keeps at most 50000; the outer query returns only tweet_text_clean.
# The extra Random() column selected inside the CTE is not used by the outer
# query.
data_query = """
with random_tweets as (
    select tweet_text_clean, Random() from staging.{}
    where tweet_date between '2018-01-01' and '2019-01-01'
    order by Random()
    limit 50000)
select tweet_text_clean 
from random_tweets;
"""

In [19]:
# Pull a sample from each staging table and parse every stored tweet
# (a Python-literal string) back into a Python object with ast.literal_eval.
docs = []
for table in ("democrat", "republican", "house", "senate"):
    tweets = pd.DataFrame(db.read(data_query.format(table)))
    # Column 0 holds the tweet_text_clean values returned by the query.
    docs += [ast.literal_eval(raw_tweet) for raw_tweet in tweets[0].tolist()]
    

['@knoone413',
 '@superstimpy',
 '@nbcolympics',
 'insult',
 '#socialism',
 '#feelthebern',
 'folk',
 'know',
 'ask',
 'also',
 '#imwithher',
 '#democrats',
 'also',
 'pull',
 '#fakenews',
 'especially',
 '@cnn',
 '@nytimes',
 '#maga',
 '#trumptrain']

In [20]:
def bigrams(tweet):
    """Return the adjacent-token pairs of ``tweet`` as underscore-joined strings.

    Args:
        tweet: Sequence of string tokens.

    Returns:
        A list with one ``"first_second"`` string per 2-gram produced by
        ``ngrams(tweet, 2)``, in order.
    """
    joined_pairs = []
    for pair in ngrams(tweet, 2):
        joined_pairs.append("_".join(pair))
    return joined_pairs

In [25]:
# Build one list of underscore-joined bigrams per tweet.
# The pairing is written inline instead of calling map(bigrams, docs): this
# assignment rebinds the name `bigrams` from the helper function to the
# resulting list, so calling the helper here would raise
# "TypeError: 'list' object is not callable" whenever the cell is run again.
bigrams = [["_".join(pair) for pair in ngrams(tweet, 2)] for tweet in docs]

  


In [32]:
# Discard tweets that yielded no bigrams (filter keeps items whose len() is non-zero).
bigrams = list(filter(len, bigrams))

In [35]:
# Spot check: display the bigram list of the tweet at index 10.
bigrams[10]

['@lindasuhler_im',
 'im_perma',
 'perma_#shadowbanned',
 '#shadowbanned_#retweet',
 '#retweet_would',
 'would_heres',
 'heres_true',
 'true_story',
 'story_#gentleman',
 '#gentleman_#democrats',
 '#democrats_#progressives',
 '#progressives_#metoo',
 '#metoo_#timesup',
 '#timesup_#womensmarch2018',
 '#womensmarch2018_#trumprussia',
 '#trumprussia_#trumprussiaconspiracy',
 '#trumprussiaconspiracy_#trumprussiacoverup',
 '#trumprussiacoverup_#trumpcolluded',
 '#trumpcolluded_video']

In [39]:
# Keep only bigrams in which neither half is a hashtag or a mention: reject a
# bigram when its first token starts with '#'/'@' or its second token does
# (seen as '_#' / '_@' inside the joined string).
bigrams_clean = [
    [
        pair
        for pair in tweet
        if not (pair.startswith(("#", "@")) or "_#" in pair or "_@" in pair)
    ]
    for tweet in bigrams
]

In [42]:
# Drop tweets left with no bigrams after the hashtag/mention filter.
bigrams_clean = [kept for kept in bigrams_clean if kept]

In [46]:
# Build a gensim Dictionary over the cleaned bigrams, then convert every tweet
# to its bag-of-words form with doc2bow.
bigram_dict = corpora.Dictionary(bigrams_clean)
bigram_bow_corpus = list(map(bigram_dict.doc2bow, bigrams_clean))
# Total the counts across all tweets and print the 50 largest.
bigram_counts = get_word_counts(bigram_bow_corpus)
print_word_counts(word_counts=bigram_counts, num_words=50, word_dict=bigram_dict)

president_trump, 1977 times
health_care, 1842 times
make_sure, 1759 times
look_forward, 1390 times
donald_trump, 1072 times
united_state, 1068 times
tax_cut, 1034 times
last_night, 945 times
every_day, 940 times
trump_administration, 931 times
white_house, 926 times
supreme_court, 905 times
follow_back, 898 times
im_proud, 897 times
american_people, 891 times
get_vote, 882 times
election_day, 808 times
look_like, 802 times
town_hall, 790 times
year_ago, 749 times
gun_violence, 693 times
work_hard, 682 times
social_security, 669 times
small_business, 660 times
preexist_condition, 651 times
law_enforcement, 644 times
open_letter, 640 times
knock_door, 637 times
hard_work, 615 times
work_together, 579 times
god_bless, 560 times
early_voting, 550 times
men_woman, 542 times
climate_change, 534 times
register_vote, 522 times
need_help, 515 times
last_year, 510 times
high_school, 498 times
cant_wait, 484 times
last_week, 482 times
great_time, 480 times
make_america, 461 times
across_country, 

In [53]:
def word_count_to_csv(word_counts, num_words, word_dict, path='topic_bigrams.csv'):
    """Write the first ``num_words`` entries of ``word_counts`` to a CSV file.

    Each row is ``token_text,count`` with no header row. An existing file at
    ``path`` is overwritten.

    Args:
        word_counts: Sequence of ``(token_id, count)`` entries, in the order
            they should be written.
        num_words: Maximum number of leading entries to write.
        word_dict: Lookup (indexed with ``[]``) from token id to token text.
        path: Destination file. Defaults to ``'topic_bigrams.csv'`` in the
            current working directory, matching the previous hard-coded name.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # newline='' hands line-ending control to csv.writer; without it Windows
    # output gets an extra blank line after every row. UTF-8 is set explicitly
    # so non-ASCII tweet text does not depend on the platform default encoding.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerows(
            (word_dict[tup[0]], tup[1]) for tup in word_counts[:num_words]
        )

In [54]:
# Export the first 1000 entries of bigram_counts (sorted by descending total)
# to the CSV file written by word_count_to_csv.
word_count_to_csv(bigram_counts, 1000, bigram_dict)