In [104]:
import json
import requests
import traceback
import re
import nltk
import string
import os
import wordcloud
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from pprint import pprint
from datetime import datetime   
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from itertools import islice

In [229]:
url = "https://api.pushshift.io/reddit/search/{}/?q={}&subreddit={}&sort=asc&limit=1000&metadata=true&after="
def get_data(leader, object_type, subreddit, start_time):
    leaders = []
    texts = []
    scores = []
    dates = []
    count = 0
    while True:
        new_url = url.format(object_type, leader, subreddit)+str(start_time)
        print(new_url)
        data = requests.get(new_url, headers = {'User-agent': 'Mallory'})
        json_data = data.json()
        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break
        for object in objects:
            start_time = (object['created_utc'])
            count += 1
            if object_type == 'comment':
                try:
                    text = object['body']
                    texts.append(text)
                    date = object['created_utc']
                    dates.append(date)
                    score = object['score']
                    scores.append(score)
                    leaders.append(leader)
                except Exception as err:
                    print(f"Couldn't print comment")
                    print(traceback.format_exc())
    return leaders, texts, scores, dates
    

In [106]:
def create_dfs(subreddit):
    types = ['comment']
    queries = ['fauci', 'cuomo', 'trump']
    start = 1582934399
    for obj_type in types:
        leaders = []
        texts = []
        scores = []
        dates = []
        for person in queries:
            names, comments, votes, days = get_data(person, obj_type, subreddit, start)
            leaders.extend(names)
            texts.extend(comments)
            scores.extend(votes)
            dates.extend(days)
        df = pd.DataFrame(list(zip(leaders, texts, scores, dates)), 
                columns =['Leader', 'Text', 'Score', 'Date'])
        filename = subreddit + obj_type + ".csv"
        df.to_csv(filename)

In [230]:
# create_dfs('politics')
# create_dfs('coronavirus')
# create_dfs('uspolitics')
# create_dfs('libertarian')
# create_dfs('neoliberal')
# create_dfs('democrats')
# create_dfs('progressive')
# create_dfs('conservative')
# create_dfs('republican')
# create_dfs('newyorkcity')
# create_dfs('moderatepolitics')
# create_dfs('news')

In [170]:
def clean_text(comments):
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"*$%()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    HTML = re.compile("&gt")
    NEWLINE = re.compile("\n")
    QUOT = re.compile("'")
    DOUBQUOT = re.compile("\"")
    LINK = re.compile("http\S+")
    comments = [LINK.sub("", str(line)) for line in comments]
    comments = [REPLACE_NO_SPACE.sub("", line.lower()) for line in comments]
    comments = [REPLACE_WITH_SPACE.sub(" ", line) for line in comments]
    comments = [HTML.sub("", line) for line in comments]
    comments = [NEWLINE.sub(" ", line) for line in comments]
    comments = [QUOT.sub(" ", line) for line in comments]
#     pprint(comments[:10])
    return comments

In [203]:
def plot_sentiments(sub, f_neg, f_neu, f_pos, c_neg, c_neu, c_pos, t_neg, t_neu, t_pos):
    fig, axs = plt.subplots(3, 3, figsize=(16,12), sharey='row', sharex='all')

    axs[0,0].hist(f_neg, color='green')
    axs[0,0].title.set_text('fauci neg')
    axs[0,0].axvline(np.median(f_neg), color='k', linestyle='dashed')
    axs[0,0].legend(['median = ' + '%.3f' % np.median(f_neg)])

    axs[0,1].hist(f_neu, color='green')
    axs[0,1].title.set_text('fauci neu')
    axs[0,1].axvline(np.mean(f_neu), color='k', linestyle='dashed')
    axs[0,1].legend(['mean = ' + '%.3f' % np.mean(f_neu)])

    axs[0,2].hist(f_pos, color='green')
    axs[0,2].title.set_text('fauci pos')
    axs[0,2].axvline(np.mean(f_pos), color='k', linestyle='dashed')
    axs[0,2].legend(['mean = ' + '%.3f' % np.mean(f_pos)])



    axs[1,0].hist(c_neg, color='blue')
    axs[1,0].title.set_text('cuomo neg')
    axs[1,0].axvline(np.mean(c_neg), color='k', linestyle='dashed')
    axs[1,0].legend(['mean = ' + '%.3f' % np.mean(c_neg)])

    axs[1,1].hist(c_neu, color='blue')
    axs[1,1].title.set_text('cuomo neu')
    axs[1,1].axvline(np.mean(c_neu), color='k', linestyle='dashed')
    axs[1,1].legend(['mean = ' + '%.3f' % np.mean(c_neu)])

    axs[1,2].hist(c_pos, color='blue')
    axs[1,2].title.set_text('cuomo pos')
    axs[1,2].axvline(np.mean(c_pos), color='k', linestyle='dashed')
    axs[1,2].legend(['mean = ' + '%.3f' % np.mean(c_pos)])



    axs[2,0].hist(t_neg, color='red')
    axs[2,0].title.set_text('trump neg')
    axs[2,0].axvline(np.mean(t_neg), color='k', linestyle='dashed')
    axs[2,0].legend(['mean = ' + '%.3f' % np.mean(t_neg)])

    axs[2,1].hist(t_neu, color='red')
    axs[2,1].title.set_text('trump neu')
    axs[2,1].axvline(np.mean(t_neu), color='k', linestyle='dashed')
    axs[2,1].legend(['mean = ' + '%.3f' % np.mean(t_neu)])

    axs[2,2].hist(t_pos, color='red')
    axs[2,2].title.set_text('trump pos')
    axs[2,2].axvline(np.mean(t_pos), color='k', linestyle='dashed')
    axs[2,2].legend(['mean = ' + '%.3f' % np.mean(t_pos)])
    
    title = 'Sentiment Analysis Distribution of Comments in r/%s' % sub
    fig.suptitle(title, fontsize=14)
    plt.savefig('%ssent.png' % sub)
    plt.show()

In [228]:
def compute_sentiment(df):
    fauci_sents = []
    cuomo_sents = []
    trump_sents = []
    all_sents = []
    sid = SentimentIntensityAnalyzer()
    sentiments = []
    for i in range(len(df)):
        pol = sid.polarity_scores(df.loc[i, 'Text'])
        all_sents.append(pol)
        if (df.loc[i, 'Leader']=='fauci') and (df.loc[i, 'Independence']=='True'):
            fauci_sents.append(pol)
        elif (df.loc[i, 'Leader']=='cuomo') and (df.loc[i, 'Independence']=='True'):
            cuomo_sents.append(pol)
        elif (df.loc[i, 'Leader']=='trump') and (df.loc[i, 'Independence']=='True'):
            trump_sents.append(pol)
    return fauci_sents, cuomo_sents, trump_sents, all_sents

In [172]:
def check_indep(df):
    indep = []
    for i in range(len(df)):
        if ((df.loc[i, 'Leader']=='fauci' or df.loc[i, 'Leader']=='cuomo') and ('trump' in df.loc[i, 'Text'] )) :
            indep.append('False')
        elif ((df.loc[i, 'Leader']=='trump') and (('fauci' in df.loc[i, 'Text']) or ('cuomo' in df.loc[i, 'Text']))) :
            indep.append('False')
        else:
            indep.append('True')
    return indep

In [231]:
def explore_scores(df, sub):
    df.sort_values(by=['Score'])
    print('Top 10 Comments in r/%s' % sub)
    for i in range(0,10):
        print(i+1)
        pprint(df.loc[i, 'Text'])

In [227]:
subreddits = ['democrats', 'news', 'uspolitics', 'republican', 'neoliberal', 'coronavirus', 'politics', 'libertarian',
              'newyorkcity', 'progressive', 'moderatepolitics', 'conservative']
for sub in subreddits:
    filename = 'data/%scomment.csv' % sub
    df = pd.read_csv(filename, sep=',')
    explore_scores(df, sub)
    df['Text'] = clean_text(df['Text'])
    df['Independence'] = check_indep(df)
#     f_sent, c_sent, t_sent, all_sents = compute_sentiment(df)
    df['Sentiments'] = all_sents

Top 10 Comments in r/democrats
1
('Trump is lucky to ever be allowed in the same room as someone like Tony '
 'Fauci. He was the one who was able to get Trump to have the part about AIDS '
 'funding in his 2019 SOTU speech. Hero vs. Zero.')
2
('The Times reported, the administration instructed Fauci "not to say anything '
 'else without clearance." A NIAID spokesperson told Business Insider that '
 '"this is not true," however.')
3
('Quote\n'
 '\n'
 'This is not the first coronavirus threat in the United States, and it will '
 'not be the last,” said Senator Markey. “With three novel coronavirus '
 'outbreaks in the last 18 years, finding a vaccine that treats all '
 'coronaviruses would be a milestone for global health and safety. We have the '
 'best scientists and researchers on the planet, and the United States should '
 'lead the world in containing this crisis and preventing future ones. By '
 'providing dedicated, consistent resources, we can perform the basic '
 'scientific res

Top 10 Comments in r/uspolitics
1
("that is a complete misrepresentation of what he said. it's right in the "
 'article.\n'
 '\n'
 '&gt;“Well, I think the 3.4% is really a false number,” the President told '
 'the conservative cable network’s Sean Hannity. “Now, this is just my hunch, '
 'and, but based on a lot of conversation with a lot of people who do this, '
 'because a lot of people will have this and it’s very mild. They’ll get '
 'better very rapidly. They don’t even see a doctor. They don’t even call a '
 'doctor. You never hear about those people, so you can’t put them down in the '
 'category of the overall population in terms of this corona flu, and, or '
 'virus, so you just can’t do that,” Trump said, clearly trying to minimize '
 'the danger and the numbers of people infected.\n'
 '\n'
 '&gt;“So we have thousands or hundreds of thousands of people that just get '
 'better, by, you know, sitting around and even going to work – some of ’em go '
 'to work – but they get bet

Top 10 Comments in r/republican
1
("I don't believe he is because of the blatant outright lies that come "
 'directly from his mouth or his twitter.  In his address, maybe last '
 'Wednesday, he stated that, "millions of Americans will be able to be '
 'tested."  The very next day Dr. Fauci confirmed that no, we do not have the '
 'capability to test at that level at all.\n'
 '\n'
 "So that's just one example.  Republican/Democrat, I always felt like Bush or "
 'Obama were telling you the truth.  Sure it is from their perspective, but in '
 'that respect it was the truth.\n'
 '\n'
 'With Trump, he has often told outright lies and it is just too much and '
 'extremely disappointing in a time of such uncertainty.\n'
 '\n'
 'The choices I need to make in my brain are that A: He is outright lying to '
 'me, or B:  He is completely incompetent.  In either case I am done with the '
 'man and the damage he has done to the GOP will be long lasting.\n'
 '\n'
 'Please practice as much social dis

Top 10 Comments in r/neoliberal
1
('fauci is 100 trustworthy redfield is 90 trustworthy pence and trump just '
 'show that  #technocracy good actually')
2
'trump looks like hes going to fire fauci the moment he gets off stage'
3
('basically fauci is the only one up there talking science and everyone else '
 'is like please oh mighty american consumer dont throw the economy into a '
 'volcano')
4
('the guys up there at 100 strength  the guys up there at 99 strength its just '
 'fauci')
5
'12 18 months according to doctor fauci'
6
('if he just shut his mouth and let competent people like fauci do what they '
 'needed to from the outset in january i think he would have won comfortably '
 'in november  as it stands i think its going to hurt him a lot')
7
('dr fauci looks like hes aged years in the last few weeks  or maybe thats '
 'just cspans camera  ')
8
('dr fauci just testified on the hill that coronavirus is at least 10x as '
 'lethal as the seasonal flu')
9
('dr fauci testifies that 

Top 10 Comments in r/libertarian
1
('Anthony S. Fauci, director of the National Institute of Allergy and '
 'Infectious Diseases, has said that the mortality rate for seasonal flu is '
 '0.1 percent. \n'
 '\n'
 'Notably, the research showed that patients ages 10 to 19 had the same chance '
 'of dying from COVID-19 as patients in their 20s and 30s, but the disease '
 'appeared to be much more fatal in people ages 50 and over.\xa0\n'
 'About 80% of COVID-19 cases are mild, the research showed, and experts think '
 "many mild cases haven't been reported because some people aren't going to "
 'the doctor or hospitals for treatment.')
2
('After the first couple of planned PR questions it did turn into a "gotcha" '
 'session.  These reporters had access to Dr. Fauci and other experts, plus '
 'representatives of the business community, and instead of asking the experts '
 'questions about the disease and its spread or asking the businessmen about '
 'how they can help or how this is impactin

Top 10 Comments in r/moderatepolitics
1
('heres the thing hes right that the 34 number is probably high but saying its '
 'just based on his gut is incredibly dumb base it on the figures coming out '
 'of south korea or the cruise ship base it on the latest data from who hell '
 'just cite your own director of niaid anthony fauci remember him trump he '
 'stood next to you at your pressers   but that would require trump to '
 'actually know any of those things which   lets be honest   is a real stretch '
 'to assume of him')
2
(' what advice would you give the potus about his behaviors around '
 'disseminating reliable info about this virus  shut the fuck up about any '
 'actual details and let fauci talk stick to were working on it and weve got '
 'the very best people and your other trumpy slogans dont speak about '
 'statistics or figures you cant back up')
3
('shut the fuck up about any actual details and let fauci talk stick to were '
 'working on it and weve got the very best peo

Top 10 Comments in r/conservative
1
("If you watched the President's briefing, Dr. Fauci said there is an American "
 'company with a vaccine candidate in testing. But a complete set of tests '
 "won't be completed for [at best 12-18 "
 'months.](https://www.foxnews.com/media/dr-anthony-fauci-coronavirus-vaccine-be-prepared)\n'
 '\n'
 '&gt;"That whole process is going to take about a year to a year and a half. '
 "Even though we are going as fast as you possibly can, it's still going to "
 'take a good year, year and a half to see if we have a vaccine that works."')
2
("The doctor that spoke at the President's press conference [said 12-18 "
 'months.](https://www.foxnews.com/media/dr-anthony-fauci-coronavirus-vaccine-be-prepared)')
3
("No, you don't understand how diseases work. Not everyone who gets the flu "
 'develops symptoms or gets sick enough to go to the ER, or gets sick enough '
 "to get admitted, or ends up in the ICU and ultimately dies. It's not a "
 "'time' thing.\n"
 '\n'

In [None]:
def TFIDF(leader):
    #stemmer = SnowballStemmer("english")
    #leader['Text'] = leader.Text.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
    
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(['bernie','just','like','say','trump', 'biden', 'coronavirus', 'fauci','cuomo', 'html',
                      'https', 'www', 'com', 'go', 'know', 'gt', 'pretty', 'us', 'really', 'also', 'much',
                      'see', 'make', 'let', 'way', 'still', 'could', 'trying', 'take', 'anthony', 'today',
                      'want', 'back', 'donald', 'sanders', 'thing', 'said', 'andrew', 'chris', '000', 'amp',
                      'says', 'every', 'de', 'lot'])
    
    cvec = CountVectorizer(stop_words=stopwords, min_df=.0025, max_df=.1, ngram_range=(1,3))
    cvec.fit(leader.Text)
    #print(list(islice(cvec.vocabulary_.items(), 20)))
    #print(len(cvec.vocabulary_))
    
    cvec_counts = cvec.transform(leader.Text)
    #print ('sparse matrix shape:', cvec_counts.shape)
    #print ('nonzero count:', cvec_counts.nnz)
    #print ('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))
    
    occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
    counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
    print("40 most common terms:")
    print(counts_df.sort_values(by='occurrences', ascending=False).head(40))
    print()
    
    transformer = TfidfTransformer()
    transformed_weights = transformer.fit_transform(cvec_counts)
    
    print("40 highest weighted terms:")
    weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
    print(weights_df.sort_values(by='weight', ascending=False).head(40))

In [None]:
print('CUOMO:')
print()
TFIDF(cuomo)

In [None]:
print('FAUCI:')
print()
TFIDF(fauci)

In [None]:
print('TRUMP:')
print()
TFIDF(trump)

In [None]:
print('OVERALL:')
print()
TFIDF(overall)

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['bernie','just','like','say','trump', 'biden', 'coronavirus', 'fauci','cuomo', 'html', 
                  'https', 'www', 'com', 'go', 'know', 'gt', 'pretty', 'us', 'really', 'also', 'much',
                  'see', 'make', 'let', 'way', 'still', 'could', 'trying', 'take', 'anthony', 'today',
                  'want', 'back', 'donald', 'sanders', 'thing', 'said', 'andrew', 'chris', '000', 'amp',
                  'says', 'every', 'de', 'lot', 'would', 'even', 'one', 'new', 'york', 'he', 'that',
                  'people', 'saying', 'going', 'well', 'guy'])

In [None]:
ctext = " ".join(post for post in cuomo.Text)
wc = wordcloud.WordCloud(stopwords=stopwords, background_color="white").generate(ctext)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
ftext = " ".join(post for post in fauci.Text)
wc = wordcloud.WordCloud(stopwords=stopwords, background_color="white").generate(ftext)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
ttext = " ".join(post for post in trump.Text)
wc = wordcloud.WordCloud(stopwords=stopwords, background_color="white").generate(ttext)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()