<a href="https://colab.research.google.com/github/fawazshah/Reddit-Analysis/blob/main/4_sentiment_reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import pandas as pd

nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




### Loading data

In [8]:
def _read_tsv(url):
    """Read the tab-separated file at `url` into a DataFrame."""
    return pd.read_csv(url, sep='\t')


# Source files: one submissions file and one comments file per group of subreddits.
submissions_lib_dem_con_rep_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_liberal_democrats_conservative_republicans.tsv'
comments_lib_dem_con_rep_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_liberal_democrats_conservative_republicans.tsv'
submissions_ob_clin_sls_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_obama_hillaryclinton_shitliberalssay.tsv'
comments_ob_clin_sls_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_obama_hillaryclinton_shitliberalssay.tsv'

# Download and parse each file (same order as the original cell).
submissions_lib_dem_con_rep_df = _read_tsv(submissions_lib_dem_con_rep_url)
comments_lib_dem_con_rep_df = _read_tsv(comments_lib_dem_con_rep_url)
submissions_ob_clin_sls_df = _read_tsv(submissions_ob_clin_sls_url)
comments_ob_clin_sls_df = _read_tsv(comments_ob_clin_sls_url)

In [13]:
# Stack the two subreddit groups into a single frame per data kind;
# ignore_index=True renumbers the rows 0..n-1 across both inputs.
submissions_df = pd.concat(
    [submissions_lib_dem_con_rep_df, submissions_ob_clin_sls_df],
    ignore_index=True,
)
comments_df = pd.concat(
    [comments_lib_dem_con_rep_df, comments_ob_clin_sls_df],
    ignore_index=True,
)

### Data checking

In [17]:
# Print the number of missing values in each text column, one count per line.
for frame, column in (
    (submissions_df, 'article headline'),
    (submissions_df, 'article body'),
    (comments_df, 'comment body'),
):
    print(frame[column].isna().sum())

0
0
0


### Simple text preprocessing

In [18]:
def preprocess(sentence):
    """Replace every newline and tab character in `sentence` with a space.

    Nothing else is changed on purpose:
    - no lowercasing, since upper-case words can indicate sentiment
      (anger or joy);
    - no punctuation removal, since ! and ? can indicate sentiment.

    Each newline/tab becomes exactly one space; runs of whitespace are not
    collapsed and other whitespace characters are left as they are.

    Args:
        sentence: The text to clean (a ``str``).

    Returns:
        A new string with newlines and tabs replaced by spaces.
    """
    # Two linear passes. The previous implementation called str.replace once
    # for every whitespace character it met, re-scanning the whole string
    # each time.
    return sentence.replace('\n', ' ').replace('\t', ' ')

In [19]:
# Clean each text column in place by running preprocess on every value.
for frame, column in (
    (submissions_df, 'article headline'),
    (submissions_df, 'article body'),
    (comments_df, 'comment body'),
):
    frame[column] = frame[column].apply(preprocess)

In [20]:
# Bare last expression: the notebook renders the submissions frame as a table.
submissions_df

Unnamed: 0,submission id,subreddit,article headline,article body,bias
0,l6a0q7,liberal,"Republicans now 'shocked, shocked' that there'...","© Greg Nash Republicans now 'shocked, shocked'...",left
1,jxxs8b,liberal,Georgia certifies election results confirming ...,Georgia Secretary of State Ben Raffensperger h...,left
2,kuscob,liberal,Report: QAnon Congresswoman Was Live-Tweeting ...,"Domestic Terrorist: Rep. Lauren Boebert, a new...",left
3,j2lufw,liberal,"More than 175 current, former law enforcement ...",EXCLUSIVE: More than 175 current and former la...,left
4,l8m3a8,liberal,GOP group launches billboards demanding Cruz a...,GOP campaigners have called on senators Ted Cr...,left
...,...,...,...,...,...
798,jps7p4,hillaryclinton,Rethinking the causes of Hillary Clinton’s 201...,"Nov. 7, 2020, 11:08 a.m. ET Nov. 7, 2020, 11:0...",left
799,kozosp,shitliberalssay,Just work 70 hours a week smh,This thread has been locked by the moderators ...,right
800,l7rljm,shitliberalssay,What did he learn from this pandemic?!?!,What did he learn from this pandemic?!?!,right
801,ltnskm,shitliberalssay,I am once again asking Charlotte Clymer to shu...,I am once again asking Charlotte Clymer to shu...,right


In [21]:
# Bare last expression: the notebook renders the comments frame as a table.
comments_df

Unnamed: 0,comment id,submission id,subreddit,comment body,bias
0,gkzccbm,l6a0q7,liberal,"Hey Republican geniuses, I'll bet you were als...",left
1,gkzg91o,l6a0q7,liberal,The deficit exploded after the republican tax ...,left
2,gkzfown,l6a0q7,liberal,The Republican Party is a fucking cancer on ou...,left
3,gkz73xz,l6a0q7,liberal,"I wish I had gold to give you, just for the ti...",left
4,gkzhm11,l6a0q7,liberal,"it's not these politicians that really bug me,...",left
...,...,...,...,...,...
67703,g10cvt6,i74pka,shitliberalssay,But your landlord gets to choose a lawyer to h...,right
67704,g11jkic,i74pka,shitliberalssay,"Ah yes, the pitfalls. Getting destroyed by CIA...",right
67705,g11qzfq,i74pka,shitliberalssay,That’s a pretty classist approach tbh consider...,right
67706,g10ncu2,i74pka,shitliberalssay,"Damn, I didn't know I was so rich on my 10th b...",right


### Sentiment analysis

In [22]:
# Subreddit names used as keys when collecting and reporting sentiment scores.
subreddits = [
    'liberal',
    'democrats',
    'conservative',
    'republicans',
    'obama',
    'hillaryclinton',
    'shitliberalssay',
]

In [23]:
# Only the compound (overall) sentiment score is stored for each text.
# Layout: results[subreddit][kind of text] -> list of compound scores.

results = {}

for subreddit in subreddits:
    # A fresh set of empty lists per subreddit (never shared between keys).
    results[subreddit] = {
        'article headlines': [],
        'article bodies': [],
        'comment bodies': [],
    }

In [24]:
# polarity_scores(text) returns a dict of scores; only its 'compound' entry
# is kept.
sia = SentimentIntensityAnalyzer()

# NOTE: scores are appended to the lists already held in `results`, so running
# this cell twice without re-creating `results` stores every score twice.

# Walk just the needed columns in parallel instead of DataFrame.iterrows(),
# which builds a full Series for every row only to read two or three fields.
for subreddit, headline, body in zip(
    submissions_df['subreddit'],
    submissions_df['article headline'],
    submissions_df['article body'],
):
    subreddit_scores = results[subreddit]
    subreddit_scores['article headlines'].append(sia.polarity_scores(headline)['compound'])
    subreddit_scores['article bodies'].append(sia.polarity_scores(body)['compound'])

for subreddit, comment in zip(comments_df['subreddit'], comments_df['comment body']):
    results[subreddit]['comment bodies'].append(sia.polarity_scores(comment)['compound'])

In [25]:
def _mean(values):
    """Arithmetic mean of a non-empty list of numbers (sum divided by count)."""
    return sum(values) / len(values)


# For each subreddit, print its name followed by the mean compound score of
# its headlines, article bodies and comments, then a blank separator line.
for subreddit in subreddits:
    print(subreddit)
    headline_sentiments, article_body_sentiments, comment_sentiments = (
        results[subreddit][key]
        for key in ('article headlines', 'article bodies', 'comment bodies')
    )
    print(f"Headline sentiment: {_mean(headline_sentiments)}")
    print(f"Article body sentiment: {_mean(article_body_sentiments)}")
    print(f"Comment sentiment: {_mean(comment_sentiments)}")
    print()

liberal
Headline sentiment: -0.127591472868217
Article body sentiment: -0.02572480620155042
Comment sentiment: -0.06145019904458616

democrats
Headline sentiment: -0.16293103448275864
Article body sentiment: 0.38970689655172425
Comment sentiment: 0.03598296022201663

conservative
Headline sentiment: -0.05971562500000001
Article body sentiment: -4.374999999996743e-06
Comment sentiment: 0.019022813248099398

republicans
Headline sentiment: -0.02522911392405063
Article body sentiment: 0.1171746835443038
Comment sentiment: -0.06247160493827159

obama
Headline sentiment: 0.10161307692307692
Article body sentiment: 0.5443384615384614
Comment sentiment: 0.30162500000000014

hillaryclinton
Headline sentiment: 0.03828181818181818
Article body sentiment: 0.5253062937062937
Comment sentiment: 0.0345304347826087

shitliberalssay
Headline sentiment: -0.41447500000000004
Article body sentiment: -0.33492500000000003
Comment sentiment: -0.015377812853373501

