<a href="https://colab.research.google.com/github/fawazshah/Reddit-Analysis/blob/main/4_sentiment_vocab_overlap_reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
from collections import Counter
import nltk
import pandas as pd

nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Loading data

In [42]:
submissions_lib_dem_con_rep_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_liberal_democrats_conservative_republicans.tsv'
submissions_lib_dem_con_rep_df = pd.read_csv(submissions_lib_dem_con_rep_url, sep='\t')

comments_lib_dem_con_rep_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_liberal_democrats_conservative_republicans.tsv'
comments_lib_dem_con_rep_df = pd.read_csv(comments_lib_dem_con_rep_url, sep='\t')

submissions_ob_clin_sls_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_obama_hillaryclinton_shitliberalssay.tsv'
submissions_ob_clin_sls_df = pd.read_csv(submissions_ob_clin_sls_url, sep='\t')

comments_ob_clin_sls_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_obama_hillaryclinton_shitliberalssay.tsv'
comments_ob_clin_sls_df = pd.read_csv(comments_ob_clin_sls_url, sep='\t')

submissions_libertarian_sfp_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_libertarian_sandersforpresident.tsv'
submissions_libertarian_sfp_df = pd.read_csv(submissions_libertarian_sfp_url, sep='\t')

comments_libertarian_sfp_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_libertarian_sandersforpresident.tsv'
comments_libertarian_sfp_df = pd.read_csv(comments_libertarian_sfp_url, sep='\t')

In [43]:
submissions_df = pd.concat([submissions_lib_dem_con_rep_df, submissions_ob_clin_sls_df, submissions_libertarian_sfp_df], ignore_index=True)
comments_df = pd.concat([comments_lib_dem_con_rep_df, comments_ob_clin_sls_df, comments_libertarian_sfp_df], ignore_index=True)

### Data checking

In [44]:
print(submissions_df['article headline'].isna().sum())
print(submissions_df['article body'].isna().sum())
print(comments_df['comment body'].isna().sum())

0
0
1


In [45]:
comments_df.dropna(subset=['comment body'], inplace=True)

### Fixing left/right class imbalance

In [46]:
print(submissions_df['bias'].value_counts())
print(comments_df['bias'].value_counts())

left     605
right    434
Name: bias, dtype: int64
right    79809
left     22164
Name: bias, dtype: int64


Right is minority class in articles, however left is minority class in comments. We can't remove class imbalance independently in articles and in comments, since for any article we want to make sure all its comments are still in the comments dataset. Thus we fix class imbalance in articles by trimming all classes down to the size of the minority class, and then removing all comments associated with the removed articles. Thus comments won't be exactly balanced, but hopefully will be close enough.

In [47]:
submissions_left_df = submissions_df[submissions_df['bias'] == 'left']
submissions_right_df = submissions_df[submissions_df['bias'] == 'right']

# Undersample left class in submissions to match right class

right_count = len(submissions_right_df)
submissions_left_under_df = submissions_left_df.sample(right_count)

submissions_df = pd.concat([submissions_left_under_df, submissions_right_df], ignore_index=True)

print(submissions_df['bias'].value_counts())

right    434
left     434
Name: bias, dtype: int64


In [58]:
submission_ids = list(submissions_df['submission id'])

comments_to_drop = []
for i, row in comments_df.iterrows():
    if row['submission id'] not in submission_ids:
        comments_to_drop.append(i)

comments_df.drop(comments_to_drop, inplace=True)

In [60]:
print(comments_df['bias'].value_counts())

right    43569
left      4256
Name: bias, dtype: int64


### Simple text preprocessing

In [37]:
def preprocess(sentence):

    # No lowercasing since upper-case words will indicate sentiment (anger or joy)
    # Also no punctuation removal since ! and ? can indicate sentiment

    # Whitespace removal
    whitespace = '''\n\t'''

    for ch in sentence: 
        if ch in whitespace:
            sentence = sentence.replace(ch, " ")
    
    return sentence

In [38]:
submissions_df['article headline'] = submissions_df['article headline'].apply(preprocess)
submissions_df['article body'] = submissions_df['article body'].apply(preprocess)
comments_df['comment body'] = comments_df['comment body'].apply(preprocess)

In [39]:
submissions_df

Unnamed: 0,submission id,subreddit,article headline,article body,bias
0,l6a0q7,liberal,"Republicans now 'shocked, shocked' that there'...","© Greg Nash Republicans now 'shocked, shocked'...",left
1,jxxs8b,liberal,Georgia certifies election results confirming ...,Georgia Secretary of State Ben Raffensperger h...,left
2,kuscob,liberal,Report: QAnon Congresswoman Was Live-Tweeting ...,"Domestic Terrorist: Rep. Lauren Boebert, a new...",left
3,j2lufw,liberal,"More than 175 current, former law enforcement ...",EXCLUSIVE: More than 175 current and former la...,left
4,l8m3a8,liberal,GOP group launches billboards demanding Cruz a...,GOP campaigners have called on senators Ted Cr...,left
...,...,...,...,...,...
1034,hr3aiz,sandersforpresident,Study Shows 5.4 Million Have Lost Insurance Am...,Amid the worst public health crisis in a centu...,left
1035,lz7cve,sandersforpresident,Bernie — also known as Mr. The Struggle Continues,We use cookies on our websites for a number of...,left
1036,jvx3os,sandersforpresident,Medicare for All backers won in safe Democrati...,The votes were still coming in when the Democr...,left
1037,indmby,sandersforpresident,Bernie Sanders Says Country Must Get Ready for...,Bernie Sanders is sounding the alarm. The Verm...,left


In [40]:
comments_df

Unnamed: 0,comment id,submission id,subreddit,comment body,bias
0,gkzccbm,l6a0q7,liberal,"Hey Republican geniuses, I'll bet you were als...",left
1,gkzg91o,l6a0q7,liberal,The deficit exploded after the republican tax ...,left
2,gkzfown,l6a0q7,liberal,The Republican Party is a fucking cancer on ou...,left
3,gkz73xz,l6a0q7,liberal,"I wish I had gold to give you, just for the ti...",left
4,gkzhm11,l6a0q7,liberal,"it's not these politicians that really bug me,...",left
...,...,...,...,...,...
101969,gt8z5yp,mj44yw,sandersforpresident,Non-AMP Link: [There’s a bunch of recent artic...,left
101970,gt9e1a4,mj44yw,sandersforpresident,The companies that don’t pay taxes (like Amazo...,left
101971,gt9fypm,mj44yw,sandersforpresident,“Pay taxes through stocks” is so vague as to b...,left
101972,gt9s4nv,mj44yw,sandersforpresident,"No, he didn’t make $23B in profit. The value o...",left


### Sentiment analysis

In [41]:
subreddits = [
    'liberal',
    'democrats',
    'conservative',
    'republicans',
    'obama',
    'hillaryclinton',
    'shitliberalssay',
    'libertarian',
    'sandersforpresident'
]

In [42]:
# We will store only the compound (overall) sentiment

results = {}

for subreddit in subreddits:
    results[subreddit] = {}
    results[subreddit]['article headlines'] = []
    results[subreddit]['article bodies'] = []
    results[subreddit]['comment bodies'] = []

In [43]:
sia = SentimentIntensityAnalyzer()

for i, row in submissions_df.iterrows():
    subreddit = row['subreddit']
    headline = row['article headline']
    body = row['article body']
    results[subreddit]['article headlines'].append(sia.polarity_scores(headline)['compound'])
    results[subreddit]['article bodies'].append(sia.polarity_scores(body)['compound'])

for i, row in comments_df.iterrows():
    subreddit = row['subreddit']
    comment = row['comment body']
    results[subreddit]['comment bodies'].append(sia.polarity_scores(comment)['compound'])

In [44]:
for subreddit in subreddits:
    print(subreddit)
    headline_sentiments = results[subreddit]['article headlines']
    article_body_sentiments = results[subreddit]['article bodies']
    comment_sentiments = results[subreddit]['comment bodies']
    print(f"Headline sentiment: {sum(headline_sentiments) / len(headline_sentiments)}")
    print(f"Article body sentiment: {sum(article_body_sentiments) / len(article_body_sentiments)}")
    print(f"Comment sentiment: {sum(comment_sentiments) / len(comment_sentiments)}")
    print()

liberal
Headline sentiment: -0.127591472868217
Article body sentiment: -0.02572480620155042
Comment sentiment: -0.06145019904458616

democrats
Headline sentiment: -0.16293103448275864
Article body sentiment: 0.38970689655172425
Comment sentiment: 0.03598296022201663

conservative
Headline sentiment: -0.05971562500000001
Article body sentiment: -4.374999999996743e-06
Comment sentiment: 0.019022813248099398

republicans
Headline sentiment: -0.02522911392405063
Article body sentiment: 0.1171746835443038
Comment sentiment: -0.06247160493827159

obama
Headline sentiment: 0.10161307692307692
Article body sentiment: 0.5443384615384614
Comment sentiment: 0.30162500000000014

hillaryclinton
Headline sentiment: 0.03828181818181818
Article body sentiment: 0.5253062937062937
Comment sentiment: 0.0345304347826087

shitliberalssay
Headline sentiment: -0.41447500000000004
Article body sentiment: -0.33492500000000003
Comment sentiment: -0.015377812853373501

libertarian
Headline sentiment: -0.09663455

### Further preprocessing

Now we perform further text preprocessing before vocab analysis

In [45]:
# Text preprocessing preparation

#stop_words = ["the", "a", "an", "as", "this", "that", "is", "and", "or", "on",
#              "at", "to", "in", "by", "than", "of", "for", "be", "i", "you", 
#              "he", "she", "his", "her", "do", "it", "with"]

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

nltk.download('wordnet')
nltk.download('stopwords')

# required for tokenization
nltk.download('punkt')

# required for POS tagging
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [46]:
def preprocess(sentence):

    # Lowercase
    sentence = sentence.lower()

    # Punctuation removal
    punctuations = '''!()-—[]{};:'"“”‘’\,<>./?@#$%^&*_~'''

    for ch in sentence: 
        if ch in punctuations: 
            sentence = sentence.replace(ch, "")

    # Stop word removal
    remaining_words = [word for word in sentence.split() if not word in stopwords.words()]

    sentence = " ".join(remaining_words)

    # Lemmatization
    lemmatized_words = []

    # In order to lemmatise we must first POS-tag each sentence
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)

    for word, tag in tagged:
        pos = nltk_tag_to_wordnet_tag(tag) 
        if pos is not None:
            word = lemmatizer.lemmatize(word, pos=pos)

        lemmatized_words.append(word)

    sentence = " ".join(lemmatized_words)
    
    return sentence

In [47]:
submissions_df['article headline'] = submissions_df['article headline'].apply(preprocess)
submissions_df['article body'] = submissions_df['article body'].apply(preprocess)
comments_df['comment body'] = comments_df['comment body'].apply(preprocess)

KeyboardInterrupt: ignored

In [None]:
submissions_df.to_csv('submissions_preprocessed.tsv', sep='\t', index=False)
comments_df.to_csv('comments_preprocessed.tsv', sep='\t', index=False)

In [18]:
submissions_df

Unnamed: 0,submission id,subreddit,article headline,article body,bias
0,l6a0q7,liberal,republican now shock shocked there deficit hah...,© greg nash republican now shock shocked there...,left
1,jxxs8b,liberal,georgia certifies election result confirm bide...,georgia secretary state ben raffensperger hold...,left
2,kuscob,liberal,report qanon congresswoman be livetweeting nan...,domestic terrorist rep lauren boebert newly el...,left
3,j2lufw,liberal,more 175 current former law enforcement offici...,exclusive more 175 current former law enforcem...,left
4,l8m3a8,liberal,gop group launch billboard demand cruz hawley ...,gop campaigner have call senator ted cruz josh...,left
...,...,...,...,...,...
1034,hr3aiz,sandersforpresident,study show 54 million have lose insurance amid...,amid bad public health crisis century devastat...,left
1035,lz7cve,sandersforpresident,bernie also know mr struggle continue,we use cooky our website number purpose includ...,left
1036,jvx3os,sandersforpresident,medicare all backer win safe democratic distri...,vote be still come when democratic establishme...,left
1037,indmby,sandersforpresident,bernie sander say country must get ready trump...,bernie sander sound alarm vermont senator warn...,left


In [19]:
comments_df

Unnamed: 0,comment id,submission id,subreddit,comment body,bias
0,gkzccbm,l6a0q7,liberal,hey republican genius ill bet be also unaware ...,left
1,gkzg91o,l6a0q7,liberal,deficit explode after republican tax cut not s...,left
2,gkzfown,l6a0q7,liberal,republican party fuck cancer our country they ...,left
3,gkz73xz,l6a0q7,liberal,wish have gold give just title alone,left
4,gkzhm11,l6a0q7,liberal,its not these politician really bug me their r...,left
...,...,...,...,...,...
101969,gt8z5yp,mj44yw,sandersforpresident,nonamp link there bunch recent article out the...,left
101970,gt9e1a4,mj44yw,sandersforpresident,company dont pay tax like amazon nike obvious ...,left
101971,gt9fypm,mj44yw,sandersforpresident,pay tax through stock so vague meaningless hed...,left
101972,gt9s4nv,mj44yw,sandersforpresident,no didnt make 23b profit value share increase ...,left


### Vocab overlap

In [20]:
article_headline_vocab = []
article_body_vocab = []
comment_vocab = []

for i, row in submissions_df.iterrows():
    article_headline = row['article headline']
    article_body = row['article body']
    for word in article_headline.split():
        article_headline_vocab.append(word)
    for word in article_body.split():
        article_body_vocab.append(word)

for i, row in comments_df.iterrows():
    comment_body = row['comment body']
    for word in comment_body.split():
        comment_vocab.append(word)

print(len(article_headline_vocab))
print(len(article_body_vocab))
print(len(comment_vocab))

13062
418125
2345412


In [24]:
article_headline_multiset = Counter(article_headline_vocab)
article_body_multiset = Counter(article_body_vocab)
comment_multiset = Counter(comment_vocab)

In [29]:
# Computing Jaccard distances

headline_body_intersect = list((article_headline_multiset & article_body_multiset).elements())
headline_comment_intersect = list((article_headline_multiset & comment_multiset).elements())
body_comment_intersect = list((article_body_multiset & comment_multiset).elements())

print(len(headline_body_intersect))
print(len(headline_comment_intersect))
print(len(body_comment_intersect))

headline_body_union = list((article_headline_multiset | article_body_multiset).elements())
headline_comment_union = list((article_headline_multiset | comment_multiset).elements())
body_comment_union = list((article_body_multiset | comment_multiset).elements())

print(len(headline_body_union))
print(len(headline_comment_union))
print(len(body_comment_union))

12775
12804
386738
418412
2345670
2376799


In [32]:
headline_body_jaccard = len(headline_body_intersect) / len(headline_body_union)
headline_comment_jaccard = len(headline_comment_intersect) / len(headline_comment_union)
body_comment_jaccard = len(body_comment_intersect) / len(body_comment_union)

print(headline_body_jaccard)
print(headline_comment_jaccard)
print(body_comment_jaccard)

0.030532107109738728
0.005458568340815204
0.1627138012091052
