<a href="https://colab.research.google.com/github/fawazshah/Reddit-Analysis/blob/main/4_sentiment_vocab_overlap_reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from collections import Counter
import nltk
import pandas as pd

nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




### Loading data

In [2]:
submissions_lib_dem_con_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_liberal_democrats_conservative.tsv'
submissions_lib_dem_con_df = pd.read_csv(submissions_lib_dem_con_url, sep='\t')

comments_lib_dem_con_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_liberal_democrats_conservative.tsv'
comments_lib_dem_con_df = pd.read_csv(comments_lib_dem_con_url, sep='\t')

submissions_rep_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_republican.tsv'
submissions_rep_df = pd.read_csv(submissions_rep_url, sep='\t')

comments_rep_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_republican.tsv'
comments_rep_df = pd.read_csv(comments_rep_url, sep='\t')

submissions_ob_clin_sls_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_obama_hillaryclinton_shitliberalssay.tsv'
submissions_ob_clin_sls_df = pd.read_csv(submissions_ob_clin_sls_url, sep='\t')

comments_ob_clin_sls_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_obama_hillaryclinton_shitliberalssay.tsv'
comments_ob_clin_sls_df = pd.read_csv(comments_ob_clin_sls_url, sep='\t')

submissions_libertarian_sfp_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/submissions_top300_year_libertarian_sandersforpresident.tsv'
submissions_libertarian_sfp_df = pd.read_csv(submissions_libertarian_sfp_url, sep='\t')

comments_libertarian_sfp_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/assembled-data/comments_top300_year_libertarian_sandersforpresident.tsv'
comments_libertarian_sfp_df = pd.read_csv(comments_libertarian_sfp_url, sep='\t')

In [3]:
submissions_df = pd.concat([submissions_lib_dem_con_df, submissions_rep_df, submissions_ob_clin_sls_df, submissions_libertarian_sfp_df], ignore_index=True)
comments_df = pd.concat([comments_lib_dem_con_df, comments_rep_df, comments_ob_clin_sls_df, comments_libertarian_sfp_df], ignore_index=True)

In [4]:
print(f"No. submissions: {len(submissions_df)}")
print(f"No. comments: {len(comments_df)}")

No. submissions: 992
No. comments: 50609


### Data checking

In [5]:
# Removing NAs

print(submissions_df['article headline'].isna().sum())
print(submissions_df['article body'].isna().sum())
print(comments_df['comment body'].isna().sum())

0
0
1


In [6]:
comments_df.dropna(subset=['comment body'], inplace=True)

In [7]:
# Checking all submission ids in comments_df map to a submission in submissions_df

submission_ids = list(submissions_df['submission id'])

num_errors = 0
for i, row in comments_df.iterrows():
    if row['submission id'] not in submission_ids:
        num_errors += 1
print(f"Num integrity errors: {num_errors}")

Num integrity errors: 0


### Fixing left/right class imbalance

In [8]:
print(submissions_df['bias'].value_counts())
print(comments_df['bias'].value_counts())

left     600
right    392
Name: bias, dtype: int64
right    44631
left      5977
Name: bias, dtype: int64


Right is minority class in articles, however left is minority class in comments. We can't remove class imbalance independently in articles and in comments, since for any article we want to make sure all its comments are still in the comments dataset. Thus we fix class imbalance in comments by trimming all classes down to the size of the minority class, and then removing all articles whose comments are no longer present. Thus articles won't be exactly balanced.

We tried other way around, but balancing articles causes comments to HEAVILY skew towards right (48,000 vs 2000 comments)

In [9]:
comments_left_df = comments_df[comments_df['bias'] == 'left']
comments_right_df = comments_df[comments_df['bias'] == 'right']

# Undersample right class in comments to match left class

left_count = len(comments_left_df)
comments_right_under_df = comments_right_df.sample(left_count)

comments_df = pd.concat([comments_left_df, comments_right_under_df], ignore_index=True)

print(comments_df['bias'].value_counts())

left     5977
right    5977
Name: bias, dtype: int64


In [10]:
submissions_to_keep = Counter(set(comments_df['submission id']))
all_submission_ids = Counter(list(submissions_df['submission id']))
submissions_to_drop = all_submission_ids - submissions_to_keep

indices_to_drop = submissions_df[submissions_df['submission id'].isin(submissions_to_drop)].index
submissions_df.drop(indices_to_drop, inplace=True)
submissions_df.reset_index(drop=True, inplace=True)

In [11]:
print(submissions_df['bias'].value_counts())

left     419
right    387
Name: bias, dtype: int64


In [12]:
print(len(submissions_df))
print(len(comments_df))

806
11954


### Simple text preprocessing

In [13]:
def preprocess(sentence):

    # No lowercasing since upper-case words will indicate sentiment (anger or joy)
    # Also no punctuation removal since ! and ? can indicate sentiment

    # Whitespace removal
    whitespace = '''\n\t'''

    for ch in sentence: 
        if ch in whitespace:
            sentence = sentence.replace(ch, " ")
    
    return sentence

In [14]:
submissions_df['article headline'] = submissions_df['article headline'].apply(preprocess)
submissions_df['article body'] = submissions_df['article body'].apply(preprocess)
comments_df['comment body'] = comments_df['comment body'].apply(preprocess)

In [15]:
submissions_df

Unnamed: 0,submission id,subreddit,article headline,article body,bias
0,l6a0q7,liberal,"Republicans now 'shocked, shocked' that there'...","© Greg Nash Republicans now 'shocked, shocked'...",left
1,jxxs8b,liberal,Georgia certifies election results confirming ...,Georgia Secretary of State Ben Raffensperger h...,left
2,kuscob,liberal,Report: QAnon Congresswoman Was Live-Tweeting ...,"Domestic Terrorist: Rep. Lauren Boebert, a new...",left
3,j2lufw,liberal,"More than 175 current, former law enforcement ...",EXCLUSIVE: More than 175 current and former la...,left
4,l8m3a8,liberal,GOP group launches billboards demanding Cruz a...,GOP campaigners have called on senators Ted Cr...,left
...,...,...,...,...,...
801,hr3aiz,sandersforpresident,Study Shows 5.4 Million Have Lost Insurance Am...,Amid the worst public health crisis in a centu...,left
802,lz7cve,sandersforpresident,Bernie — also known as Mr. The Struggle Continues,We use cookies on our websites for a number of...,left
803,jvx3os,sandersforpresident,Medicare for All backers won in safe Democrati...,The votes were still coming in when the Democr...,left
804,indmby,sandersforpresident,Bernie Sanders Says Country Must Get Ready for...,Bernie Sanders is sounding the alarm. The Verm...,left


In [16]:
comments_df

Unnamed: 0,comment id,submission id,subreddit,comment body,bias
0,gkzccbm,l6a0q7,liberal,"Hey Republican geniuses, I'll bet you were als...",left
1,gkzg91o,l6a0q7,liberal,The deficit exploded after the republican tax ...,left
2,gkzfown,l6a0q7,liberal,The Republican Party is a fucking cancer on ou...,left
3,gkz73xz,l6a0q7,liberal,"I wish I had gold to give you, just for the ti...",left
4,gkzhm11,l6a0q7,liberal,"it's not these politicians that really bug me,...",left
...,...,...,...,...,...
11949,galze45,jkuqjq,libertarian,Produce the tape. The internet has many copies...,right
11950,gvtbfig,my5wni,libertarian,I’d rather see fed legalization too with this ...,right
11951,gj68for,kwqevc,libertarian,How does it feel to be objectively *that* wron...,right
11952,ginmbaz,ktge5g,conservative,Back to land lines and radio I guess...,right


### Sentiment analysis

In [17]:
subreddits = [
    'liberal',
    'democrats',
    'conservative',
    'republican',
    'obama',
    'hillaryclinton',
    'shitliberalssay',
    'libertarian',
    'sandersforpresident'
]

In [18]:
# We will store only the compound (overall) sentiment

results = {}

for subreddit in subreddits:
    results[subreddit] = {}
    results[subreddit]['article headlines'] = []
    results[subreddit]['article bodies'] = []
    results[subreddit]['comment bodies'] = []

In [19]:
sia = SentimentIntensityAnalyzer()

for i, row in submissions_df.iterrows():
    subreddit = row['subreddit']
    headline = row['article headline']
    body = row['article body']
    results[subreddit]['article headlines'].append(sia.polarity_scores(headline)['compound'])
    results[subreddit]['article bodies'].append(sia.polarity_scores(body)['compound'])

for i, row in comments_df.iterrows():
    subreddit = row['subreddit']
    comment = row['comment body']
    results[subreddit]['comment bodies'].append(sia.polarity_scores(comment)['compound'])

In [20]:
for subreddit in subreddits:
    print(subreddit)
    headline_sentiments = results[subreddit]['article headlines']
    article_body_sentiments = results[subreddit]['article bodies']
    comment_sentiments = results[subreddit]['comment bodies']
    print(f"Headline sentiment: {sum(headline_sentiments) / len(headline_sentiments)}")
    print(f"Article body sentiment: {sum(article_body_sentiments) / len(article_body_sentiments)}")
    print(f"Comment sentiment: {sum(comment_sentiments) / len(comment_sentiments)}")
    print()

liberal
Headline sentiment: -0.12600352941176465
Article body sentiment: -0.04648666666666667
Comment sentiment: -0.06488906518010307

democrats
Headline sentiment: -0.12061851851851853
Article body sentiment: 0.3994111111111111
Comment sentiment: 0.03578343653250772

conservative
Headline sentiment: -0.06153121019108282
Article body sentiment: -0.011707643312101897
Comment sentiment: 0.019313163547100318

republican
Headline sentiment: 0.012752777777777771
Article body sentiment: 0.17397222222222222
Comment sentiment: 0.018928571428571423

obama
Headline sentiment: 0.060933333333333325
Article body sentiment: 0.4760060606060606
Comment sentiment: 0.2396809523809524

hillaryclinton
Headline sentiment: -0.0037677966101694943
Article body sentiment: 0.5367000000000001
Comment sentiment: 0.036123958333333324

shitliberalssay
Headline sentiment: -0.41447500000000004
Article body sentiment: -0.33492500000000003
Comment sentiment: -0.036435897435897445

libertarian
Headline sentiment: -0.096

### Further preprocessing

Now we perform further text preprocessing before vocab analysis

In [21]:
# Text preprocessing preparation

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

nltk.download('wordnet')
nltk.download('stopwords')

# required for tokenization
nltk.download('punkt')

# required for POS tagging
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [22]:
def preprocess(sentence):

    # Lowercase
    sentence = sentence.lower()

    # Punctuation removal
    punctuations = '''!()-—[]{};:'"“”‘’\,<>./?@#$%^&*_~'''

    for ch in sentence: 
        if ch in punctuations: 
            sentence = sentence.replace(ch, "")

    # Stop word removal
    stop_words = set(stopwords.words('english'))
    remaining_words = [word for word in sentence.split() if not word in stop_words]

    sentence = " ".join(remaining_words)

    # Lemmatization
    lemmatized_words = []

    # In order to lemmatise we must first POS-tag each sentence
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)

    for word, tag in tagged:
        pos = nltk_tag_to_wordnet_tag(tag) 
        if pos is not None:
            word = lemmatizer.lemmatize(word, pos=pos)

        lemmatized_words.append(word)

    sentence = " ".join(lemmatized_words)
    
    return sentence

In [23]:
submissions_df['article headline'] = submissions_df['article headline'].apply(preprocess)
submissions_df['article body'] = submissions_df['article body'].apply(preprocess)
comments_df['comment body'] = comments_df['comment body'].apply(preprocess)

In [24]:
# One-hot encode labels
# left == 0
# right == 1

def encode_labels(label):
    if label == "left":
        return 0
    else:
        return 1

submissions_df['bias'] = submissions_df['bias'].apply(encode_labels)
comments_df['bias'] = comments_df['bias'].apply(encode_labels)

In [25]:
submissions_df

Unnamed: 0,submission id,subreddit,article headline,article body,bias
0,l6a0q7,liberal,republican shock shocked there deficit hahahah...,© greg nash republican shock shocked there def...,0
1,jxxs8b,liberal,georgia certifies election result confirm bide...,georgia secretary state ben raffensperger hold...,0
2,kuscob,liberal,report qanon congresswoman livetweeting nancy ...,domestic terrorist rep lauren boebert newly el...,0
3,j2lufw,liberal,175 current former law enforcement official en...,exclusive 175 current former law enforcement o...,0
4,l8m3a8,liberal,gop group launch billboard demand cruz hawley ...,gop campaigner call senator ted cruz josh hawl...,0
...,...,...,...,...,...
801,hr3aiz,sandersforpresident,study show 54 million lose insurance amid pand...,amid bad public health crisis century devastat...,0
802,lz7cve,sandersforpresident,bernie also know mr struggle continue,use cooky websites number purpose include anal...,0
803,jvx3os,sandersforpresident,medicare backer safe democratic district trump...,vote still come democratic establishment set n...,0
804,indmby,sandersforpresident,bernie sander say country must get ready trump...,bernie sander sound alarm vermont senator warn...,0


In [26]:
comments_df

Unnamed: 0,comment id,submission id,subreddit,comment body,bias
0,gkzccbm,l6a0q7,liberal,hey republican genius ill bet also unaware tru...,0
1,gkzg91o,l6a0q7,liberal,deficit explode republican tax cut single one ...,0
2,gkzfown,l6a0q7,liberal,republican party fuck cancer country need finish,0
3,gkz73xz,l6a0q7,liberal,wish gold give title alone,0
4,gkzhm11,l6a0q7,liberal,politician really bug reaction predictable foo...,0
...,...,...,...,...,...
11949,galze45,jkuqjq,libertarian,produce tape internet many copy video show cop...,1
11950,gvtbfig,my5wni,libertarian,id rather see fed legalization clause maybe li...,1
11951,gj68for,kwqevc,libertarian,feel objectively wrong youre show everyone und...,1
11952,ginmbaz,ktge5g,conservative,back land line radio guess,1


### Remove empty comments

In [27]:
empty_comments = comments_df[comments_df['comment body'] == ''].index
print(len(empty_comments))

31


In [28]:
comments_df.drop(empty_comments, inplace=True)

### Vocab overlap

In [29]:
article_headline_vocab = []
article_body_vocab = []
comment_vocab = []

for i, row in submissions_df.iterrows():
    article_headline = row['article headline']
    article_body = row['article body']
    for word in article_headline.split():
        article_headline_vocab.append(word)
    for word in article_body.split():
        article_body_vocab.append(word)

for i, row in comments_df.iterrows():
    comment_body = row['comment body']
    for word in comment_body.split():
        comment_vocab.append(word)

print(len(article_headline_vocab))
print(len(article_body_vocab))
print(len(comment_vocab))

8814
275713
213610


In [30]:
article_headline_multiset = Counter(article_headline_vocab)
article_body_multiset = Counter(article_body_vocab)
comment_multiset = Counter(comment_vocab)

In [31]:
# Computing Jaccard distances

headline_body_intersect = list((article_headline_multiset & article_body_multiset).elements())
headline_comment_intersect = list((article_headline_multiset & comment_multiset).elements())
body_comment_intersect = list((article_body_multiset & comment_multiset).elements())

print(len(headline_body_intersect))
print(len(headline_comment_intersect))
print(len(body_comment_intersect))

headline_body_union = list((article_headline_multiset | article_body_multiset).elements())
headline_comment_union = list((article_headline_multiset | comment_multiset).elements())
body_comment_union = list((article_body_multiset | comment_multiset).elements())

print(len(headline_body_union))
print(len(headline_comment_union))
print(len(body_comment_union))

8597
8319
145517
275930
214105
343806


In [32]:
headline_body_jaccard = len(headline_body_intersect) / len(headline_body_union)
headline_comment_jaccard = len(headline_comment_intersect) / len(headline_comment_union)
body_comment_jaccard = len(body_comment_intersect) / len(body_comment_union)

print(f"Headline & article body Jaccard distance: {headline_body_jaccard}")
print(f"Headline and comment body Jaccard distance: {headline_comment_jaccard}")
print(f"Article body and comment body Jaccard distance: {body_comment_jaccard}")

Headline & article body Jaccard distance: 0.031156452723516834
Headline and comment body Jaccard distance: 0.03885476752060905
Article body and comment body Jaccard distance: 0.423253230019255


### Save data

In [33]:
submissions_df.to_csv('submissions_preprocessed.tsv', sep='\t', index=False)
comments_df.to_csv('comments_preprocessed.tsv', sep='\t', index=False)