# Sentiment Analysis

In [7]:
# Load libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import ast, datetime, json, os, re, zstandard
from nltk.corpus import stopwords
from textblob import TextBlob
from zst_processor import read_lines_zst, write_line_zst


In [None]:
subreddits = ["Conservative", "progressive",
              "democrats", "Republican",
              "NeutralPolitics", "PoliticalDiscussion", "politics"]

## Emotions using NRC Lexicon

### Clean Up NRC Emotion Dictionary

In [None]:
# Load NRC emotion lexicon from NRC-Emotion.txt
nrc = {}
with open('NRC-Emotion.txt', 'r') as f:
    for line in f:
        words, emotion, value = line.strip().split('\t')

        # Replace - with spaces
        words = words.replace('-', ' ').replace(",", ' ').replace("  ", " ")

        # Break words into a list of words
        words = words.split(' ')

        for word in words:
            if word not in nrc.keys():
                nrc[word] = {}
            nrc[word][emotion] = int(value)

# Save nrc to a json file as lexicon_clean.json
with open('lexicon_clean.json', 'w') as f:
    json.dump(nrc, f)

### Calculate comments emotions

In [None]:
# Function to add the sentiment of the comments in a submission
def add_sentiment_comments(input_comments: list, output_comments: list, lexicon: dict) ->  None:
    """Add the sentiment of the comments in a submission"""

    # Loop through input paths
    for in_comment, out_comment in zip(input_comments, output_comments):

        # Create the zst handler
        handle = zstandard.ZstdCompressor().stream_writer(open(out_comment, 'wb'))

        # Save the data to zst file
        with open(out_comment, mode="w", newline="") as file:

            for line, file_bytes_processed in read_lines_zst(in_comment):
                obj = json.loads(line)

                # Skip if body doesn't exist
                if 'body' not in obj:
                    continue

                # Get body of comment
                body = obj['body']

                # Skip if body is deleted or removed
                if (body == 'deleted') or (body == 'removed'):
                    continue

                # Skip if score or ups+downs is too low
                if 'score' in obj:
                    try: score = int(obj['score'])
                    except: score = 0
                    if score <= 5: continue
                else:
                    ups = obj.get('ups',0)
                    try: ups = int(ups)
                    except: ups = 0
                    downs = obj.get('downs',0)
                    try: downs = int(downs)
                    except: downs = 0
                    if (ups-downs) <= 5: continue

                # Get polarity and subjectivity of body
                polarity = TextBlob(body).sentiment.polarity
                subjectivity = TextBlob(body).sentiment.subjectivity

                # Calculate emotions in the comment using NRC emotion lexicon
                emotions = {'fear': 0, 'anger': 0, 'anticip': 0, 'trust': 0,
                            'surprise': 0, 'sadness': 0, 'joy': 0, 'disgust': 0,
                            'positive': 0, 'negative': 0}
                for word in body.split():
                    if word in nrc_lexicon.keys():
                        for emotion in nrc_lexicon[word].keys():
                            emotions[emotion] = emotions.get(emotion, 0) + nrc_lexicon[word][emotion]

                # Add sentiment and emotion to object
                obj['polarity'] = polarity
                obj['subjectivity'] = subjectivity
                obj['emotions'] = emotions

                # Remove body
                del obj['body']

                # Write the data to the zst file
                new_line = json.dumps(obj)

                write_line_zst(handle, new_line)
    
    return

In [None]:
# Load nrc_lexicon
nrc_lexicon = json.load(open('lexicon_clean.json', 'r'))

In [None]:
# Create input and output paths
input_comments = [f"data/{s}/{s}_comments_clean.zst" for s in subreddits]
output_comments = [f"data/sentiment/{s}_comments_sentiment.zst" for s in subreddits]

# Add sentiment to comments
add_sentiment_comments(input_comments, output_comments, nrc_lexicon)


## Calculate Submissions Polarity

Assess the homogeneity of the discussions within each subreddit.

* Measure the sentiment (positive, negative, or neutral) of (most engaged) comments on a specific post in each subreddit.
Then, aggregate those sentiments, using upvotes and downvotes as weights, to get a sentiment score for each post. Finally, aggregate the sentiment scores of all posts in a subreddit to get a sentiment score for each subreddit on a specific topic.
(consider using sentiment entropy)

In [None]:
# Calculate average sentiment towards each submission 
def calculate_sentiment_submission(input_comments: list, output_comments: list) -> None:
    """Calculate average sentiment towards each submission"""

    # Loop through input paths
    for in_comment, out_comment in zip(input_comments, output_comments):

        # Create the zst handler
        handle = zstandard.ZstdCompressor().stream_writer(open(out_comment, 'wb'))

        # Save the data to zst file
        with open(out_comment, mode="w", newline="") as file:

            submissions = {}

            for line, file_bytes_processed in read_lines_zst(in_comment):
                obj = json.loads(line)

                # Skip if body doesn't exist
                if 'sentiment' not in obj or 'link_id' not in obj:
                    continue

                # Calculate score if ups and downs exist
                if obj.get('ups', '0') != '0' or obj.get('downs', '0') != '0':

                    # Give it a higher weight the more votes it has
                    votes = obj['ups'] + obj['downs']
                    if votes == 0: interactions = 1
                    sentiment = obj['sentiment'] * votes
                    interactions = obj['ups'] + abs(obj['downs'])
                    
                # Otherwise, use the score
                else:
                    interactions = max(obj['score'], 1)
                    sentiment = obj['sentiment'] * interactions
                
                # Add sentiment to object
                if obj['link_id'] not in submissions:
                    submissions[obj['link_id']] = [sentiment, interactions]
                else:
                    submissions[obj['link_id']][0] += sentiment
                    submissions[obj['link_id']][1] += interactions

            # Save the data to zst file
            for link_id, (sentiment, interactions) in submissions.items():
                obj = {'link_id': link_id,
                       'sentiment': sentiment,
                       'interactions': interactions}
                new_line = json.dumps(obj)
                write_line_zst(handle, new_line)

    return

In [None]:
# Create input and output paths
output_overall_sentiment = [f"analysis/sentiment/{s}_submissions_overall_sentiment.zst" for s in subreddits]

In [None]:
calculate_sentiment_submission(output_comments, output_overall_sentiment)

## Get Comments Polarity

In [16]:
input_comments_sentiment = [f"data/{s}/{s}_comments_sentiment.zst" for s in subreddits if s != 'politics']
output_submission_sentiment = [f"analysis/sentiment/{s}_submissions_sentiment.zst" for s in subreddits if s != 'politics']

In [94]:
# Function to calculate average emotion towards each submission
def calculate_emotion_submission(input_comments_sentiment: list, output_submission_sentiment: list) -> None:
    
    emotions = {'fear': 0, 'anger': 0, 'anticip': 0, 'trust': 0,
            'surprise': 0, 'sadness': 0, 'joy': 0, 'disgust': 0,
            'positive': 0, 'negative': 0}

    submissions = {}

    # Loop through input paths
    for in_comment, out_submission in zip(input_comments_sentiment, output_submission_sentiment):

        # Create the zst handler
        handle = zstandard.ZstdCompressor().stream_writer(open(out_submission, 'wb'))

        # Save the data to zst file
        with open(out_submission, mode="w", newline="") as file:

            for line, file_bytes_processed in read_lines_zst(in_comment):
                obj = json.loads(line)

                # Skip if body doesn't exist
                if 'link_id' not in obj:
                    continue
                
                # Calculate score if ups and downs exist
                if obj.get('ups', '') == '': ups = 0
                else: ups = int(obj.get('ups', ''))
                if obj.get('downs', '') == '': downs = 0
                else: downs = int(obj.get('downs', ''))

                if ups != 0 or downs != 0:
                    
                    interactions = ups + downs
                    if interactions == 0: interactions = 1
                    
                # Otherwise, use the score
                else:
                    interactions = max(obj['score'], 1)
                
                # Add polarity, subjectivity, and emotions to object
                if obj['link_id'] not in submissions:
                    submissions[obj['link_id']] = {'polarity': 0,
                                                   'subjectivity': 0,
                                                   'emotions': emotions.copy(),
                                                   'interactions': 0}
                
                # Calculate average polarity, subjectivity, and emotions
                submissions[obj['link_id']]['polarity'] += obj['polaritiy'] * interactions
                submissions[obj['link_id']]['subjectivity'] += obj['subjectivity'] * interactions

                for e in emotions:
                    submissions[obj['link_id']]['emotions'][e] += obj['emotions'][e] * interactions
                
                # Add interactions
                submissions[obj['link_id']]['interactions'] += interactions

        # Save the data to zst file, including link_id
        for link_id, obj in submissions.items():
            obj['link_id'] = link_id
            new_line = json.dumps(obj)
            write_line_zst(handle, new_line)

    return

In [95]:
calculate_emotion_submission(input_comments_sentiment, output_submission_sentiment)

In [59]:
output_submissions_classified = [f"data/sentiment/{s}_submissions_classified.zst" for s in subreddits if s != 'politics']

In [76]:
# For list of tuples, find tuple with highest value in second element and return first element
def find_max_tuple(tuples: list) -> tuple:
    """Find tuple with highest value in second element"""

    # Initialize max tuple
    max_tuple = ('', 0)

    # Loop through tuples
    for t in tuples:

        # Update max tuple if second element is greater than current max
        if t[1] > max_tuple[1]:
            max_tuple = t

    return max_tuple

In [85]:
# Load submissions classified and get topic with highest probability
submissions_classified = {s: {} for s in subreddits if s != 'politics'}

for sub in subreddits[:-1]:

    s_class = f"data/sentiment/{sub}_submissions_classified.zst"

    # Load topic distribution from each submission classified
    for line, file_bytes_processed in read_lines_zst(s_class):

            # Convert the line to a json object
            obj = json.loads(line)
    
            # Get link_id
            link_id = obj['id']

            # Find topic with highest probability
            topic_dist = ast.literal_eval(obj['topic_dist'])
            topic = find_max_tuple(topic_dist)[0]

            # Add to dictionary
            submissions_classified[sub][link_id] = {'topic': topic}


In [107]:
# Load submissions sentiment and get average sentiment
submissions_sentiment = {s: {} for s in subreddits if s != 'politics'}

for sub in subreddits[:-1]:
    
    s_sent = f"analysis/sentiment/{sub}_submissions_sentiment.zst"

    # Load topic distribution from each submission classified
    for line, file_bytes_processed in read_lines_zst(s_sent):

            # Convert the line to a json object
            obj = json.loads(line)

            # Get link_id
            link_id = obj['link_id'].split('_')[-1]

            # Get emotions, polarity, and subjectivity
            emotions = obj['emotions']
            polarity = obj['polarity']
            subjectivity = obj['subjectivity']

            # Add to dictionary
            submissions_sentiment[sub][link_id] = {'emotions': emotions,
                                                    'polarity': polarity,
                                                    'subjectivity': subjectivity}

In [116]:
# Iterate over all submissions_classified in each subreddit, and calcuate average sentiment and emotions for each topic
submissions_classified_sentiment = {s: {} for s in subreddits if s != 'politics'}
count = 0
for sub in subreddits[:-1]:
    count += 1
    # Loop through submissions classified
    for link_id, obj in submissions_classified[sub].items():

        count += 1
        if link_id not in submissions_sentiment[sub]:
            continue

        # Get topic
        topic = obj['topic']

        # Get sentiment and emotions from submissions sentiment
        sentiment = submissions_sentiment[sub][link_id]
        emotions = sentiment['emotions']
        polarity = sentiment['polarity']
        subjectivity = sentiment['subjectivity']

        # Add to dictionary
        if topic not in submissions_classified_sentiment[sub]:
            submissions_classified_sentiment[sub][topic] = {'emotions': emotions,
                                                            'polarity': polarity,
                                                            'subjectivity': subjectivity,
                                                            'count': 1}
        else:
            submissions_classified_sentiment[sub][topic]['emotions'] = {k: v + submissions_classified_sentiment[sub][topic]['emotions'][k] for k, v in emotions.items()}
            submissions_classified_sentiment[sub][topic]['polarity'] += polarity
            submissions_classified_sentiment[sub][topic]['subjectivity'] += subjectivity
            submissions_classified_sentiment[sub][topic]['count'] += 1

In [121]:
# For each topic, plot the positive and negative emotions for each subreddit
topics = {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}}
for sub in subreddits[:-1]:

    for t in range(7):
        positive = submissions_classified_sentiment[sub][t]['emotions']['positive']
        negative = submissions_classified_sentiment[sub][t]['emotions']['negative']
        count = submissions_classified_sentiment[sub][t]['count']
        topics[t][sub] = {'positive': positive/count, 'negative': negative/count}


## Create Plots

In [170]:
topic_mapping = {0: 'Judicial System', 1: 'Political Figures & Investigations', 2: 'International Affairs', 3: 'State Government', 4: 'Federal Government', 5: 'Social Issues', 6: 'Other'}

In [177]:
# Remove from the keys the subreddits that are not needed (politics)
topics_renamed = {k: {k2: v2 for k2, v2 in v.items() if k2 not in ['politics']} for k, v in topics.items()}

# Rename they keys
rename_dict = {'Conservative': 'Right-Leaning', 'Republican': 'Right-Leaning',
               'democrats': 'Left-Leaning', 'progressive': 'Left-Leaning',
               'NeutralPolitics': 'Neutral', 'PoliticalDiscussion': 'Neutral'}
topics_renamed = {k: {rename_dict[k2]: v2 for k2, v2 in v.items()} for k, v in topics_renamed.items()}

In [217]:
# Plot each topic as a bar chart
for t in range(6):

    values = topics_renamed[t]
    df = pd.DataFrame(values).T
    normalization_factor = df["positive"] + df["negative"]
    df["overall"] = (df["positive"] - df["negative"]) / normalization_factor
    
    # Plot the bar chart using seaborn
    plt.figure(figsize=(12,8))

    # Set the colors
    colors = ["#2ca4c4" if val >= 0 else "#e8f6b1" for val in df["overall"]]

    # Plot positive and negative emotions as bar chart horizontally using seaborn
    sns.barplot(y=df.index, x=df["overall"], palette=colors)

    # Set the title and axis labels
    title = f"{topic_mapping[t]}"
    plt.title(title)
    plt.ylabel("Polical Leaning")
    plt.xlabel("Polarity")

    # Make font size larger and plot smaller
    plt.rc('font', size=20)
    plt.rc('axes', titlesize=20)

    # Find max absolute value in 'overall' column
    max_abs = max(abs(df['overall']))
    x_lim = 1.05 * max_abs

    # Center the x-axis
    plt.xlim(-x_lim, x_lim)

    # Add line on 0
    plt.axvline(x=0, color='gray')

    # Save the image
    plt.tight_layout(pad=1)
    plt.savefig(f"analysis/sentiment/plots/topic_{t}_positive_negative_emotions.png")
    #plt.show()
    plt.close()


State Government
Social Issues


In [47]:
for t in range(6):

    values = topics_renamed[t]
    df = pd.DataFrame(values).T
    normalization_factor = df["positive"] + df["negative"]
    df["overall"] = (df["positive"] - df["negative"]) / normalization_factor

    print(topic_mapping[t])
    print(df['overall'])

Judicial System
Right-Leaning   -0.002790
Left-Leaning     0.026752
Neutral          0.023971
Name: overall, dtype: float64
Political Figures & Investigations
Right-Leaning    0.005511
Left-Leaning     0.039028
Neutral         -0.001507
Name: overall, dtype: float64
International Affairs
Right-Leaning    0.027691
Left-Leaning     0.012554
Neutral          0.134846
Name: overall, dtype: float64
State Government
Right-Leaning    0.028444
Left-Leaning    -0.097354
Neutral          0.003329
Name: overall, dtype: float64
Federal Government
Right-Leaning    0.009953
Left-Leaning     0.025542
Neutral          0.008952
Name: overall, dtype: float64
Social Issues
Right-Leaning   -0.143581
Left-Leaning     0.056836
Neutral         -0.005967
Name: overall, dtype: float64
