In [18]:
# Loads the data produced by get_reddit_data.py, scores its sentiment, and saves the result as CSV.

import os
import pandas as pd
import gzip
import json
from textblob import TextBlob

In [19]:
def load_json_files(directory):
    """Read every ``.json.gz`` file in ``directory`` into a single DataFrame.

    Each file is opened as gzip-compressed UTF-8 text and parsed as JSON
    lines: one JSON document per line, one DataFrame row per document.
    Files whose name does not end in ``.json.gz`` are ignored.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A ``pandas.DataFrame`` with one row per parsed line. Rows are ordered
        by file name, then by line order within each file. The frame is empty
        when no matching file (or no non-blank line) is found.

    Raises:
        OSError: If ``directory`` or one of its files cannot be read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the result is the same on every run and every machine.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json.gz"):
            continue
        path = os.path.join(directory, filename)
        with gzip.open(path, 'rt', encoding='utf-8') as f:
            for line in f:
                # A blank line (e.g. a stray newline at the end of the file)
                # is not a record; json.loads would raise on it.
                if not line.strip():
                    continue
                records.append(json.loads(line))
    return pd.DataFrame(records)


In [20]:

# Directories holding the compressed comment and submission files
# (relative paths, resolved against the current working directory).
comments_dir, submissions_dir = 'reddit-data/comments', 'reddit-data/submissions'

# Load the comments first, then the submissions, one DataFrame per directory.
comments_df, submissions_df = [
    load_json_files(source_dir) for source_dir in (comments_dir, submissions_dir)
]


# Optional column pruning, currently disabled: uncomment to keep only the
# columns listed below.
# comments_df = comments_df[['body', 'score', 'subreddit', 'link_id', 'id', 'subreddit_id']]
# submissions_df = submissions_df[['num_comments', 'score', 'id', 'selftext', 'title']]


In [21]:
# Some comments don't have a link_id; drop them, since they cannot be linked
# back to the post on which they were made.
#
# Some submissions have a null score, title or body (selftext), all of which
# are needed for the sentiment analysis; drop those rows as well.

comments_df = comments_df.dropna(axis=0, how='any', subset=['link_id'])
submissions_df = submissions_df.dropna(
    axis=0,
    how='any',
    subset=['score', 'selftext', 'title'],
)


In [22]:
def calculate_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The raw value of ``TextBlob(text).sentiment.polarity`` is returned
    unchanged. It is NOT rounded or bucketed into -1 / 0 / 1; an earlier
    bucketing with thresholds at +/-0.5 was disabled in favour of the raw
    polarity.

    Args:
        text: The text to score. Expected to be a ``str``.

    Returns:
        The polarity as a float (TextBlob documents it as lying in
        [-1.0, 1.0]), or NaN when ``text`` is not a string.
    """
    # TextBlob raises TypeError for non-string input. Missing values reach
    # this function as None/NaN when it is applied to a DataFrame column, so
    # report "no sentiment" for them instead of aborting the whole apply.
    if not isinstance(text, str):
        return float('nan')

    return TextBlob(text).sentiment.polarity


In [23]:
# Take an explicit copy first so that adding a column below does not raise
# pandas' SettingWithCopyWarning.
comments_df = comments_df.copy()
# Score the body of every comment and store the result in a 'sentiment' column.
comment_polarity = comments_df['body'].apply(calculate_sentiment)
comments_df.loc[:, 'sentiment'] = comment_polarity

In [24]:
# Take an explicit copy first so that adding a column below does not raise
# pandas' SettingWithCopyWarning.
submissions_df = submissions_df.copy()
# Score title and body together. They are joined with a space: concatenating
# them directly would fuse the last word of the title and the first word of
# the body into a single token before the text is analysed.
submissions_df.loc[:, 'sentiment'] = (
    submissions_df['title'] + ' ' + submissions_df['selftext']
).apply(calculate_sentiment)

In [25]:
# Write both scored frames to CSV (without the index column) so that another
# file can pick them up.
for csv_path, frame in (
    ("comments.csv", comments_df),
    ("submissions.csv", submissions_df),
):
    frame.to_csv(csv_path, index=False)