In [19]:
# Loads the Reddit data collected by get_reddit_data.py and scores its sentiment.

import os
import pandas as pd
import gzip
import json
from textblob import TextBlob

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [20]:
# Function that reads the data from the compressed files and returns a DataFrame holding it.
def load_json_files(directory):
    """Load every ``*.json.gz`` file in ``directory`` into one DataFrame.

    Each matching file is opened as gzip-compressed UTF-8 text and parsed as
    one JSON document per line. Files are visited in sorted filename order so
    the row order of the result is reproducible, and blank lines are skipped
    instead of aborting the whole load.

    Parameters
    ----------
    directory : str or path-like
        Directory to scan. Entries not ending in ``.json.gz`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Built from the list of parsed lines; empty when nothing was loaded.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json.gz"):
            continue
        with gzip.open(os.path.join(directory, filename), 'rt', encoding='utf-8') as f:
            for line in f:
                # A stray empty line would otherwise make json.loads raise.
                if not line.strip():
                    continue
                records.append(json.loads(line))
    return pd.DataFrame(records)


In [21]:

# Directories holding the compressed comment and submission dumps.
comments_dir, submissions_dir = 'reddit-data/comments', 'reddit-data/submissions'

# Read each directory into its own DataFrame (comments first, then submissions).
comments_df, submissions_df = map(load_json_files, (comments_dir, submissions_dir))


# keep the necessary columns
# comments_df = comments_df[['body', 'score', 'subreddit', 'link_id', 'id', 'subreddit_id']]
# submissions_df = submissions_df[['num_comments', 'score', 'id', 'selftext', 'title']]


In [22]:
# Some comments don't have a link_id, so we remove them since we won't be able
# to link them back to the post on which they commented. Comments without a
# body are removed as well: the text processing applied to 'body' calls string
# methods on it and would fail on a missing value.
#
# Some submissions also have a null score, title or selftext, all of which are
# needed for the sentiment analysis.

comments_df = comments_df.dropna(subset=['link_id', 'body'])
submissions_df = submissions_df.dropna(subset=['score', 'selftext', 'title'])


In [23]:
def calculate_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The raw polarity value is returned unchanged; it is not rounded or
    bucketed into -1 / 0 / 1.
    """
    return TextBlob(text).sentiment.polarity


In [24]:
# Fetch the NLTK resources this notebook relies on (presumably backing the
# stop-word list, VADER analyzer, tokenizer and lemmatizer imported above).
# Packages that are already installed are reported as up-to-date.
nltk.download(['stopwords', 'vader_lexicon', 'punkt', 'wordnet'])

# initialize NLTK sentiment analyzer
# (module-level instance; get_nltk_sentiment reads it as a global)
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package stopwords to /home/gli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/gli/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/gli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# create preprocess_text function
# Cache for the English stop-word set, filled on first use so the corpus is
# only read after it has been downloaded.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalise ``text`` before sentiment scoring.

    The text is lower-cased and tokenized, English stop words are removed,
    the remaining tokens are lemmatized, and the result is joined back into
    a single space-separated string.

    Parameters
    ----------
    text : str
        Raw text. Must be a string: ``.lower()`` is called on it, so a
        missing value (NaN/None) raises ``AttributeError``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words. The stop-word list used to be re-evaluated for every
    # token and scanned linearly; a cached set makes each lookup O(1).
    stop_words = _english_stopwords()
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    return ' '.join(lemmatized_tokens)

In [26]:
# create get_nltk_sentiment function
def get_nltk_sentiment(text):
    """Score ``text`` with the module-level ``analyzer`` and return its 'compound' value."""
    return analyzer.polarity_scores(text)['compound']

In [27]:
# Store the preprocessed text of every comment and submission.
comments_df['body_tokenized'] = comments_df['body'].apply(preprocess_text)
# Join title and selftext with a space: plain concatenation fuses the last
# word of the title with the first word of the selftext into one bogus token.
submissions_df['body_tokenized'] = (
    submissions_df['title'] + ' ' + submissions_df['selftext']
).apply(preprocess_text)

In [28]:
# assign() returns a new frame with the extra column, so the rebinding below
# gives an independent copy (no SettingWithCopyWarning) carrying 'sentiment'.
comments_df = comments_df.assign(
    sentiment=comments_df['body'].apply(calculate_sentiment)
)

In [29]:
submissions_df = submissions_df.copy()  # explicit copy avoids SettingWithCopyWarning
# Score title and selftext together. They are joined with a space because
# plain concatenation glues the title's last word onto the selftext's first.
submissions_df.loc[:, 'sentiment'] = (
    submissions_df['title'] + ' ' + submissions_df['selftext']
).apply(calculate_sentiment)

In [30]:
# Add the NLTK score of the preprocessed text to both frames.
for scored_df in (comments_df, submissions_df):
    scored_df['sentiment_nltk'] = scored_df['body_tokenized'].apply(get_nltk_sentiment)

In [31]:
#save to csv to use in another file 
# Persist both scored frames (without the index column) for use in another file.
for output_frame, output_csv in (
    (comments_df, "comments.csv"),
    (submissions_df, "submissions.csv"),
):
    output_frame.to_csv(output_csv, index=False)