In [25]:
# Loads the submission data collected by get_reddit_data.py

import os

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from pyspark.ml.feature import StopWordsRemover

In [26]:
import sys
from pyspark.sql import SparkSession, types, functions
from pyspark.sql.functions import concat, round, date_format, monotonically_increasing_id 
from pyspark.sql.types import StringType
from math import sqrt

# Check the interpreter first so an unsupported Python fails before a Spark
# session is started.
assert sys.version_info >= (3, 8) # make sure we have Python 3.8+

spark = SparkSession.builder.appName('reddit-submissions-loader').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

# spark.version is a string such as '3.2.1'. Comparing it to '3.2' as text is
# lexicographic ('3.10' < '3.2', '10.0' < '3.2'), so compare the numeric
# (major, minor) pair instead.
spark_major_minor = tuple(int(part) for part in spark.version.split('.')[:2])
assert spark_major_minor >= (3, 2) # make sure we have Spark 3.2+

In [27]:
submissions_dir = 'reddit-data/submissions'

# Explicit schema for the submission JSON files; every field is nullable
# (the default), and rows with missing values are filtered out later.
subs_schema = (
    types.StructType()
    .add('id', types.StringType())
    .add('subreddit', types.StringType())
    .add('title', types.StringType())
    .add('selftext', types.StringType())
    .add('score', types.LongType())
    .add('upvote_ratio', types.FloatType())
    .add('num_comments', types.LongType())
    .add('date_created', types.TimestampType())
)

submissions_df = spark.read.schema(subs_schema).json(submissions_dir)

In [28]:
# Some submissions have a null score, upvote_ratio, title or body, all of
# which are needed by the later steps, so keep only the rows in which every
# column is populated.
submissions_df = submissions_df.na.drop()

In [29]:
# Replace the full creation timestamp with just its time of day (HH:mm:ss).

submissions_df = (
    submissions_df
    .withColumn('time_created', date_format('date_created', 'HH:mm:ss'))
    .drop('date_created')
)

In [30]:
# NLTK resources used by this notebook: the VADER lexicon for scoring, plus
# the tokenizer, stop-word and WordNet data.
nltk_resources = ['stopwords', 'vader_lexicon', 'punkt', 'wordnet']
nltk.download(nltk_resources)

# One shared VADER analyzer, referenced by sentiment_analysis below.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package stopwords to /home/gli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/gli/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/gli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
def sentiment_analysis(text):
    """Return the VADER compound sentiment score for ``text``.

    The score is computed on the raw, unmodified text. An earlier version
    also tokenized, stop-word-filtered and lemmatized the text on every call,
    but the result of that pipeline was never passed to the analyzer, so it
    only added per-row cost; it has been removed and the scores are unchanged.
    NOTE(review): VADER is documented as using capitalization, punctuation
    and negation words as signals, so scoring raw text looks intentional --
    confirm before reintroducing any preprocessing.

    Parameters
    ----------
    text : str or None
        The text to score. Spark passes SQL NULL to a Python UDF as None.

    Returns
    -------
    float or None
        The ``'compound'`` entry of ``analyzer.polarity_scores(text)``, or
        None when ``text`` is None so a null input yields a null score
        instead of an exception inside the UDF.
    """
    if text is None:
        return None

    scores = analyzer.polarity_scores(text)
    return scores['compound']

# Expose sentiment_analysis to Spark as a column function returning a float.
sentiment_udf = functions.udf(sentiment_analysis, types.FloatType())

In [32]:
# Score the title and body together. Join them with a space: a plain concat()
# glues the last word of the title onto the first word of the body
# ("Great news" + "I think" -> "Great newsI think"), which corrupts both
# tokens for the sentiment lexicon.
title_and_body = functions.concat_ws(' ', submissions_df['title'], submissions_df['selftext'])
submissions_df = submissions_df.withColumn('sentiment_score', sentiment_udf(title_and_body))

In [33]:
# Recover the raw upvote and downvote counts from score and upvote_ratio.
# With s = score = u - d and r = upvote_ratio = u / (u + d), solving for d
# gives d = s * (1 - r) / (2r - 1); then u = s + d.
# NOTE(review): r == 0.5 makes the denominator zero; presumably the trailing
# dropna() is what removes those rows -- confirm under the Spark config in use.

downs_estimate = functions.round(
    functions.col('score') * (1 - functions.col('upvote_ratio'))
    / (2 * functions.col('upvote_ratio') - 1)
)
submissions_df = (
    submissions_df
    .withColumn('downs', downs_estimate)
    .withColumn('ups', functions.col('score') + functions.col('downs'))
    .dropna()
)

In [34]:
def confidence(ups, downs, z=1.0):
    """Lower bound of the Wilson score interval for the upvote proportion.

    Adapted from https://stackoverflow.com/a/10029645, which in turn adapted
    https://medium.com/hacking-and-gonzo/how-reddit-ranking-algorithms-work-ef111e33d0d9

    The Wilson score interval takes into account both the ratio of ups to
    downs and the total number of votes, so a post with a single upvote
    (0.5 at z=1) ranks below a post with 10 upvotes and 2 downvotes (~0.70),
    even though the former has the higher raw ratio.

    Parameters
    ----------
    ups, downs : number or None
        Upvote and downvote counts. In this notebook they are reconstructed
        from score and upvote_ratio, so they can come out negative when those
        two fields disagree.
    z : float, optional
        Normal quantile that sets the confidence level (1.44 = 85%,
        1.96 = 95%). Defaults to 1.0, the value this function always used.

    Returns
    -------
    float or None
        The lower bound in [0, 1]; 0.0 when there are no usable votes (total
        of zero, or any negative count); None when either input is None.
    """
    if ups is None or downs is None:
        return None

    n = ups + downs

    # Negative counts would put phat outside [0, 1] and make the sqrt argument
    # negative, raising ValueError inside the UDF; treat them as "no votes".
    if n <= 0 or ups < 0 or downs < 0:
        return 0.0

    z_sq = z * z
    phat = float(ups) / n
    centre = phat + z_sq / (2 * n)
    margin = z * sqrt((phat * (1 - phat) + z_sq / (4 * n)) / n)
    return (centre - margin) / (1 + z_sq / n)
    
# Expose confidence to Spark as a column function returning a float.
confidence_udf = functions.udf(confidence, types.FloatType())

In [35]:
# Popularity score: confidence() applied to the reconstructed vote counts.
submissions_df = submissions_df.withColumn(
    'pop_score',
    confidence_udf(functions.col('ups'), functions.col('downs')),
)

In [36]:
# Flag a submission as controversial when fewer than 80% of its votes are upvotes.
submissions_df = submissions_df.withColumn(
    'controversial',
    functions.col('upvote_ratio') < 0.8,
)

In [37]:
# Keep only the columns that go into the cleaned output.
output_columns = [
    'id',
    'subreddit',
    'title',
    'time_created',
    'num_comments',
    'sentiment_score',
    'pop_score',
    'controversial',
]
submissions_df = submissions_df.select(*output_columns)


In [38]:
# Write the cleaned submissions as CSV with a header row, replacing any previous output.
submissions_df.write.mode("overwrite").option("header", True).csv("submissions_cleaned")

                                                                                