In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re

In [12]:
# Build (or reuse) the SparkSession used throughout this notebook.
# Executor and driver memory are set to 4g each, with 8 cores per executor.
spark = (
    SparkSession.builder
    .appName("Reddit Sentiment Analysis")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "8")
    .getOrCreate()
)

# Handle to the SparkContext behind the session
sc = spark.sparkContext

In [13]:
# Location of the input Parquet file (relative to the notebook's working directory)
file_path = "data/the-reddit-covid-comments-sample.parquet"

# Load the Parquet file into a Spark DataFrame; `df` is the base frame the later cells build on
df = spark.read.parquet(file_path)


In [14]:
# Replace the dotted "subreddit.*" column names with underscore versions
df = (
    df.withColumnRenamed("subreddit.id", "subreddit_id")
      .withColumnRenamed("subreddit.name", "subreddit_name")
      .withColumnRenamed("subreddit.nsfw", "subreddit_nsfw")
)

# Average of the existing sentiment column, pulled back to the driver as a scalar
mean_sentiment = df.agg(F.avg("sentiment")).head()[0]

# Impute missing sentiment values with that average.
# The result is kept in a separate frame: `df` itself still contains the nulls.
df_filled_mean = df.na.fill({'sentiment': mean_sentiment})

# Inspect the imputed frame: column types first, then a 5-row preview
df_filled_mean.printSchema()
df_filled_mean.show(5)


root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name: string (nullable = true)
 |-- subreddit_nsfw: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- permalink: string (nullable = true)
 |-- body: string (nullable = true)
 |-- sentiment: double (nullable = false)
 |-- score: integer (nullable = true)

+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|   type|     id|subreddit_id|subreddit_name|subreddit_nsfw|created_utc|           permalink|                body|sentiment|score|
+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|comment|hi1vsag|       2riyy|          nova|         false| 1635206399|https://old.reddi...|When you schedule...|      0.0|    2|
|comment|hi1vs7i|       2qhov|     vancouver|         false| 16

In [15]:
# Row count of `df` (a Spark action; the result is displayed as the cell output)
df.count()

1777747

### Sentiment score calculation - VADER

In [16]:
# Initialize the VADER sentiment analyzer.
# A single notebook-level instance is created here; calculate_vader_sentiment
# reads it as a global when scoring each comment.
analyzer = SentimentIntensityAnalyzer()

In [17]:
# Define a function to apply VADER sentiment analysis
def calculate_vader_sentiment(text):
    """Score a piece of text with the notebook-level VADER ``analyzer``.

    Args:
        text: The text to score, or None when the comment body is missing.

    Returns:
        The 'compound' entry of ``analyzer.polarity_scores(text)`` as a float,
        or None when ``text`` is None so that a null input row yields a null
        score instead of failing the whole Spark task.
    """
    # Spark passes SQL NULL to Python UDFs as None; the analyzer cannot score it.
    if text is None:
        return None
    scores = analyzer.polarity_scores(text)
    return float(scores['compound'])

In [18]:
def clean_comment(text):
    """Normalize a raw comment body before sentiment scoring.

    The text is lowercased, then URLs, @mentions, '#' signs and all remaining
    punctuation are removed, and finally runs of whitespace are collapsed to a
    single space with leading/trailing whitespace trimmed.

    Args:
        text: Raw comment text, or None when the body is missing.

    Returns:
        The cleaned string, or None when ``text`` is None (a null body stays
        null instead of raising AttributeError inside the UDF).
    """
    if text is None:
        return None
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove @mentions (the "@" plus the following word) and bare "#" signs
    text = re.sub(r'@\w+|#', '', text)
    # Remove punctuation and other special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace last, so gaps left by the removals above are cleaned up too
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Wrap clean_comment as a Spark UDF that returns a string column.
# The return type is given as a DDL type string rather than F.StringType():
# StringType is only reachable through pyspark.sql.functions as an incidental
# internal import, not as part of that module's public API.
clean_text_udf = F.udf(clean_comment, "string")

In [19]:
# Apply the UDF to clean the comments
df = df.withColumn('body_clean', clean_text_udf(F.col('body')))

# Preview raw vs. cleaned comments.
# Rows and cell width are capped: untruncated comment bodies can be very long
# and make the cell output unreadably wide.
df.select('body', 'body_clean').show(5, truncate=100)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# Register UDF to apply VADER sentiment.
# The return type must be stated explicitly: F.udf defaults to StringType, which
# would store the numeric scores as strings in the new column (and in the Parquet
# output written from it).
vader_sentiment_udf = F.udf(calculate_vader_sentiment, "double")

# Apply VADER sentiment to each row and create a new column
df = df.withColumn("sentiment_vader", vader_sentiment_udf(F.col("body_clean")))

df.show(10)

+-------+-------+------------+------------------+--------------+-----------+--------------------+--------------------+---------+-----+--------------------+---------------+
|   type|     id|subreddit_id|    subreddit_name|subreddit_nsfw|created_utc|           permalink|                body|sentiment|score|          body_clean|sentiment_vader|
+-------+-------+------------+------------------+--------------+-----------+--------------------+--------------------+---------+-----+--------------------+---------------+
|comment|hi1vsag|       2riyy|              nova|         false| 1635206399|https://old.reddi...|When you schedule...|      0.0|    2|when you schedule...|            0.0|
|comment|hi1vs7i|       2qhov|         vancouver|         false| 1635206397|https://old.reddi...|Didn't stop price...|   0.1887|   32|didnt stop prices...|         0.1887|
|comment|hi1vs5n|       2qwzb|          pregnant|         false| 1635206397|https://old.reddi...|I’m just waiting ...|    0.672|    1|im jus

### Write results

In [21]:
# Destination for the scored comments
sentiment_calculation_path = "data/results/sentiment_calculations.parquet"

# Persist `df` as Parquet, replacing any previous output at that path
df.write.parquet(sentiment_calculation_path, mode="overwrite")


24/10/30 09:55:12 WARN DAGScheduler: Broadcasting large task binary with size 1064.9 KiB
                                                                                

### Read results

In [22]:
# File path for the Parquet file holding the persisted sentiment results
file_path = "data/results/sentiment_calculations.parquet"

# Read the Parquet file into a DataFrame
df_result = spark.read.parquet(file_path)

# Inspect the frame that was just read back (not the in-memory `df`), so this
# cell actually verifies what was written and does not re-run the UDF pipeline.
df_result.show(10)
df_result.count()

+-------+-------+------------+------------------+--------------+-----------+--------------------+--------------------+---------+-----+--------------------+---------------+
|   type|     id|subreddit_id|    subreddit_name|subreddit_nsfw|created_utc|           permalink|                body|sentiment|score|          body_clean|sentiment_vader|
+-------+-------+------------+------------------+--------------+-----------+--------------------+--------------------+---------+-----+--------------------+---------------+
|comment|hi1vsag|       2riyy|              nova|         false| 1635206399|https://old.reddi...|When you schedule...|      0.0|    2|when you schedule...|            0.0|
|comment|hi1vs7i|       2qhov|         vancouver|         false| 1635206397|https://old.reddi...|Didn't stop price...|   0.1887|   32|didnt stop prices...|         0.1887|
|comment|hi1vs5n|       2qwzb|          pregnant|         false| 1635206397|https://old.reddi...|I’m just waiting ...|    0.672|    1|im jus

1777747