In [15]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import sys


In [16]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("Reddit Sentiment Analysis") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "8") \
    .config("spark.executor.extraPythonPackages", "vaderSentiment") \
    .config("spark.driver.extraPythonPackages", "vaderSentiment") \
    .config("spark.executorEnv.PYTHONPATH", ":".join(sys.path)) \
    .getOrCreate()

sc = spark.sparkContext

# sc.setLogLevel("INFO")

24/11/25 16:44:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
# File path for the Parquet file
file_path = "data/the-reddit-covid-comments-sample.parquet"

# Read the Parquet file into a DataFrame
df = spark.read.parquet(file_path)


In [None]:
# Renaming columns for clarity
df = df.withColumnRenamed("subreddit.id", "subreddit_id")
df = df.withColumnRenamed("subreddit.name", "subreddit_name")
df = df.withColumnRenamed("subreddit.nsfw", "subreddit_nsfw")
df = df.withColumnRenamed("sentiment", "sentiment_original")

# Calculate the mean sentiment
mean_sentiment = df.agg(F.avg("sentiment")).first()[0]

# Fill null values in the sentiment column with the mean
df_filled_mean = df.fillna({'sentiment': mean_sentiment})

# Show the DataFrame schema and first few rows
df_filled_mean.printSchema()  # Print the schema of the filled DataFrame
df_filled_mean.show(5)        # Display the first few rows of the filled DataFrame


root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name: string (nullable = true)
 |-- subreddit_nsfw: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- permalink: string (nullable = true)
 |-- body: string (nullable = true)
 |-- sentiment: double (nullable = false)
 |-- score: integer (nullable = true)

+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|   type|     id|subreddit_id|subreddit_name|subreddit_nsfw|created_utc|           permalink|                body|sentiment|score|
+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|comment|hi1vsag|       2riyy|          nova|         false| 1635206399|https://old.reddi...|When you schedule...|      0.0|    2|
|comment|hi1vs7i|       2qhov|     vancouver|         false| 16

In [19]:
df.count()

1777747

In [20]:
df.printSchema()

root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name: string (nullable = true)
 |-- subreddit_nsfw: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- permalink: string (nullable = true)
 |-- body: string (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- score: integer (nullable = true)



### Clean Body Text

In [21]:
def clean_comment_spark(df, column):
    return df.withColumn(
        f"{column}_clean",
        F.trim(
            F.regexp_replace(
                F.regexp_replace(
                    F.regexp_replace(
                        F.lower(F.col(column)),  # Convert to lowercase
                        r"http\S+|www\S+|https\S+", ""),  # Remove URLs
                    r"\@w+|\#", ""),  # Remove mentions and hashtags
                r"[^\w\s]", "")  # Remove special characters and punctuation
            )
        )


In [22]:
# Apply text cleaning
df = clean_comment_spark(df, "body")


In [23]:
# Show the cleaned comments
df.select('body', 'body_clean').show(10, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Sentiment score calculation - VADER

In [24]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()
analyzer_broadcast = sc.broadcast(analyzer)

In [25]:
# Define a function to calculate VADER sentiment
def calculate_sentiment(rows):
    analyzer = analyzer_broadcast.value
    for row in rows:
        text = row.body_clean
        if text:
            scores = analyzer.polarity_scores(text)
            yield row.body_clean, float(scores['compound'])
        else:
            yield row.body_clean, None

In [26]:
# Apply the sentiment calculation in a distributed manner
rdd = df.select("body_clean").rdd.mapPartitions(calculate_sentiment)

# Create a DataFrame from the sentiment RDD
sentiment_df = rdd.toDF(["body_clean", "sentiment"])

# Join the sentiment DataFrame back to the original DataFrame
df = df.join(sentiment_df, "body_clean", "left")

In [27]:
df.show(10)

ERROR:root:KeyboardInterrupt while sending command.               (10 + 6) / 16]
Traceback (most recent call last):
  File "/home/filigott/dat535-2024/project/venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/filigott/dat535-2024/project/venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                

In [None]:
# Select only the required columns
result_df = df.select("comment_id", "sentiment")

# Show the result for verification
result_df.show(10, truncate=False)

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `sentiment` is ambiguous, could be: [`sentiment`, `sentiment`].

### Write results

In [None]:
sentiment_calculation_path = "data/results/sentiment_calculations.parquet"

df.write.mode("overwrite").parquet(sentiment_calculation_path)


### Read results

In [None]:
# File path for the Parquet file
file_path = "data/results/sentiment_calculations.parquet"

# Read the Parquet file into a DataFrame
df_result = spark.read.parquet(file_path)

df.show(10)
df.count()