In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re

In [2]:
# Start (or reuse) the Spark session used throughout this notebook.
# Executor and driver memory are both requested at 4g, with 8 executor cores.
spark = (
    SparkSession.builder
    .appName("Reddit Sentiment Analysis")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "8")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext
sc = spark.sparkContext

your 131072x1 screen size is bogus. expect trouble
24/10/31 18:59:06 WARN Utils: Your hostname, FiligottLaptop resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/10/31 18:59:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/31 18:59:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Location of the input Parquet file (relative to the notebook's working directory)
file_path = "data/the-reddit-covid-comments-sample.parquet"

# Load the Parquet file as a Spark DataFrame
df = spark.read.format("parquet").load(file_path)


                                                                                

In [4]:
# Swap the dotted subreddit column names for underscore versions
df = (
    df.withColumnRenamed("subreddit.id", "subreddit_id")
    .withColumnRenamed("subreddit.name", "subreddit_name")
    .withColumnRenamed("subreddit.nsfw", "subreddit_nsfw")
)

# Average of the 'sentiment' column over the whole DataFrame
sentiment_avg_row = df.agg(F.avg("sentiment")).first()
mean_sentiment = sentiment_avg_row[0]

# Separate DataFrame in which null sentiment values are replaced by that average
# (df itself is not modified by this step)
df_filled_mean = df.na.fill({'sentiment': mean_sentiment})

# Inspect the filled DataFrame: column types first, then the first 5 rows
df_filled_mean.printSchema()
df_filled_mean.show(5)


root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name: string (nullable = true)
 |-- subreddit_nsfw: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- permalink: string (nullable = true)
 |-- body: string (nullable = true)
 |-- sentiment: double (nullable = false)
 |-- score: integer (nullable = true)

+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|   type|     id|subreddit_id|subreddit_name|subreddit_nsfw|created_utc|           permalink|                body|sentiment|score|
+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|comment|hi1vsag|       2riyy|          nova|         false| 1635206399|https://old.reddi...|When you schedule...|      0.0|    2|
|comment|hi1vs7i|       2qhov|     vancouver|         false| 16

In [5]:
# Total number of rows in df (shown as the cell's output)
df.count()

1777747

### Date and daily sentiment

In [6]:
# Derive a 'date' string column (yyyy-MM-dd) from the 'created_utc' epoch seconds.
#
# F.from_unixtime renders the timestamp in the Spark session time zone, so on a
# machine that is not running in UTC, comments posted close to midnight UTC get
# bucketed into the neighbouring calendar day. The UTC day is instead computed
# arithmetically: whole days since 1970-01-01 (86400 seconds per day) are added
# to the epoch date, which does not depend on spark.sql.session.timeZone.
# Casting the resulting DATE to string yields the same yyyy-MM-dd text format,
# and a null 'created_utc' still produces a null 'date'.
df = df.withColumn(
    "date",
    F.expr(
        "date_add(DATE'1970-01-01', CAST(floor(created_utc / 86400) AS INT))"
    ).cast("string"),
)

In [7]:
# One row per day: mean sentiment over every comment on that date, sorted by date
df_daily_sentiment_all = (
    df.groupBy("date")
    .agg(F.avg("sentiment").alias("avg_daily_sentiment_all"))
    .orderBy("date")
)


In [8]:
# One row per (day, subreddit): mean sentiment of that subreddit's comments on
# that date, sorted by date and then subreddit name
df_daily_sentiment_subreddit = (
    df.groupBy("date", "subreddit_name")
    .agg(F.avg("sentiment").alias("avg_daily_sentiment_subreddit"))
    .orderBy("date", "subreddit_name")
)

In [9]:
# Attach the overall daily average to every per-subreddit row (left join on date)
subreddit_with_overall = df_daily_sentiment_subreddit.join(
    df_daily_sentiment_all, on="date", how="left"
)

# sentiment_diff > 0 means the subreddit was more positive than the overall
# average on that day; < 0 means more negative
df_trend_comparison = subreddit_with_overall.withColumn(
    "sentiment_diff",
    F.col("avg_daily_sentiment_subreddit") - F.col("avg_daily_sentiment_all"),
)

### Write results

In [10]:
# Output locations for the three result datasets (overall trend, per-subreddit
# trend, and the subreddit-vs-overall comparison), all stored as Parquet
file_path_overall, file_path_subreddit, file_path_comparison = (
    "data/results/daily_sentiment_overall.parquet",
    "data/results/daily_sentiment_subreddit.parquet",
    "data/results/sentiment_trend_comparison.parquet",
)


In [11]:
# Persist each trend DataFrame to its Parquet location, replacing any existing
# output at that path
for result_frame, result_path in (
    (df_daily_sentiment_all, file_path_overall),
    (df_daily_sentiment_subreddit, file_path_subreddit),
    (df_trend_comparison, file_path_comparison),
):
    result_frame.write.mode("overwrite").parquet(result_path)


                                                                                

### Read results

In [12]:
# Reload the three saved result sets from Parquet into fresh DataFrames
df_overall, df_subreddit, df_comparison = [
    spark.read.parquet(saved_path)
    for saved_path in (file_path_overall, file_path_subreddit, file_path_comparison)
]

In [13]:
# Preview the reloaded overall daily trend (first 10 rows), then report its row count
print("Overall Daily Sentiment Trend (sample):")
df_overall.show(n=10)
overall_row_count = df_overall.count()
print("Total Count (Overall Daily Sentiment Trend):", overall_row_count)


Overall Daily Sentiment Trend (sample):
+----------+-----------------------+
|      date|avg_daily_sentiment_all|
+----------+-----------------------+
|2021-09-06|   0.021666966678317287|
|2021-09-07|   0.018972046900336423|
|2021-09-08|   0.006638849958779908|
|2021-09-09|    0.02029829139981659|
|2021-09-10|   -0.00923499148336...|
|2021-09-11|   -0.01285521969230...|
|2021-09-12|   -0.00768669654569...|
|2021-09-13|   0.008088828050187884|
|2021-09-14|   -0.00375876786659...|
|2021-09-15|   -0.01135965006415501|
+----------+-----------------------+
only showing top 10 rows

Total Count (Overall Daily Sentiment Trend): 51


In [14]:
# Preview the reloaded per-subreddit daily trend (first 10 rows), then report its row count
print("\nPer-Subreddit Daily Sentiment Trend (sample):")
df_subreddit.show(n=10)
subreddit_row_count = df_subreddit.count()
print("Total Count (Per-Subreddit Daily Sentiment Trend):", subreddit_row_count)



Per-Subreddit Daily Sentiment Trend (sample):
+----------+-------------------+-----------------------------+
|      date|     subreddit_name|avg_daily_sentiment_subreddit|
+----------+-------------------+-----------------------------+
|2021-10-14|           business|          -0.7052499999999999|
|2021-10-14|       buyitforlife|                       0.1879|
|2021-10-14|          byebyejob|         -0.09062095588235293|
|2021-10-14|              c_s_t|                      -0.7998|
|2021-10-14|             caguns|                       0.1996|
|2021-10-14|           calculus|                       0.5574|
|2021-10-14|            calgary|          0.08762702702702703|
|2021-10-14|california_politics|                     -0.33293|
|2021-10-14|       callherdaddy|                       0.7269|
|2021-10-14|   callofdutymobile|                      0.48075|
+----------+-------------------+-----------------------------+
only showing top 10 rows

Total Count (Per-Subreddit Daily Sentiment Tr

In [15]:
# Preview the reloaded subreddit-vs-overall comparison (first 10 rows), then report its row count
print("\nSubreddit Sentiment Compared to Overall Trend (sample):")
df_comparison.show(n=10)
comparison_row_count = df_comparison.count()
print("Total Count (Subreddit Sentiment Compared to Overall Trend):", comparison_row_count)


Subreddit Sentiment Compared to Overall Trend (sample):
+----------+--------------+-----------------------------+-----------------------+--------------------+
|      date|subreddit_name|avg_daily_sentiment_subreddit|avg_daily_sentiment_all|      sentiment_diff|
+----------+--------------+-----------------------------+-----------------------+--------------------+
|2021-10-25|       glasgow|         -0.12721904761904765|    0.03645235609738238|-0.16367140371643002|
|2021-10-25|    seattler4r|           0.2911000000000001|    0.03645235609738238|  0.2546476439026177|
|2021-10-25|       vechain|                        0.667|    0.03645235609738238|  0.6305476439026176|
|2021-10-25|  kingcobrajfs|          0.07853809523809523|    0.03645235609738238| 0.04208573914071285|
|2021-10-25|showerthoughts|          0.08346666666666665|    0.03645235609738238| 0.04701431056928427|
|2021-10-25|          rome|                      -0.5719|    0.03645235609738238| -0.6083523560973824|
|2021-10-25|    