In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [12]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("Reddit Sentiment Analysis") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "8") \
    .getOrCreate()

sc = spark.sparkContext

In [13]:
# File path for the Parquet file
file_path = "data/the-reddit-covid-comments-sample.parquet"

# Read the Parquet file into a DataFrame
df = spark.read.parquet(file_path)


In [14]:
# Renaming columns for clarity
df = df.withColumnRenamed("subreddit.id", "subreddit_id")
df = df.withColumnRenamed("subreddit.name", "subreddit_name")
df = df.withColumnRenamed("subreddit.nsfw", "subreddit_nsfw")

# Calculate the mean sentiment
mean_sentiment = df.agg(F.avg("sentiment")).first()[0]

# Fill null values in the sentiment column with the mean
df = df.fillna({'sentiment': mean_sentiment})

# Show the DataFrame schema and first few rows
df.printSchema()  # Print the schema of the filled DataFrame
df.show(5)        # Display the first few rows of the filled DataFrame


root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name: string (nullable = true)
 |-- subreddit_nsfw: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- permalink: string (nullable = true)
 |-- body: string (nullable = true)
 |-- sentiment: double (nullable = false)
 |-- score: integer (nullable = true)

+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|   type|     id|subreddit_id|subreddit_name|subreddit_nsfw|created_utc|           permalink|                body|sentiment|score|
+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|comment|hi1vsag|       2riyy|          nova|         false| 1635206399|https://old.reddi...|When you schedule...|      0.0|    2|
|comment|hi1vs7i|       2qhov|     vancouver|         false| 16

In [15]:
df.count()

1777747

### Average sentiment Per Subreddit

In [16]:
# 1. Average Sentiment per Subreddit
avg_sentiment_per_subreddit = df.groupBy("subreddit_name").agg(
    F.avg("sentiment").alias("avg_sentiment")
)
avg_sentiment_per_subreddit.show(10)


+----------------+--------------------+
|  subreddit_name|       avg_sentiment|
+----------------+--------------------+
|          travel| 0.34720585424110667|
|            snhu| 0.22637999999999997|
|      traderjoes|  0.2185318181818182|
|debitismus_forum|-0.04006802554303...|
|       methadone| 0.14535849673202614|
|           anime| 0.24444842555210985|
|    couchsurfing| 0.42876000000000003|
|   gastricsleeve|  0.2631261174590532|
|          scjerk|-0.00946678554058...|
|   crohnsdisease| 0.10366811023622047|
+----------------+--------------------+
only showing top 10 rows



In [17]:
# 1. Average Sentiment per Subreddit
df = df.groupBy("subreddit_name").agg(
    F.avg("sentiment").alias("avg_sentiment"),
    F.count("id").alias("comment_count"),  # Count of posts per subreddit
    F.expr("percentile_approx(sentiment, 0.5)").alias("median_sentiment")  # Median sentiment
)


In [18]:
# Show the top 10 subreddits with the average sentiment and post counts
df.orderBy("avg_sentiment", ascending=False).show(10)




+--------------------+-------------+-------------+----------------+
|      subreddit_name|avg_sentiment|comment_count|median_sentiment|
+--------------------+-------------+-------------+----------------+
|u_rowan-the-girlf...|       0.9999|            1|          0.9999|
|       thegreatgasly|       0.9997|            1|          0.9997|
|              umiami|       0.9997|            1|          0.9997|
|               puffy|       0.9996|            1|          0.9996|
|     u_mylastchapter|       0.9995|            1|          0.9995|
|               bones|       0.9995|            1|          0.9995|
|          irishmusic|       0.9995|            1|          0.9995|
|             vieques|       0.9995|            1|          0.9995|
|           dcuonline|       0.9994|            1|          0.9994|
|       hotspringspas|       0.9994|            1|          0.9994|
+--------------------+-------------+-------------+----------------+
only showing top 10 rows



                                                                                

In [19]:
# Show the top 10 subreddits with the average sentiment and post counts
df.orderBy("avg_sentiment", ascending=True).show(10)


+-------------------+-------------+-------------+----------------+
|     subreddit_name|avg_sentiment|comment_count|median_sentiment|
+-------------------+-------------+-------------+----------------+
|         anticommie|      -0.9998|            2|         -0.9998|
|   falseadvertising|      -0.9992|            1|         -0.9992|
|        whitepride_|      -0.9992|            1|         -0.9992|
|   asksocialscience|      -0.9991|            1|         -0.9991|
|   covidateyourface|       -0.999|            1|          -0.999|
|          artadvice|     -0.99895|            2|         -0.9991|
|          meatballs|      -0.9989|            1|         -0.9989|
|          atfopenup|      -0.9985|            1|         -0.9985|
|shittymobilegameads|      -0.9985|            1|         -0.9985|
|  iforgorthesubname|      -0.9985|            1|         -0.9985|
+-------------------+-------------+-------------+----------------+
only showing top 10 rows



In [20]:
# Optional: Filter out subreddits with fewer than a certain number of posts for more meaningful analysis
min_comment_count = 50  # Change this threshold as needed
filtered_avg_sentiment = df.filter(F.col("comment_count") >= min_comment_count)



In [21]:
# Show the filtered results
filtered_avg_sentiment.orderBy("avg_sentiment", ascending=False).show(10)



+-----------------+------------------+-------------+----------------+
|   subreddit_name|     avg_sentiment|comment_count|median_sentiment|
+-----------------+------------------+-------------+----------------+
|         r4rasian|0.9680937308868541|         1308|          0.9712|
|         abortion|0.9418763432237328|         1247|          0.9688|
|         startrek| 0.938179064056261|         2515|          0.9562|
|baltimoreanddcr4r| 0.928099999999983|         4094|          0.9281|
|          bourbon|0.9043954110898693|         1046|          0.9704|
|           borrow|0.8941376623376622|           77|          0.9565|
|  healthinsurance|0.8833896805896826|          814|           0.953|
|           bizsmg|0.8433243512974052|          501|          0.9969|
|     irishtourism|0.7979388392857141|          224|          0.9454|
|       nycmeetups|0.7672459016393461|          610|          0.7685|
+-----------------+------------------+-------------+----------------+
only showing top 10 

                                                                                

In [22]:
filtered_avg_sentiment.orderBy("avg_sentiment", ascending=True).show(10)


+--------------------+--------------------+-------------+----------------+
|      subreddit_name|       avg_sentiment|comment_count|median_sentiment|
+--------------------+--------------------+-------------+----------------+
|        covid19_ohio| -0.4679525575253793|          179|         -0.8624|
|         letterkenny| -0.3409493463543523|          453|         -0.6191|
|      redditsecurity|  -0.338291935483871|           62|         -0.5994|
|        gradeaundera| -0.3330597560975609|          164|         -0.4939|
|           flatearth| -0.3312639061310989|           65|         -0.6249|
| subredditsummarybot|-0.32729189615137366|         1196|         -0.6124|
|hermancainawardsucks|-0.30271234767714283|          195|         -0.6059|
|   politicalopinions| -0.2965837837837837|           74|         -0.5778|
|      hankaaronaward| -0.2892778910002566|          334|            -0.5|
|    coronavirusidaho|           -0.288692|           50|         -0.4939|
+--------------------+---

### Write results

In [23]:
per_subreddit_stats_path = "data/results/per_subreddit_stats.parquet"

df.write.mode("overwrite").parquet(per_subreddit_stats_path)


                                                                                

### Read results

In [24]:
# File path for the Parquet file
file_path = "data/results/sentiment_calculations.parquet"

# Read the Parquet file into a DataFrame
df_result = spark.read.parquet(file_path)

df.show(10)
df.count()

+--------------+--------------------+-------------+--------------------+
|subreddit_name|       avg_sentiment|comment_count|    median_sentiment|
+--------------+--------------------+-------------+--------------------+
| 12step_escape|              0.2763|            1|              0.2763|
|  1500isplenty| 0.46612142857142863|           14|              0.6222|
|         18_19|              0.8338|            1|              0.8338|
|        2000ad|             -0.4215|            1|             -0.4215|
|       2asia4u|  0.1213054160099026|           44|              0.0516|
|   2balkan4you| -0.1201626082922619|           36|             -0.4717|
|          350z|              0.4064|            9|                0.34|
|      3atatime|0.010246101478571794|            1|0.010246101478571794|
|         4chan|  -0.164401238990117|         1087|             -0.2023|
|          4ktv| 0.22467826086956522|           23|              0.4215|
+--------------+--------------------+-------------+

23265