In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql import types as T
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np

In [None]:
# Build (or reuse) the Spark session for this notebook.
# Executor/driver memory and executor cores are passed as configuration strings.
spark = (
    SparkSession.builder
    .appName("Reddit Sentiment Analysis")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "8")
    .getOrCreate()
)

# Keep a handle on the session's SparkContext
sc = spark.sparkContext

In [12]:
# Relative location of the Parquet file with the Reddit comments sample
file_path = "data/the-reddit-covid-comments-sample.parquet"

# Load that file as a Spark DataFrame
parquet_reader = spark.read
df = parquet_reader.parquet(file_path)


In [None]:
# Replace the dotted subreddit column names with underscore-separated ones.
for dotted_name, flat_name in (
    ("subreddit.id", "subreddit_id"),
    ("subreddit.name", "subreddit_name"),
    ("subreddit.nsfw", "subreddit_nsfw"),
):
    df = df.withColumnRenamed(dotted_name, flat_name)

# Inspect the resulting schema, then preview the first five rows
df.printSchema()
df.show(5)

### Sentiment score calculation - VADER

In [5]:
# Initialize the VADER sentiment analyzer
# A single module-level instance; calculate_vader_sentiment() reads it as a global.
analyzer = SentimentIntensityAnalyzer()

In [6]:
# Define a function to apply VADER sentiment analysis
def calculate_vader_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: The comment text to score, or ``None`` for a missing body.

    Returns:
        The ``'compound'`` entry of ``analyzer.polarity_scores(text)`` as a
        float, or ``None`` when ``text`` is ``None``.
    """
    # A null column value reaches a Python UDF as None. Return None (a null
    # in the result column) instead of letting polarity_scores fail on it.
    if text is None:
        return None
    scores = analyzer.polarity_scores(text)
    return float(scores['compound'])

In [14]:
# Register UDF to apply VADER sentiment.
# F.udf falls back to a string return type when none is given, which would
# store the float scores as text. Declare DoubleType so the new column is
# numeric and can be averaged/correlated without implicit string casts.
vader_sentiment_udf = F.udf(calculate_vader_sentiment, T.DoubleType())

# Apply VADER sentiment to each row and create a new column
df = df.withColumn("vader_sentiment", vader_sentiment_udf(F.col("body")))

### Average Sentiment per Subreddit

In [None]:
# 1. Average Sentiment per Subreddit
# One output row per subreddit_name, holding the mean of its vader_sentiment values.
mean_vader_score = F.avg("vader_sentiment").alias("avg_sentiment")
avg_sentiment_per_subreddit = df.groupBy("subreddit_name").agg(mean_vader_score)

# Preview the first five aggregated rows
avg_sentiment_per_subreddit.show(5)


In [20]:
# 2. Similar Subreddits Based on Sentiment
# Feature Engineering: Use average sentiment per subreddit as feature for clustering
# NOTE(review): this only binds a second name to the aggregated frame; the
# assembler cell that follows reassigns feature_df, so this line looks redundant.
feature_df = avg_sentiment_per_subreddit

In [21]:
# Pack the single input column "avg_sentiment" into the vector column
# "features" that the clustering step reads.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["avg_sentiment"],
)
feature_df = assembler.transform(dataset=avg_sentiment_per_subreddit)

In [None]:
# Cluster the subreddits into three groups (fixed seed of 1) using the
# "features" vector column.
kmeans = KMeans(
    featuresCol="features",
    predictionCol="cluster",
    k=3,
    seed=1,
)
model = kmeans.fit(feature_df)

# Add each row's cluster assignment as the "cluster" column
clustered_df = model.transform(feature_df)


In [29]:
# Define a UDF to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors as a Python float.

    Args:
        vec1: A sequence of numbers, or an object exposing ``toArray()``
            (as pyspark.ml dense and sparse vectors do), or ``None``.
        vec2: Same kinds of value as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` as a float, ``0.0`` when
        either vector has zero norm, or ``None`` when either input is ``None``.
    """
    def _to_dense(vec):
        # Prefer the vector's own dense conversion when it has one, so sparse
        # vectors are expanded explicitly rather than via NumPy's generic
        # sequence coercion.
        values = vec.toArray() if hasattr(vec, "toArray") else vec
        return np.array(values, dtype=float)

    # A null input yields a null result instead of an exception inside the UDF.
    if vec1 is None or vec2 is None:
        return None

    dense_1, dense_2 = _to_dense(vec1), _to_dense(vec2)
    norm_1 = np.linalg.norm(dense_1)
    norm_2 = np.linalg.norm(dense_2)
    # Avoid division by zero: a zero-length vector has no direction
    if norm_1 == 0 or norm_2 == 0:
        return 0.0
    return float(np.dot(dense_1, dense_2) / (norm_1 * norm_2))

# Wrap cosine_similarity as a Spark UDF whose result column is a double
cosine_similarity_udf = F.udf(f=cosine_similarity, returnType=T.DoubleType())

In [None]:
# Pairwise cosine similarity of subreddits in each cluster.
# clustered_df is joined with itself under the aliases "df1"/"df2". Rows pair
# up only within the same cluster, and the strict "<" on subreddit_name drops
# self-pairs and keeps one ordering of every pair.
# NOTE(review): "features" is assembled from the single column avg_sentiment,
# and the cosine of two one-element vectors can only be -1, 0 or 1 -- confirm
# this is the similarity measure that was intended.
left_side = clustered_df.alias("df1")
right_side = clustered_df.alias("df2")

same_cluster = F.col("df1.cluster") == F.col("df2.cluster")
name_ordered = F.col("df1.subreddit_name") < F.col("df2.subreddit_name")

similarity_df = left_side.join(right_side, same_cluster & name_ordered).select(
    F.col("df1.subreddit_name").alias("subreddit_1"),
    F.col("df2.subreddit_name").alias("subreddit_2"),
    cosine_similarity_udf("df1.features", "df2.features").alias("similarity_score"),
)


In [None]:
# Preview the first five subreddit pairs, then list the result's column types
similarity_df.show(n=5)

similarity_df.printSchema()

In [None]:
# 3. Correlation Between VADER Sentiment and Reddit Score
# Keep only the two columns of interest, each cast to double so both are numeric.
numeric_columns = [
    F.col(column_name).cast("double").alias(column_name)
    for column_name in ("vader_sentiment", "score")
]
sentiment_score_relation = df.select(*numeric_columns)

# Correlation coefficient between the two columns
correlation = sentiment_score_relation.corr("vader_sentiment", "score")
print(f"Correlation between VADER sentiment and Reddit score: {correlation}")

In [None]:
# Preview the first five rows of df, which now carries the vader_sentiment column
df.show(n=5)

In [None]:
# Convert Spark DataFrames to Pandas for visualization
# NOTE(review): toPandas() brings each full result onto the driver.
# sentiment_score_relation has one row per row of df, and similarity_df one row
# per same-cluster subreddit pair -- confirm these fit in driver memory, or
# sample/limit before converting.
avg_sentiment_df = avg_sentiment_per_subreddit.toPandas()
similarity_df_pd = similarity_df.toPandas()
sentiment_score_relation_pd = sentiment_score_relation.toPandas()


In [None]:
# Apply seaborn's "whitegrid" style to the figures that follow
sns.set(style="whitegrid")

# Visualization 1: Average Sentiment per Subreddit
# Bars are ordered from the highest average sentiment to the lowest.
ranked_sentiment = avg_sentiment_df.sort_values("avg_sentiment", ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=ranked_sentiment, x="avg_sentiment", y="subreddit_name")
plt.axvline(x=0, color='red', linestyle='--')  # reference line at sentiment 0
plt.title("Average Sentiment per Subreddit")
plt.xlabel("Average Sentiment")
plt.ylabel("Subreddit Name")
plt.show()

In [None]:
# Visualization 2: Similar Subreddits Based on Sentiment
# Pivot the long (subreddit_1, subreddit_2, similarity_score) table into a
# matrix for the heatmap. index/columns/values are passed by keyword because
# pandas 2.0+ no longer accepts them positionally. Pairs that do not appear in
# similarity_df_pd are filled with 0.
similarity_matrix = similarity_df_pd.pivot(
    index="subreddit_1",
    columns="subreddit_2",
    values="similarity_score",
).fillna(0)

plt.figure(figsize=(10, 8))
sns.heatmap(similarity_matrix, cmap="coolwarm", annot=True, fmt=".2f", cbar_kws={'label': 'Similarity Score'})
plt.title("Cosine Similarity Between Subreddits Based on Sentiment")
plt.xlabel("Subreddit 2")
plt.ylabel("Subreddit 1")
plt.show()

In [None]:
# Visualization 3: Correlation Between Sentiment and Reddit Score
# The scatter, the regression line and the reference lines are all drawn in
# this one cell. Notebook backends typically render and close a figure at the
# end of the cell that created it, so a regplot issued from a later cell would
# end up on a new, separate figure instead of over the scatter.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=sentiment_score_relation_pd, x="vader_sentiment", y="score")

# Calculate and plot the correlation line on the same axes (scatter=False:
# the points are already drawn above)
sns.regplot(data=sentiment_score_relation_pd, x="vader_sentiment", y="score", scatter=False, color='blue')

plt.axhline(0, color='red', linestyle='--')  # Line at score 0 for reference
plt.axvline(0, color='red', linestyle='--')  # Line at sentiment 0 for reference

# Title and axis labels are set after the seaborn calls so they are not
# replaced by the column names seaborn applies as default labels.
plt.title("Correlation Between VADER Sentiment and Reddit Score")
plt.xlabel("VADER Sentiment Score")
plt.ylabel("Reddit Score")

plt.show()