In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.stat import Summarizer
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Initialize Spark Session with Dynamic Allocation
# The settings are kept in one table so they can be read (and tuned) at a glance;
# they are applied to the builder in the order listed.
# NOTE(review): dynamic allocation usually also needs the external shuffle service
# or shuffle tracking to be enabled — confirm this is set in the cluster defaults.
spark_settings = [
    ("spark.executor.memory", "2g"),
    ("spark.executor.cores", "2"),
    ("spark.dynamicAllocation.enabled", "true"),
    ("spark.dynamicAllocation.minExecutors", "3"),
    ("spark.dynamicAllocation.maxExecutors", "9"),
    ("spark.dynamicAllocation.initialExecutors", "3"),
    ("spark.driver.memory", "4g"),
]

session_builder = SparkSession.builder.appName("Sentiment Analysis and Clustering")
for setting_key, setting_value in spark_settings:
    session_builder = session_builder.config(setting_key, setting_value)

# getOrCreate() returns an already-running session if one exists.
spark = session_builder.getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("INFO")

In [None]:
# Read the Parquet file
file_path = "hdfs://namenode:9000/data/cleaned_dataset.parquet"
parquet_reader = spark.read
df = parquet_reader.parquet(file_path)

# Quick sanity check: print the schema stored in the file and the first
# five rows with full (untruncated) cell contents.
df.printSchema()
df.show(n=5, truncate=False)

In [None]:
# Clean comments
def clean_comment_spark(df, column):
    """Return ``df`` with an extra ``<column>_clean`` column of normalised text.

    The value of ``column`` is transformed in this order:
      1. lower-cased;
      2. URL-like tokens (starting with ``http``, ``https`` or ``www``) are deleted;
      3. ``@mentions`` and bare ``#`` characters are deleted (the word after a
         ``#`` is kept);
      4. every character that is neither a word character nor whitespace is deleted;
      5. each run of whitespace is collapsed into a single space;
      6. leading and trailing spaces are trimmed.

    Step 5 matters because the deletions above leave gaps (and the text may
    contain tabs or newlines, which ``F.trim`` does not strip); without it a
    tokenizer that splits on single whitespace characters would see empty tokens.

    Args:
        df: Spark DataFrame containing ``column``.
        column: Name of the string column to clean.

    Returns:
        A new DataFrame with the additional column ``f"{column}_clean"``;
        the input column is left untouched.
    """
    text = F.lower(F.col(column))
    text = F.regexp_replace(text, r"http\S+|www\S+|https\S+", "")
    text = F.regexp_replace(text, r"@\w+|#", "")
    text = F.regexp_replace(text, r"[^\w\s]", "")
    # Collapse whitespace runs left behind by the deletions above.
    text = F.regexp_replace(text, r"\s+", " ")
    return df.withColumn(f"{column}_clean", F.trim(text))

# Keep only the columns used downstream. Because `df` is rebound here and the
# raw `body` column is dropped, re-running this cell requires re-reading the data.
df = clean_comment_spark(df, "body").select("comment_id", "body_clean", "created_utc", "subreddit_name")

In [None]:
# Broadcast Sentiment Analyzer
# One analyzer instance is shipped to the executors instead of being rebuilt per row.
analyzer_broadcast = sc.broadcast(SentimentIntensityAnalyzer())

# Sentiment calculation using RDDs
def calculate_sentiment(row):
    """Score one comment row with the broadcast VADER analyzer.

    Args:
        row: A row exposing ``comment_id``, ``body_clean``, ``created_utc`` and
            ``subreddit_name`` by key.

    Returns:
        Tuple ``(comment_id, sentiment, created_utc, subreddit_name, body_clean)``.
        ``sentiment`` is the analyzer's ``compound`` score, or ``None`` when the
        cleaned text is null or empty. ``body_clean`` is passed through (as the
        last element) because the later tokenisation step reads it.
    """
    analyzer = analyzer_broadcast.value
    text = row['body_clean']
    sentiment_score = analyzer.polarity_scores(text)['compound'] if text else None
    return (row['comment_id'], sentiment_score, row['created_utc'], row['subreddit_name'], text)

sentiment_rdd = df.rdd.map(calculate_sentiment)

# Field order must match the tuple returned by calculate_sentiment.
# body_clean is appended last so the original four columns keep their positions.
schema = StructType([
    StructField("comment_id", StringType(), True),
    StructField("sentiment", FloatType(), True),
    StructField("created_utc", StringType(), True),
    StructField("subreddit_name", StringType(), True),
    StructField("body_clean", StringType(), True)
])

sentiment_df = spark.createDataFrame(sentiment_rdd, schema)
sentiment_df.show(10, truncate=False)

In [None]:
# Convert timestamp to date
# `created_utc` is declared as a string column in the sentiment schema, so this
# relies on Spark casting it to a number for from_unixtime.
# NOTE(review): presumably epoch seconds — confirm against the source data.
date_string = F.from_unixtime(F.col("created_utc"), "yyyy-MM-dd")
df = sentiment_df.withColumn("date", date_string)

In [None]:
# Overall daily sentiment
# One row per date: the mean sentiment over every comment on that date,
# sorted chronologically.
overall_daily_mean = F.avg("sentiment").alias("avg_daily_sentiment_all")
df_daily_sentiment_all = (
    df.groupBy("date")
    .agg(overall_daily_mean)
    .orderBy("date")
)

In [None]:
# Daily sentiment per subreddit
# One row per (date, subreddit) pair with that subreddit's mean sentiment for
# the day, sorted by date and then subreddit name.
subreddit_daily_mean = F.avg("sentiment").alias("avg_daily_sentiment_subreddit")
df_daily_sentiment_subreddit = (
    df.groupBy("date", "subreddit_name")
    .agg(subreddit_daily_mean)
    .orderBy("date", "subreddit_name")
)

In [None]:
# Join subreddit and overall trends
# Attach the all-subreddit daily average to each (date, subreddit) row; the left
# join keeps every per-subreddit row.
daily_with_overall = df_daily_sentiment_subreddit.join(
    df_daily_sentiment_all, on="date", how="left"
)

# Positive values mean the subreddit was more positive than the overall average that day.
sentiment_gap = F.col("avg_daily_sentiment_subreddit") - F.col("avg_daily_sentiment_all")
df_trend_comparison = daily_with_overall.withColumn("sentiment_diff", sentiment_gap)

In [None]:
# Tokenization and stopword removal
# Requires `df` to carry a `body_clean` column.
tokenizer = Tokenizer(inputCol="body_clean", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# TF-IDF
# Term frequencies are hashed into 1000 buckets, then re-weighted by IDF.
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=1000)
idf = IDF(inputCol="raw_features", outputCol="features")

# Tokenizer raises on null input, so rows with a null comment are treated like
# an empty comment instead of aborting the whole job. No rows are dropped.
tfidf_input = df.withColumn("body_clean", F.coalesce(F.col("body_clean"), F.lit("")))

# `Pipeline` is provided by `pyspark.ml`.
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf])
model = pipeline.fit(tfidf_input)
tfidf_df = model.transform(tfidf_input)

In [None]:
# KMeans clustering
# Five clusters over the TF-IDF vectors; the fixed seed keeps the random
# initialisation repeatable. Column names are spelled out (they match the defaults).
# Note: this rebinds `model`, which previously held the fitted TF-IDF pipeline.
kmeans = KMeans(featuresCol="features", predictionCol="prediction", k=5, seed=123)
model = kmeans.fit(tfidf_df)
predictions = model.transform(tfidf_df)

In [None]:
# Sentiment analysis by cluster
# Per cluster: mean sentiment and the number of rows with a non-null subreddit
# name, ordered from most negative to most positive cluster.
# NOTE(review): "subreddit_count" counts rows, not distinct subreddits — confirm
# that is the intended metric.
cluster_aggregates = [
    F.mean("sentiment").alias("average_sentiment"),
    F.count("subreddit_name").alias("subreddit_count"),
]
per_cluster = predictions.groupBy("prediction").agg(*cluster_aggregates)
sentiment_analysis = per_cluster.orderBy("average_sentiment")

In [None]:
# Save results
# Persist the clustered rows; any existing output at this path is replaced.
output_path = "hdfs://namenode:9000/data/results/sentiment_analysis.parquet"
results_writer = predictions.write.mode("overwrite")
results_writer.parquet(output_path)