In [None]:
# Import required libraries
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

In [None]:
# Initialize Spark Session
# Executors: 2 GB / 2 cores each, scaled between 3 and 9 by dynamic allocation;
# the driver is given 4 GB.
# NOTE(review): dynamic allocation normally also needs the external shuffle
# service or shuffle tracking enabled - confirm this is set in the cluster's
# Spark defaults, since it is not configured here.
# NOTE(review): getOrCreate() returns an already-running session if one exists;
# presumably the resource settings below only apply when a new one is started.
spark = (
    SparkSession.builder
    .appName("Sentiment Calculation Pandas UDF")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "2")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "3")
    .config("spark.dynamicAllocation.maxExecutors", "9")
    .config("spark.dynamicAllocation.initialExecutors", "3")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Set log level for cleaner outputs: only warnings and errors are logged,
# so INFO-level scheduler chatter does not drown out the notebook results.
sc = spark.sparkContext
sc.setLogLevel("WARN")


In [None]:
# Location of the cleaned input dataset on HDFS
file_path = "hdfs://namenode:9000/data/cleaned_dataset.parquet"

# Load the Parquet data into a Spark DataFrame
df = spark.read.parquet(file_path)

# Inspect the column layout, then preview the first 5 rows without truncating values
df.printSchema()
df.show(n=5, truncate=False)

In [None]:
# Function to clean comments
def clean_comment_spark(df, column):
    """Return ``df`` with an extra ``<column>_clean`` column of normalized text.

    The text in ``column`` is lowercased, then three regex substitutions are
    applied in order (each match is replaced by the empty string), and the
    result is trimmed of leading/trailing whitespace.

    Args:
        df: Spark DataFrame that contains ``column``.
        column: Name of the text column to clean.

    Returns:
        A DataFrame with the added column named ``f"{column}_clean"``; the
        source column itself is left in place.
    """
    # Substitutions are applied in this order; the order matters because URLs
    # must be removed before their punctuation is stripped by the last pattern.
    patterns = (
        r"http\S+|www\S+|https\S+",  # URL-like tokens
        r"@\w+|#",                   # @mentions, and the '#' character only (tag text is kept)
        r"[^\w\s]",                  # anything that is neither a word character nor whitespace
    )

    text = F.lower(F.col(column))
    for pattern in patterns:
        text = F.regexp_replace(text, pattern, "")

    return df.withColumn(f"{column}_clean", F.trim(text))


In [None]:
# Add the cleaned "body" text, then keep only the comment id and the cleaned text
df = (
    clean_comment_spark(df, "body")
    .select("comment_id", "body_clean")
)
df.show(n=5, truncate=False)

In [None]:
# TODO: pandas UDFs need pandas and PyArrow available to Spark
# (e.g. `pip install "pyspark[sql]"`).
# NOTE(review): vaderSentiment presumably has to be importable on the executors
# as well, since the analyzer is created inside the UDF - confirm for this cluster.

# Define a Pandas UDF for sentiment analysis
# The UDF kind (Series -> Series) is taken from the type hints below, so the
# deprecated PandasUDFType.SCALAR argument is not passed.
@pandas_udf("float")
def calculate_sentiment_udf(body_clean: pd.Series) -> pd.Series:
    """Calculate the VADER compound sentiment score for each comment.

    Args:
        body_clean: Series of comment texts.

    Returns:
        A Series aligned with ``body_clean`` holding the ``'compound'`` value
        from ``polarity_scores`` for each text, or ``None`` where the text is
        falsy (``None`` or an empty string).
    """
    # A fresh analyzer is built on every invocation of this function.
    analyzer = SentimentIntensityAnalyzer()

    def compound_score(text):
        # Missing or empty comments get no score rather than a neutral one.
        if not text:
            return None
        return analyzer.polarity_scores(text)['compound']

    return body_clean.apply(compound_score)

In [None]:
# Score each cleaned comment and attach the result as a "sentiment" column
sentiment_col = calculate_sentiment_udf(F.col("body_clean"))
df = df.withColumn("sentiment", sentiment_col)

# Preview the first 10 scored rows without truncating the text
df.show(n=10, truncate=False)

In [None]:
# Persist the scored comments to HDFS as Parquet, replacing any previous output
output_path = "hdfs://namenode:9000/data/results/sentiment_calculations_dynamic_partitions.parquet"
(
    df.write
    .mode("overwrite")
    .parquet(output_path)
)
print(f"Results written to {output_path}")

In [None]:
# Reload what was just written to verify the output
result_df = spark.read.parquet(output_path)

# Preview the first 10 stored rows without truncating the text
result_df.show(n=10, truncate=False)

# Report how many rows ended up in the output
result_count = result_df.count()
print(f"Total records processed: {result_count}")