In [60]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime, avg, date_format, count

In [61]:
# Obtain the Spark session used by every cell below (an already-running
# session is reused by getOrCreate()).
spark = (
    SparkSession.builder
    .appName("Reddit Sentiment Analysis")
    .getOrCreate()
)

In [None]:
# Input locations. Only the Parquet copy is read in this cell; the CSV path is
# kept in case other cells still refer to it.
file_path = "data/cleaned/cleaned-reddit-covid-comments.csv"
parquet_file_path = "data/cleaned/cleaned-reddit-covid-comments.parquet"

# Parquet files carry their own schema, so the CSV-only reader options
# ("header", "inferSchema") are dropped: they had no effect on this read.
df = spark.read.parquet(parquet_file_path)

# Disabled transformations, left for reference:
#df = df.withColumn("date", date_format(col("created_utc"), "yyyy-MM-dd"))
#df = df.withColumn("sentiment", col("sentiment").cast("float"))

In [None]:
# Preview the first 20 rows of the loaded comments.
df.show(n=20)

In [42]:
# Per-subreddit summary: mean sentiment and comment count, restricted to
# subreddits with at least 100 comments and listed from the lowest mean upwards.
result_df = (
    df.groupBy("subreddit_name")
    .agg(
        avg("sentiment").alias("average_sentiment"),
        # count("id") counts the rows of each group whose id is non-null.
        count("id").alias("comment_count"),
    )
    .where(col("comment_count") >= 100)
    .orderBy(col("average_sentiment").asc())
)

In [None]:
# Display up to 50 rows of the per-subreddit summary.
result_df.show(n=50)

In [None]:
# Keep only the rows whose subreddit_name is "conspiracy", then preview 20 of them.
conspiracy_df = df.where(col("subreddit_name") == "conspiracy")
conspiracy_df.show(n=20)

In [48]:
import re
from pyspark.sql import functions as F

In [49]:
def clean_comment(text):
    """Normalize a raw comment body for downstream text processing.

    Steps, in order: lowercase, drop URLs, drop ``@mentions`` and ``#`` marks,
    strip punctuation, then collapse whitespace runs and trim both ends.
    Letters, digits and underscores are kept.

    Args:
        text: Comment text, or ``None`` (how a null column value reaches a
            Python UDF).

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``.
    """
    # Pass nulls through instead of failing on ``None.lower()``.
    if text is None:
        return None
    # Convert to lowercase
    text = text.lower()
    # Remove URLs ("https..." is already covered by the "http" alternative).
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove whole @mentions and the '#' of hashtags (the tag word is kept).
    # The earlier pattern r'\@w+' only matched '@' followed by literal 'w's,
    # which turned "@world" into "orld".
    text = re.sub(r'@\w+|#', '', text)
    # Strip punctuation first, then tidy whitespace, so that removing a
    # free-standing symbol ("a - b") cannot leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Wrap clean_comment as a Spark UDF that yields a string column. The return
# type is given as a DDL type string so this line does not depend on StringType
# happening to be reachable through the pyspark.sql.functions module.
clean_text_udf = F.udf(clean_comment, "string")

In [None]:
# Add a 'cleaned_comment' column computed from 'body' by the cleaning UDF.
cleaned_comments_df = df.withColumn("cleaned_comment", clean_text_udf("body"))

# Display raw and cleaned text side by side without truncating long values.
cleaned_comments_df.select("body", "cleaned_comment").show(truncate=False)


In [54]:
from pyspark.sql import functions as F
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql.types import FloatType

In [56]:
# Single VADER analyzer instance, referenced by get_vader_sentiment below.
analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    """Return VADER's compound polarity score for ``text``.

    Args:
        text: Text to score, or ``None`` (how a null column value reaches a
            Python UDF).

    Returns:
        The ``'compound'`` entry of ``analyzer.polarity_scores(text)`` as a
        float, or ``None`` when ``text`` is ``None`` so that null inputs stay
        null instead of raising inside the analyzer.
    """
    if text is None:
        return None
    score = analyzer.polarity_scores(text)
    return float(score['compound'])

# Spark UDF wrapper; the score is stored as a FloatType column.
sentiment_udf = F.udf(get_vader_sentiment, FloatType())


In [None]:
# Score each cleaned comment with the VADER UDF; the result is stored in the
# 'sentimentTest' column.
comments_with_sentiment = cleaned_comments_df.withColumn(
    "sentimentTest",
    sentiment_udf("cleaned_comment"),
)

# Preview the first 20 scored rows.
comments_with_sentiment.show(n=20)