In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler, PCA
from pyspark.ml.functions import vector_to_array
from pyspark.ml.clustering import KMeans
from pyspark.ml.stat import Summarizer
from pyspark.ml.linalg import DenseVector
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import numpy as np

In [2]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("Reddit Sentiment Analysis") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "8") \
    .getOrCreate()

sc = spark.sparkContext

your 131072x1 screen size is bogus. expect trouble
24/10/31 21:19:44 WARN Utils: Your hostname, FiligottLaptop resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/10/31 21:19:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/31 21:19:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# File path for the Parquet file
file_path = "data/the-reddit-covid-comments-sample.parquet"

# Read the Parquet file into a DataFrame
df = spark.read.parquet(file_path)


                                                                                

In [4]:
# Renaming columns for clarity
df = df.withColumnRenamed("subreddit.id", "subreddit_id")
df = df.withColumnRenamed("subreddit.name", "subreddit_name")
df = df.withColumnRenamed("subreddit.nsfw", "subreddit_nsfw")

# Calculate the mean sentiment
mean_sentiment = df.agg(F.avg("sentiment")).first()[0]

# Fill null values in the sentiment column with the mean
df_filled_mean = df.fillna({'sentiment': mean_sentiment})

# Show the DataFrame schema and first few rows
df_filled_mean.printSchema()  # Print the schema of the filled DataFrame
df_filled_mean.show(5)        # Display the first few rows of the filled DataFrame


                                                                                

root
 |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name: string (nullable = true)
 |-- subreddit_nsfw: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- permalink: string (nullable = true)
 |-- body: string (nullable = true)
 |-- sentiment: double (nullable = false)
 |-- score: integer (nullable = true)

+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|   type|     id|subreddit_id|subreddit_name|subreddit_nsfw|created_utc|           permalink|                body|sentiment|score|
+-------+-------+------------+--------------+--------------+-----------+--------------------+--------------------+---------+-----+
|comment|hi1vsag|       2riyy|          nova|         false| 1635206399|https://old.reddi...|When you schedule...|      0.0|    2|
|comment|hi1vs7i|       2qhov|     vancouver|         false| 16

In [5]:
df.count()

1777747

### Vectorize the body text

In [6]:
# Step 1: Tokenize and remove stopwords
tokenizer = Tokenizer(inputCol="body", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

In [7]:
# Step 2: Compute TF-IDF
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=1000)
idf = IDF(inputCol="raw_features", outputCol="features")


In [8]:
# Pipeline for processing
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf])
model = pipeline.fit(df)
tfidf_df = model.transform(df)

                                                                                

In [27]:
# Step 3: Average TF-IDF Vector per Subreddit
# Summarizer provides a way to calculate mean for vector columns
subreddit_vectors = tfidf_df.groupBy("subreddit_name").agg(
    Summarizer.mean(F.col("features")).alias("tfidf_vector")
)

In [10]:
# Cosine similarity function
def cosine_similarity(vec1, vec2):
    dot_product = float(vec1.dot(vec2))
    norm1 = float(np.sqrt(vec1.dot(vec1)))
    norm2 = float(np.sqrt(vec2.dot(vec2)))
    return dot_product / (norm1 * norm2) if norm1 != 0 and norm2 != 0 else 0.0

In [11]:
# Register UDF
cosine_similarity_udf = F.udf(cosine_similarity, T.DoubleType())

In [12]:
# # Step 1: Self-join with distinct aliases for each subreddit name
# subreddit_pairs = subreddit_vectors.alias("a") \
#     .crossJoin(subreddit_vectors.alias("b")) \
#     .filter(F.col("a.subreddit_name") < F.col("b.subreddit_name")) \
#     .withColumn("cosine_similarity", cosine_similarity_udf(F.col("a.tfidf_vector"), F.col("b.tfidf_vector"))) \
#     .select(
#         F.col("a.subreddit_name").alias("subreddit_name_a"),
#         F.col("b.subreddit_name").alias("subreddit_name_b"),
#         F.col("cosine_similarity")
#     )

In [13]:
# # Filter pairs with high similarity for easier analysis
# similar_subreddits = subreddit_pairs.filter(F.col("cosine_similarity") > 0.8)

# # Show similar subreddit pairs, ordering by cosine similarity
# similar_subreddits \
#     .select("subreddit_name_a", "subreddit_name_b", "cosine_similarity") \
#     .orderBy(F.col("cosine_similarity").desc()) \
#     .show(10)

In [28]:
# Rename 'tfidf_vector' to 'features'
subreddit_vectors = subreddit_vectors.withColumnRenamed("tfidf_vector", "features")

In [None]:
# Assuming 'subreddit_vectors' contains the TF-IDF vectors for each subreddit
kmeans = KMeans(k=5, seed=123)  # You can change the number of clusters (k) as needed

# Fit the model
model = kmeans.fit(subreddit_vectors)


                                                                                

In [33]:
# Make predictions
predictions = model.transform(subreddit_vectors)

predictions = predictions.join(df_filled_mean.select("subreddit_name", "sentiment"), on="subreddit_name")

In [34]:
# Show the predictions (clusters assigned to each subreddit)
predictions.select("subreddit_name", "prediction").show()



+--------------+----------+
|subreddit_name|prediction|
+--------------+----------+
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|  1500isplenty|         0|
|         18_19|         0|
|        2000ad|         0|
|   2balkan4you|         0|
|   2balkan4you|         0|
|   2balkan4you|         0|
|   2balkan4you|         0|
+--------------+----------+
only showing top 20 rows



                                                                                

In [29]:
# Step 5: Analyze Sentiment by Cluster
sentiment_analysis = predictions.groupBy("prediction").agg(
    F.mean("sentiment").alias("average_sentiment"),
    F.count("subreddit_name").alias("subreddit_count")
).orderBy("average_sentiment")

sentiment_analysis.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `sentiment` cannot be resolved. Did you mean one of the following? [`features`, `prediction`, `subreddit_name`].;
'Aggregate [prediction#795], [prediction#795, avg('sentiment) AS average_sentiment#1734, count(subreddit_name#32) AS subreddit_count#1736L]
+- Project [subreddit_name#32, features#300, UDF(features#300) AS prediction#795]
   +- Project [subreddit_name#32, tfidf_vector#297 AS features#300]
      +- Aggregate [subreddit_name#32], [subreddit_name#32, aggregate_metrics(Mean, ComputeMean, ComputeWeightSum, features#263, 1.0, 0, 0).mean AS tfidf_vector#297]
         +- Project [type#0, id#1, subreddit_id#20, subreddit_name#32, subreddit_nsfw#43, created_utc#5, permalink#6, body#7, sentiment#8, score#9, words#210, filtered_words#228, raw_features#246, UDF(raw_features#246) AS features#263]
            +- Project [type#0, id#1, subreddit_id#20, subreddit_name#32, subreddit_nsfw#43, created_utc#5, permalink#6, body#7, sentiment#8, score#9, words#210, filtered_words#228, UDF(filtered_words#228) AS raw_features#246]
               +- Project [type#0, id#1, subreddit_id#20, subreddit_name#32, subreddit_nsfw#43, created_utc#5, permalink#6, body#7, sentiment#8, score#9, words#210, UDF(words#210) AS filtered_words#228]
                  +- Project [type#0, id#1, subreddit_id#20, subreddit_name#32, subreddit_nsfw#43, created_utc#5, permalink#6, body#7, sentiment#8, score#9, UDF(body#7) AS words#210]
                     +- Project [type#0, id#1, subreddit_id#20, subreddit_name#32, subreddit.nsfw#4 AS subreddit_nsfw#43, created_utc#5, permalink#6, body#7, sentiment#8, score#9]
                        +- Project [type#0, id#1, subreddit_id#20, subreddit.name#3 AS subreddit_name#32, subreddit.nsfw#4, created_utc#5, permalink#6, body#7, sentiment#8, score#9]
                           +- Project [type#0, id#1, subreddit.id#2 AS subreddit_id#20, subreddit.name#3, subreddit.nsfw#4, created_utc#5, permalink#6, body#7, sentiment#8, score#9]
                              +- Relation [type#0,id#1,subreddit.id#2,subreddit.name#3,subreddit.nsfw#4,created_utc#5,permalink#6,body#7,sentiment#8,score#9] parquet


In [20]:
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator()

# Evaluate clustering by computing Silhouette score
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")




Silhouette with squared euclidean distance = 0.9250503133607962


                                                                                

### Write results

In [None]:
# Optional: Save the results
output_path = "data/results/similar_subreddits_by_text.parquet"
similar_subreddits.write.mode("overwrite").parquet(output_path)

### Read results

In [None]:
df_read = spark.read.parquet(output_path)

In [None]:
# Show some sample results
df_read.show(10)
print("Total Count:", df_read.count())
