In [2]:
import os
import random
import shutil

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, udf
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType

In [None]:
# -------------------------------------
# Step 1: Initialize Spark
# -------------------------------------

# Configure the session first, then create it (or reuse one that already exists)
session_builder = (
    SparkSession.builder
    .appName("Reddit Comments Scrambling")
    .master("local[*]")
)
spark = session_builder.getOrCreate()

# The SparkContext gives access to the RDD API used by the later cells
sc = spark.sparkContext


In [None]:
# -------------------------------------
# Step 2: Data Ingestion (Bronze Layer)
# -------------------------------------

# TODO: Needs to have files on hdfs
# raw_data_path = "hdfs://your_hdfs_path/data/subset/100000-reddit-covid-comments.csv"  # Path on HDFS
raw_data_path = "data/subset/100000-reddit-covid-comments.csv"  # local path

# Reader settings for a CSV whose quoted fields may span several lines
csv_options = dict(
    header="true",         # first line holds the column names
    multiLine="true",      # a quoted field may contain line breaks
    escape='"',            # a double quote escapes special characters
    quote='"',             # fields are delimited by double quotes
    mode="DROPMALFORMED",  # rows that cannot be parsed are dropped
)

# Load the raw comments with the settings above
csv_reader = spark.read.options(**csv_options)
raw_df = csv_reader.csv(raw_data_path)

# Quick look at the first rows
print("Raw Data Sample:")
raw_df.show(5)

# Save raw data into the Bronze layer on HDFS
# raw_data_bronze_path = "hdfs://your_hdfs_path/data/bronze_layer/"
# raw_df.write.mode("overwrite").parquet(raw_data_bronze_path)

In [29]:
# Convert DataFrame to RDD for MapReduce-like operations
def extract_fields(row):
    """Return the ten columns used by the pipeline as a plain tuple.

    `row` only needs to support lookup by column name. The tuple order is:
    type, id, subreddit.id, subreddit.name, subreddit.nsfw, created_utc,
    permalink, body, sentiment, score.
    """
    wanted_columns = (
        'type',
        'id',
        'subreddit.id',
        'subreddit.name',
        'subreddit.nsfw',
        'created_utc',
        'permalink',
        'body',
        'sentiment',
        'score',
    )
    return tuple(row[column] for column in wanted_columns)

# Drop down to the RDD API and keep only the tuple built by extract_fields
source_rows = raw_df.rdd
raw_rdd = source_rows.map(extract_fields)


In [30]:
# -------------------------------------
# Step 3: Data Scrambling
# -------------------------------------
def scramble_row(row):
    """
    Turn one extracted row into a messy, newline-separated "key: value" record.

    `row` is the 10-tuple produced by extract_fields, in this order:
    (type, id, subreddit.id, subreddit.name, subreddit.nsfw, created_utc,
     permalink, body, sentiment, score).

    The first output line is "<type>: <id>"; every other field is written as
    "<column name>: <value>". With roughly 30% probability one line break is
    inserted somewhere in the first half of the body to make the record messy.

    Returns the whole record as a single string with fields joined by newlines.
    """
    # Header line: the type is used as the key and the id as its value
    type_and_id_field = f"{row[0]}: {row[1]}"
    subreddit_id = f"subreddit.id: {row[2]}"
    subreddit_name = f"subreddit.name: {row[3]}"
    # The colon must follow the key so the line is a regular "key: value" pair
    subreddit_nsfw = f"subreddit.nsfw: {row[4]}"
    created_utc = f"created_utc: {row[5]}"
    permalink = f"permalink: {row[6]}"

    # extract_fields puts the body at index 7 and the sentiment at index 8
    body = "" if row[7] is None else str(row[7])
    sentiment = f"sentiment: {row[8]}"

    # Randomly insert a line break in the body (30% chance); empty bodies are
    # left alone so no stray blank line is produced
    if body and random.random() > 0.7:
        split_point = random.randint(0, len(body) // 2)  # Random split location
        body = body[:split_point] + "\n" + body[split_point:]

    body_field = f"body: {body}"
    score = f"score: {row[9]}"

    fields = [
        type_and_id_field, subreddit_id, subreddit_name, subreddit_nsfw,
        created_utc, permalink, sentiment, body_field, score
    ]

    # Join the fields with newlines to create an unstructured format
    return "\n".join(fields)


In [None]:
# Scramble data using Map function (MapReduce style)
# scramble_row is randomized and an RDD is recomputed for every action, so
# without caching the sample printed here could differ from the rows saved
# later. cache() is best-effort: evicted partitions would still be recomputed.
scrambled_rdd = raw_rdd.map(scramble_row).cache()

# Debug: Check the counts and a sample
print("Count of raw RDD:", raw_rdd.count())
print("Count of scrambled RDD:", scrambled_rdd.count())
print("Sample rows from scrambled RDD:", scrambled_rdd.take(5))


In [None]:
# Save scrambled data as text (Spark writes a directory of part files)
# scrambled_data_path = "hdfs://your_hdfs_path/data/scrambled_layer/scrambled-reddit-covid-comments.txt"
scrambled_data_path = "data/scrambled/spark-scrambled-reddit-covid-comments"

# Clean up the old output directory if it exists so the notebook can be re-run
if os.path.exists(scrambled_data_path):
    shutil.rmtree(scrambled_data_path)  # Delete the existing directory

try:
    # Save scrambled data to local filesystem as a text file
    scrambled_rdd.saveAsTextFile(scrambled_data_path)
    print(f"Scrambled data saved to: {scrambled_data_path}")
except Exception as e:
    print(f"Error saving scrambled data: {e}")
    # Stop here: the cleaning cells read this directory and would otherwise
    # fail later with a less helpful error
    raise

In [None]:
# -------------------------------------
# Step 4: Data Cleaning (Silver Layer)
# -------------------------------------
def clean_data(unstructured_row):
    """
    Rebuild a structured row from one scrambled, newline-separated record.

    Each line is expected to look like "key: value". The record header is
    "<type>: <id>" where <type> is "comment" or "post". Lines that do not
    start with a known key are treated as a continuation of the body when
    they directly follow it (this undoes the line break inserted while
    scrambling) and are ignored otherwise. The first occurrence of a key
    wins; later duplicates are dropped.

    A legacy line of the form "subreddit.nsfw <value>:" (colon after the
    value) is also accepted.

    Returns a 10-tuple:
    (type, id, subreddit.id, subreddit.name, subreddit.nsfw, created_utc,
     permalink, body, sentiment, score), with defaults for missing fields.
    """
    known_keys = {
        "comment", "post", "id", "subreddit.id", "subreddit.name",
        "subreddit.nsfw", "created_utc", "permalink", "sentiment",
        "body", "score",
    }
    nsfw_prefix = "subreddit.nsfw "

    # Dictionary to store the extracted key-value pairs
    field_dict = {}
    current_key = None  # key that a following continuation line belongs to

    for raw_line in unstructured_row.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        head, sep, tail = line.partition(":")
        head = head.strip()
        key = value = None
        if sep and head in known_keys and (not tail or tail[0] == " "):
            # Regular "key: value" line (an empty value is allowed)
            key, value = head, tail.strip()
        elif sep and head.startswith(nsfw_prefix) and not tail.strip():
            # Legacy "subreddit.nsfw <value>:" line with a misplaced colon
            key, value = "subreddit.nsfw", head[len(nsfw_prefix):].strip()

        if key is None:
            # Not a field line: only the body may continue on the next line
            if current_key == "body":
                field_dict["body"] = f"{field_dict['body']} {line}".strip()
            continue

        if key in field_dict:
            # Handle duplicate keys by ignoring subsequent ones
            current_key = None
        else:
            field_dict[key] = value
            current_key = key

    # The header line carries the type as its key and the id as its value
    if "comment" in field_dict:
        type_field, id_field = "comment", field_dict["comment"]
    elif "post" in field_dict:
        type_field, id_field = "post", field_dict["post"]
    else:
        type_field, id_field = "post", ""
    if not id_field:
        # Fall back to an explicit "id: ..." line if one is present
        id_field = field_dict.get("id", "")

    # Reconstruct fields, using defaults where necessary
    subreddit_id = field_dict.get("subreddit.id", "")
    subreddit_name = field_dict.get("subreddit.name", "")
    subreddit_nsfw = field_dict.get("subreddit.nsfw", "False")  # Default to 'False'
    created_utc = field_dict.get("created_utc", "")
    permalink = field_dict.get("permalink", "")
    sentiment = field_dict.get("sentiment", "NULL")
    body = field_dict.get("body", "")
    score = field_dict.get("score", "NULL")

    # Return as a structured tuple
    return (type_field, id_field, subreddit_id, subreddit_name, subreddit_nsfw,
            created_utc, permalink, body, sentiment, score)


In [None]:
# Read the scrambled data back from disk (this is now an RDD of records).
# A record spans several lines, so a line-oriented sc.textFile() would hand
# clean_data() one line at a time. Whole part files are read instead and cut
# at every record header line ("comment: <id>" / "post: <id>").
# NOTE: wholeTextFiles loads each part file into memory as one string, and a
# body line that itself starts with "comment: " or "post: " would be mistaken
# for a record header.
def split_records(file_content):
    """Split the text of one part file into one string per scrambled record."""
    records, current_lines = [], []
    for line in file_content.split("\n"):
        if line.startswith(("comment: ", "post: ")) and current_lines:
            records.append("\n".join(current_lines))
            current_lines = []
        current_lines.append(line)
    if current_lines:
        records.append("\n".join(current_lines))
    # Drop chunks that only contain whitespace (e.g. the trailing newline)
    return [record for record in records if record.strip()]


scrambled_rdd = (
    sc.wholeTextFiles(scrambled_data_path)
    .flatMap(lambda path_and_text: split_records(path_and_text[1]))
)

# clean_data returns exactly one tuple per record, so map (not flatMap, which
# would explode every tuple into ten separate strings) is the right operation
cleaned_rdd = scrambled_rdd.map(clean_data)

# Save cleaned data to the Silver layer
# cleaned_data_silver_path = "hdfs://your_hdfs_path/data/silver_layer/"

# Relative path, consistent with the other local data paths in this notebook
cleaned_data_silver_path = "data/cleaned/"

# Remove the output of a previous run so the notebook can be re-run
if os.path.exists(cleaned_data_silver_path):
    shutil.rmtree(cleaned_data_silver_path)

cleaned_rdd.saveAsTextFile(cleaned_data_silver_path)


In [None]:
# -------------------------------------
# Step 5: DataFrame for Gold Layer
# -------------------------------------
# Convert cleaned RDD back to DataFrame for potential SQL-based operations.
# The column list must match the 10-tuple returned by clean_data, including
# "sentiment" between "body" and "score".
cleaned_df = cleaned_rdd.toDF([
    "type", "id", "subreddit.id", "subreddit.name", "subreddit.nsfw",
    "created_utc", "permalink", "body", "sentiment", "score",
])

# Save cleaned data to the Silver layer in Parquet format
# cleaned_data_silver_parquet_path = "hdfs://your_hdfs_path/data/silver_layer_parquet/"  # Path on HDFS
cleaned_data_silver_parquet_path = "data/silver_layer_parquet/"  # local path
cleaned_df.write.mode("overwrite").parquet(cleaned_data_silver_parquet_path)

# Now the DataFrame is ready for future SQL or analytics in the Gold layer.


In [None]:
# -------------------------------------
# Part 6: Data Serving (Gold Layer)
# -------------------------------------
# You can load the cleaned Parquet data and perform advanced queries/analytics using Spark SQL for the Gold layer

# Example: Analyze subreddit statistics
cleaned_df.createOrReplaceTempView("reddit_comments")
# Example SQL operation: Sentiment analysis or any business logic

# TODO: This is probably too simple, as it only counts comments per subreddit
# and ignores the score (thumbs up/down) and the sentiment

# The column is called "subreddit.name"; a name containing a dot has to be
# quoted with backticks, otherwise Spark SQL reads it as <table>.<column>
result_df = spark.sql("""
    SELECT `subreddit.name` AS subreddit_name, COUNT(*) AS comment_count
    FROM reddit_comments
    GROUP BY `subreddit.name`
    HAVING COUNT(*) > 50
    ORDER BY comment_count DESC
""")

# Show results
result_df.show()

# Save the final results to the Gold layer (for dashboard or further analysis)
# final_data_gold_path = "hdfs://your_hdfs_path/data/gold_layer/"  # Path on HDFS
final_data_gold_path = "data/gold_layer/"  # local path
result_df.write.mode("overwrite").parquet(final_data_gold_path)
