In [None]:
# Boilerplate Jupyter Notebook for Spark-based Data Pipeline Project

# Import necessary libraries
import csv
import io

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, FloatType, IntegerType


In [None]:
# -------------------------------------
# Part 1: Data Ingestion (Bronze Layer)
# -------------------------------------

# Initialize Spark
# Build the session through the builder: getOrCreate() reuses a live session/context,
# so re-running this cell does not fail with "Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName("E-commerce Clickstream Analysis").setMaster("local[*]")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext  # kept for the RDD-based steps further down

# Data Ingestion: Load the raw data (CSV, JSON, etc.) from cloud storage or local file system
# Example: Loading a CSV file from local storage
raw_data_path = "/path/to/your/dataset.csv"  # Modify this path
# No schema inference is requested, so every column is loaded as a string.
raw_df = spark.read.option("header", "true").csv(raw_data_path)

# Display raw data sample
print("Raw Data Sample:")
raw_df.show(5)

# Save raw data into the Bronze layer (Uncleaned/Raw Data)
# Optionally, you could save this to cloud storage, but for now, we'll store it locally
raw_data_bronze_path = "/path/to/bronze_layer/"  # Modify this path
# mode("overwrite") keeps the cell idempotent across re-runs.
raw_df.write.mode("overwrite").parquet(raw_data_bronze_path)

In [None]:
# -------------------------------------
# Part 2: Data Cleaning (Silver Layer)
# -------------------------------------

# Define cleaning and transformation functions (using Spark RDDs and MapReduce approach)

# Function to clean and process the raw data
def clean_data(line):
    """Parse one raw CSV record into a typed 9-field tuple.

    The record is parsed with the ``csv`` module, so quoted fields may contain
    commas (e.g. a comment body) without shifting the columns.

    Args:
        line: A single CSV-formatted record with at least nine columns, in the
            order id, subreddit_id, subreddit_name, nsfw, created_utc,
            permalink, body, sentiment, score.

    Returns:
        ``(id, subreddit_id, subreddit_name, nsfw, created_utc, permalink,
        body, sentiment, score)`` with ``created_utc`` and ``score`` as ``int``
        and ``sentiment`` as ``float``; the other fields stay strings.
        ``None`` when the record cannot be parsed (too few columns, a
        non-numeric value such as the header row, or malformed input), so the
        caller can filter bad rows out.
    """
    try:
        # Default of [] makes an empty input fall through to the IndexError path.
        fields = next(csv.reader([line]), [])
        return (
            fields[0],         # id
            fields[1],         # subreddit_id
            fields[2],         # subreddit_name
            fields[3],         # nsfw (left as the raw string)
            int(fields[4]),    # created_utc
            fields[5],         # permalink
            fields[6],         # body
            float(fields[7]),  # sentiment
            int(fields[8]),    # score
        )
    except (IndexError, ValueError, csv.Error):
        # Only parse/conversion failures mark a row as bad; anything else is a
        # real bug and should surface instead of being silently dropped.
        return None

# Apply cleaning function using Spark RDD
# The Bronze layer was written as Parquet (a binary columnar format), so it cannot be
# read line-by-line with sc.textFile(). Load it as a DataFrame and turn each Row back
# into the CSV-formatted line that clean_data() expects.
def _row_to_csv_line(row):
    """Serialize one Bronze-layer Row as a single CSV line, preserving column order."""
    buffer = io.StringIO()
    # csv.writer quotes fields containing commas/quotes/newlines and writes None as "".
    csv.writer(buffer).writerow(row)
    return buffer.getvalue().rstrip("\r\n")  # drop the writer's line terminator

bronze_df = spark.read.parquet(raw_data_bronze_path)
# NOTE(review): clean_data() reads columns by position, so this assumes the Bronze
# column order matches the order it expects — confirm against the real dataset.
raw_rdd = bronze_df.rdd.map(_row_to_csv_line)
cleaned_rdd = raw_rdd.map(clean_data).filter(lambda x: x is not None)

# Convert cleaned RDD back to DataFrame
cleaned_df = cleaned_rdd.toDF(["id", "subreddit_id", "subreddit_name", "nsfw", "created_utc", "permalink", "body", "sentiment", "score"])

# Display cleaned data sample
print("Cleaned Data Sample:")
cleaned_df.show(5)

# Save cleaned data to the Silver layer
cleaned_data_silver_path = "/path/to/silver_layer/"
cleaned_df.write.mode("overwrite").parquet(cleaned_data_silver_path)


In [None]:
# -------------------------------------
# Part 3: Data Serving (Gold Layer)
# -------------------------------------

# Business Use Case 1: average sentiment per subreddit (Spark SQL)

# Where the Gold-layer aggregate is written (for further analysis or a dashboard)
final_data_gold_path = "/path/to/gold_layer/"

# Re-read the cleaned data from the Silver layer and expose it to SQL
cleaned_df = spark.read.parquet(cleaned_data_silver_path)
cleaned_df.createOrReplaceTempView("reddit_comments")

# Mean sentiment and comment count per subreddit, restricted to subreddits
# with more than 50 comments and sorted from most to least positive
sentiment_analysis_df = spark.sql("""
    SELECT subreddit_name, AVG(sentiment) as avg_sentiment, COUNT(*) as comment_count
    FROM reddit_comments
    GROUP BY subreddit_name
    HAVING COUNT(*) > 50  -- Subreddits with more than 50 comments
    ORDER BY avg_sentiment DESC
""")

# Preview the top of the ranking
print("Sentiment Analysis Results:")
sentiment_analysis_df.show(10)

# Persist the aggregate, replacing any previous run's output
sentiment_analysis_df.write.parquet(final_data_gold_path, mode="overwrite")


In [None]:
# -------------------------------------
# Part 4: Optional - Machine Learning Model (Bonus)
# -------------------------------------

# Example: Building a simple model to predict whether a comment will have a high score (classification task)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create features column (example: based on sentiment, created_utc)
assembler = VectorAssembler(inputCols=["sentiment", "created_utc"], outputCol="features")

# Assemble features and define a binary label (1 = score above 5, else 0)
ml_df = (
    assembler.transform(cleaned_df)
    .withColumn("label", (col("score") > 5).cast(IntegerType()))
)

# Hold out 20% of the rows so the metric is measured on data the model has not seen;
# the fixed seed keeps the split reproducible across runs.
train_df, test_df = ml_df.randomSplit([0.8, 0.2], seed=42)

# Train a logistic regression model on the training split only
lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_df)

# Evaluate the model on the held-out split.
# BinaryClassificationEvaluator reports area under the ROC curve (not accuracy);
# the metric is spelled out explicitly so the printed label matches the number.
predictions = lr_model.transform(test_df)
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print(f"Model AUC (area under ROC): {auc}")

# Save the model for future use
# write().overwrite() lets the cell be re-run; a plain save() fails if the path exists.
model_path = "/path/to/saved_model/"
lr_model.write().overwrite().save(model_path)


In [None]:
# -------------------------------------
# Clean up Spark resources
# -------------------------------------
# Stop through the session: this shuts down the underlying SparkContext as well,
# so no stale session object is left pointing at a stopped context.
spark.stop()