##### 1. Import Required Libraries

In [1]:
import pandas as pd
from datetime import datetime
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import (
    StructType, StructField, StringType, FloatType, DoubleType,
    LongType, IntegerType, BooleanType
)
from pyspark.sql.functions import (
    col, when, row_number, coalesce,
    from_unixtime, current_date, datediff,
    lit, current_timestamp, concat_ws,
    crc32, md5
)
from delta.tables import DeltaTable

StatementMeta(, dbdf5672-6fd7-410b-b31f-0e00edc71b66, 3, Finished, Available, Finished)

##### 2. Define Silver Layer Schema

In [2]:
# Column contract of the silver-layer table, declared as
# (column name, Spark type class, nullable) triples and expanded into
# StructFields in exactly this order.
silver_layer_schema = StructType([
    StructField(field_name, spark_type(), is_nullable)
    for field_name, spark_type, is_nullable in (
        # Row identifier (required)
        ("Submission_Fct_id", StringType, False),
        # pk_* key columns (required)
        ("pk_engagement_id", LongType, False),
        ("pk_post_id", LongType, False),
        ("pk_time_id", LongType, False),
        ("pk_date_id", LongType, False),
        ("pk_author_id", LongType, False),
        # sk_* key columns (required)
        ("sk_engagement_id", LongType, False),
        ("sk_post_id", LongType, False),
        ("sk_time_id", LongType, False),
        ("sk_date_id", LongType, False),
        ("sk_author_id", LongType, False),
        # Author and calendar attributes
        ("author_name", StringType, True),
        ("year", IntegerType, True),
        ("month", IntegerType, True),
        ("day", IntegerType, True),
        ("hour", IntegerType, True),
        ("weekday", StringType, True),
        # Post attributes
        ("num_comments", IntegerType, True),
        ("link_flair_text", StringType, True),
        ("url", StringType, True),
        ("title", StringType, True),
        # Measures, flags and ratios
        ("engagement_score", DoubleType, True),
        ("Total_Awards_Received", IntegerType, True),
        ("is_adult_content", IntegerType, True),
        ("is_spoiler", IntegerType, True),
        ("is_stickied", IntegerType, True),
        ("award_rate", DoubleType, True),
        ("score_upvote_ratio", DoubleType, True),
        ("title_length", IntegerType, True),
        ("post_age_days", IntegerType, True),
        ("engagement_ratio", DoubleType, True),
        ("has_awards", IntegerType, True),
        ("has_crossposts", IntegerType, True),
        ("Gilded_Count", IntegerType, True),
        ("Number_of_Crossposts", IntegerType, True),
        ("score", IntegerType, True),
        ("upvote_ratio", FloatType, True),
    )
])

StatementMeta(, dbdf5672-6fd7-410b-b31f-0e00edc71b66, 4, Finished, Available, Finished)

##### 3. Helper Functions

In [10]:
def get_max_id(database_name, table_name, pk_column):
    """Return the current maximum of ``pk_column`` in ``database_name.table_name``.

    Args:
        database_name: Database that holds the table.
        table_name: Table to inspect.
        pk_column: Column whose maximum is requested.

    Returns:
        The MAX() of the column, or 0 when the table is not registered in the
        catalog or the query yields None.
    """
    qualified_name = f"{database_name}.{table_name}"

    # Nothing to look up when the table has not been created yet.
    if not spark.catalog.tableExists(qualified_name):
        return 0

    query = f"SELECT MAX({pk_column}) AS max_id FROM {qualified_name}"
    rows = spark.sql(query).collect()
    current_max = rows[0]["max_id"]

    if current_max is None:
        return 0
    return current_max

def process_table(database_name, table_name, pk_column, sk_column, sdf):
    """Add a sequential key column and a hash key column for one dimension.

    ``pk_column`` is ``row_number()`` over a window ordered by the dimension's
    ordering columns, offset by the current maximum of ``pk_column`` in
    ``database_name.table_name`` (0 when that table does not exist), cast to
    bigint. ``sk_column`` is ``crc32(md5(...))`` of the dimension's hash-input
    columns, each cast to string with nulls replaced by "unknown" and joined
    with "|".

    Args:
        database_name: Database queried for the existing maximum key.
        table_name: Dimension name; selects the ordering and hash columns.
        pk_column: Name of the sequential key column to add.
        sk_column: Name of the hash key column to add.
        sdf: Input Spark DataFrame.

    Returns:
        ``sdf`` with ``pk_column`` and ``sk_column`` added.

    NOTE(review): the window has an ordering but no partitioning, so the
    numbering spans the whole DataFrame and every row gets its own number -
    confirm that per-row (rather than per-distinct-value) keys are intended.
    NOTE(review): for "dim_author" the hash inputs include "pk_author_id",
    which is the column generated just above when ``pk_column`` is
    "pk_author_id" - confirm this is intended.
    NOTE(review): an unknown ``table_name`` orders by the placeholder
    "some_column" - presumably a stub; verify.
    """
    # Columns hashed into the sk column, per dimension.
    hash_inputs_by_table = {
        "dim_author": ["pk_author_id", "author_name"],
        "dim_date": ["year", "month", "day", "weekday"],
        "dim_time": ["hour"],
        "dim_post": ["url"],
        "dim_engagement_attributes": ["is_adult_content", "is_spoiler", "is_stickied", "has_awards", "has_crossposts"]
    }

    # Columns that drive the row_number() ordering, per dimension.
    ordering_by_table = {
        "dim_author": ["author_name"],
        "dim_date": ["year"],
        "dim_time": ["hour"],
        "dim_post": ["url"],
        "dim_engagement_attributes": ["is_adult_content"]
    }

    if table_name in ordering_by_table:
        order_cols = ordering_by_table[table_name]
    else:
        order_cols = ["some_column"]

    if table_name in hash_inputs_by_table:
        hash_cols = hash_inputs_by_table[table_name]
    else:
        # Fallback: hash every incoming column except the key being generated.
        hash_cols = [name for name in sdf.columns if name != pk_column]

    start_from = get_max_id(database_name, table_name, pk_column)

    # Sequential key: continue numbering after the largest existing key.
    numbering = row_number().over(Window.orderBy(*order_cols))
    with_pk = sdf.withColumn(pk_column, (numbering + start_from).cast("bigint"))

    # Hash key: null-safe string form of each input, joined and hashed.
    hash_parts = [
        coalesce(col(name).cast("string"), lit("unknown"))
        for name in hash_cols
    ]
    hashed = crc32(md5(concat_ws("|", *hash_parts)))

    return with_pk.withColumn(sk_column, hashed)

def add_date_time_columns(sdf):
    """Add ``created_at`` plus year/month/day/hour columns derived from it.

    ``created_at`` is ``from_unixtime`` applied to the ``created_utc`` column;
    the four calendar columns are then extracted from ``created_at``.

    Args:
        sdf: Spark DataFrame with a ``created_utc`` column.

    Returns:
        DataFrame with ``created_at``, ``year``, ``month``, ``day`` and
        ``hour`` appended, in that order.
    """
    result = sdf.withColumn("created_at", from_unixtime(col("created_utc")))

    # Output column name -> extractor applied to "created_at".
    calendar_parts = {
        "year": F.year,
        "month": F.month,
        "day": F.dayofmonth,
        "hour": F.hour,
    }
    for part_name, extract in calendar_parts.items():
        result = result.withColumn(part_name, extract("created_at"))

    return result

def fill_missing_values(sdf):
    """Replace nulls in selected columns with fixed per-column defaults.

    Args:
        sdf: Spark DataFrame to clean.

    Returns:
        The result of ``sdf.fillna`` called with the column -> default mapping.
    """
    # Placeholder text for missing strings.
    text_defaults = {
        "author_name": "no_name",
        "link_flair_text": "No Link Flair",
        "title": "No title for this submission",
    }
    # Missing counters become zero.
    count_defaults = {
        "num_comments": 0,
        "Total_Awards_Received": 0,
        "Number_of_Crossposts": 0,
    }
    # Missing boolean flags become False.
    flag_defaults = {
        "over_18": False,
        "spoiler": False,
        "stickied": False,
    }

    defaults = {**text_defaults, **count_defaults, **flag_defaults}
    return sdf.fillna(defaults)

def add_derived_columns(sdf):
    """Append engagement metrics, 0/1 flags and text/date features.

    Reads ``score``, ``num_comments``, ``over_18``, ``spoiler``, ``stickied``,
    ``Total_Awards_Received``, ``Number_of_Crossposts``, ``upvote_ratio``,
    ``title``, ``created_utc`` and ``created_at`` from ``sdf``.

    Args:
        sdf: Spark DataFrame containing the columns listed above.

    Returns:
        DataFrame with the derived columns added in the order declared below.
    """
    score = col("score")
    comments = col("num_comments")
    awards = col("Total_Awards_Received")
    upvote_ratio = col("upvote_ratio")
    no_value = 0.00

    def as_flag(column_name):
        # 1 when the boolean column is true, 0 otherwise (including null).
        return when(col(column_name) == True, 1).otherwise(0)

    def is_positive(column_name):
        # 1 when the null-safe count is above zero, 0 otherwise.
        return when(coalesce(col(column_name), lit(0)) > 0, 1).otherwise(0)

    # Insertion order is the order in which the columns are added.
    derived = {
        "engagement_score": score + comments * 0.5,
        "is_adult_content": as_flag("over_18"),
        "is_spoiler": as_flag("spoiler"),
        "is_stickied": as_flag("stickied"),
        # Ratios guard against a non-positive denominator.
        "award_rate": when(comments > 0, awards / comments).otherwise(no_value),
        "score_upvote_ratio": when(upvote_ratio > 0, score / upvote_ratio).otherwise(no_value),
        "title_length": F.length(col("title")),
        "post_age_days": datediff(current_date(), F.to_date(from_unixtime(col("created_utc")))),
        "engagement_ratio": when(awards > 0, comments / awards).otherwise(no_value),
        "has_awards": is_positive("Total_Awards_Received"),
        "has_crossposts": is_positive("Number_of_Crossposts"),
        "weekday": F.date_format(col("created_at"), "E"),
    }

    result = sdf
    for column_name, expression in derived.items():
        result = result.withColumn(column_name, expression)
    return result

def save_to_metastore(spark_df, database_name, table_name, partition_columns=None):
    """
    Overwrite a Delta table in the metastore with a Spark DataFrame.

    Creates the database when it does not exist, writes the DataFrame in
    "overwrite" mode (existing table contents are replaced, not merged) with
    the "mergeSchema" option enabled, refreshes the catalog entry and prints
    the row count of the saved table.

    Args:
        spark_df: Spark DataFrame to save
        database_name: Target database name
        table_name: Target table name
        partition_columns: Column name, or list of column names, to
            partition by (optional)

    Raises:
        ValueError: If ``spark_df`` is not a Spark DataFrame, or if a
            partition column is not present in ``spark_df``.
    """
    # Validate input DataFrame
    if not isinstance(spark_df, DataFrame):
        raise ValueError("Input must be a Spark DataFrame")

    # A bare string would otherwise be iterated character by character below.
    if isinstance(partition_columns, str):
        partition_columns = [partition_columns]

    # Create database if not exists
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")
    full_table_name = f"{database_name}.{table_name}"

    # Validate partition columns
    if partition_columns:
        missing_cols = [name for name in partition_columns if name not in spark_df.columns]
        if missing_cols:
            raise ValueError(f"Partition columns not found in DataFrame: {missing_cols}")

    # Write data to the table
    writer = spark_df.write.format("delta") \
        .mode("overwrite") \
        .option("mergeSchema", "true")

    if partition_columns:
        writer = writer.partitionBy(*partition_columns)

    writer.saveAsTable(full_table_name)

    print(f"Table {full_table_name} populated successfully.")

    # Refresh metadata and verify
    spark.catalog.refreshTable(full_table_name)
    print(f"✅ Successfully saved to '{full_table_name}'")

    # Count what was actually written instead of calling spark_df.count(),
    # which would re-run the DataFrame's whole upstream computation.
    saved_row_count = spark.table(full_table_name).count()
    print(f"📊 Row count: {saved_row_count}")

StatementMeta(, dbdf5672-6fd7-410b-b31f-0e00edc71b66, 12, Finished, Available, Finished)

##### 4. Call Transformation Logic

In [11]:
# Load the bronze-layer Delta table that feeds this notebook.
bronze_df = spark.read.format("delta").table("bronze_layer.reddit_extracted_data")

# Apply transformations.
# add_date_time_columns runs first because add_derived_columns reads the
# "created_at" column it creates.
df = add_date_time_columns(bronze_df)
df = fill_missing_values(df)
df = add_derived_columns(df)

# Process dimensions: for each entry, process_table adds the pk/sk column
# pair; existing maximum keys are looked up in this database.
database = "gold_dimensional_modeling"
tables = [
    {"table_name": "dim_author", "pk_column": "pk_author_id", "sk_column": "sk_author_id"},
    {"table_name": "dim_date", "pk_column": "pk_date_id", "sk_column": "sk_date_id"},
    {"table_name": "dim_time", "pk_column": "pk_time_id", "sk_column": "sk_time_id"},
    {"table_name": "dim_post", "pk_column": "pk_post_id", "sk_column": "sk_post_id"},
    {"table_name": "dim_engagement_attributes", "pk_column": "pk_engagement_id", "sk_column": "sk_engagement_id"}
]

# Each call extends the DataFrame returned by the previous one.
for t in tables:
    df = process_table(database, t["table_name"], t["pk_column"], t["sk_column"], df)

# Select final columns, in the order declared by silver_layer_schema.
final_columns = [field.name for field in silver_layer_schema.fields]
df = df.select(final_columns)
# Rebuild the DataFrame from its RDD under the declared silver schema.
# NOTE(review): this round-trips every row through the RDD API; a select
# with per-column casts may be cheaper - confirm before changing.
df = spark.createDataFrame(df.rdd, schema=silver_layer_schema)

# Save to the Silver layer, partitioned by year and month.
save_to_metastore(df, "Silver_Layer", "Transformed_Data", partition_columns=["year", "month"])


StatementMeta(, dbdf5672-6fd7-410b-b31f-0e00edc71b66, 13, Finished, Available, Finished)

Table Silver_Layer.Transformed_Data populated successfully.
✅ Successfully saved to 'Silver_Layer.Transformed_Data'
📊 Row count: 38389
