### Import Libraries

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import current_timestamp, lit, monotonically_increasing_id

### Define functions

In [0]:
def load_csv_from_s3(file_path: str) -> DataFrame:
    """
    Read one CSV object from the interview S3 bucket into a Spark DataFrame.

    The reader configuration is fixed: the first row supplies the column
    names and schema inference is switched off, so every column is read
    as a string.

    Args:
        file_path (str): Path of the object relative to the bucket root
                         (e.g., 'de/item.csv' or 'de/event.csv').

    Returns:
        DataFrame: The DataFrame produced by the CSV reader.
    """
    bucket_root = "s3a://merkle-de-interview-case-study/"
    source_uri = bucket_root + file_path

    # Reader settings, gathered in one place so they can be reviewed together.
    csv_options = {
        "header": "true",                    # first row holds the column names
        "inferSchema": "false",              # keep every column as string
        "multiLine": "true",                 # tolerate values spanning several lines
        "quote": '"',                        # character that wraps quoted fields
        "escape": '"',                       # a doubled quote escapes an inner quote
        "ignoreLeadingWhiteSpace": "true",
        "ignoreTrailingWhiteSpace": "true",
    }

    print(f"Loading data from: {source_uri}...")

    reader = spark.read.format("csv").options(**csv_options)
    loaded = reader.load(source_uri)

    print("Data loaded successfully.")
    return loaded


def add_technical_columns(df: DataFrame, source_file: str) -> DataFrame:
    """
    Return a copy of ``df`` extended with three technical (audit) columns.

    Columns added, in this order:
    - tc_ingestion_timestamp: value of current_timestamp()
    - tc_source_file: the ``source_file`` argument as a literal
    - tc_bronze_id: id produced by monotonically_increasing_id()

    Args:
        df (DataFrame): DataFrame to extend.
        source_file (str): Path of the file the rows were read from.

    Returns:
        DataFrame: ``df`` plus the technical columns.
    """
    # Build the result step by step; each step adds exactly one column.
    with_timestamp = df.withColumn("tc_ingestion_timestamp", current_timestamp())
    with_source = with_timestamp.withColumn("tc_source_file", lit(source_file))
    with_id = with_source.withColumn("tc_bronze_id", monotonically_increasing_id())
    return with_id

### Load Data

In [0]:
# Bucket-relative paths of the two source files.
item_path, event_path = "de/item.csv", "de/event.csv"

# Read both files as all-string DataFrames (item first, then event).
item_df = load_csv_from_s3(item_path)
event_df = load_csv_from_s3(event_path)

### Add Technical Columns

In [0]:
# Bronze version of the item data: raw columns plus the technical columns.
item_df_bronze = add_technical_columns(df=item_df, source_file=item_path)

# Bronze version of the event data, built the same way.
event_df_bronze = add_technical_columns(df=event_df, source_file=event_path)

### Save Tables

In [0]:
# Make sure the target schema exists before any table is written into it.
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze_layer")

# Persist both bronze DataFrames as Delta tables, using "overwrite" save mode.
item_df_bronze.write.format("delta").mode("overwrite").saveAsTable("bronze_layer.bronze_item")
event_df_bronze.write.format("delta").mode("overwrite").saveAsTable("bronze_layer.bronze_event")

# Report the fully qualified names the tables were actually saved under.
print("Bronze Delta tables created: bronze_layer.bronze_item, bronze_layer.bronze_event")