In [0]:
from pyspark.sql import DataFrame

In [0]:
def load_csv_from_s3(file_path: str) -> DataFrame:
    """
    Load a single CSV file from the public S3 interview bucket into a
    Spark DataFrame using fixed reader options.

    The first row is used as the header and column types are inferred.
    The function relies on a `spark` session object already being present
    in the notebook's global namespace; it does not create one.

    Args:
        file_path (str): Path of the file within the S3 bucket
                         (e.g., 'de/item.csv' or 'de/event.csv').
                         Leading slashes are ignored, so '/de/item.csv'
                         resolves to the same object as 'de/item.csv'.

    Returns:
        DataFrame: The loaded Spark DataFrame.

    Raises:
        TypeError: If `file_path` is not a string.
        ValueError: If `file_path` is empty (or consists only of slashes),
                    which would otherwise point the reader at the bucket
                    root instead of a single file.
    """
    S3_BASE_URL = "s3a://merkle-de-interview-case-study/"

    if not isinstance(file_path, str):
        raise TypeError(
            f"file_path must be a str, got {type(file_path).__name__}"
        )

    # The base URL already ends with '/', so drop leading slashes to avoid
    # building 's3a://bucket//key'.
    relative_path = file_path.lstrip("/")
    if not relative_path:
        raise ValueError("file_path must name a file inside the S3 bucket")

    full_s3_path = S3_BASE_URL + relative_path

    print(f"Loading data from: {full_s3_path}...")

    df = (spark.read
        .format("csv")
        .option("header", "true")      # Use the first row as column names
        .option("inferSchema", "true") # Automatically determine column data types
        .load(full_s3_path)
    )

    print("Data loaded successfully.")
    return df

In [0]:
# 1. Load the item.csv file
item_path = "de/item.csv"
item_df = load_csv_from_s3(item_path)

# 2. Load the event.csv file
event_path = "de/event.csv"
event_df = load_csv_from_s3(event_path)

# --- Verification ---
# For each DataFrame: print the inferred schema, then preview the first 5 rows.
# The row cap is applied with limit(5) on the DataFrame itself; the notebook's
# display() renderer is then called without arguments, since a positional
# number is not a row-count parameter for it.
print("\n--- Item DataFrame Verification ---")
item_df.printSchema()
item_df.limit(5).display()

print("\n--- Event DataFrame Verification ---")
event_df.printSchema()
event_df.limit(5).display()