In [None]:
# Import required libraries
from datetime import datetime
import sys

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Put the parent directory on the import path so the `src` package imported
# below can be found (presumably it lives one level above this notebook).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '..' entry to sys.path every time.
if '..' not in sys.path:
    sys.path.append('..')

from src.logging_utils import setup_logger, log_dataframe_stats

# Setup logger: a single notebook-wide logger reused by every later cell.
logger = setup_logger(__name__, level="INFO")
logger.info("Starting Bronze layer ingestion")

In [None]:
# Initialize Spark session (if not already available in Fabric)
# In Fabric, spark session is pre-configured
try:
    # Keep the try body to the bare name lookup only, so that a NameError
    # raised by anything else (e.g. `logger` missing because the setup cell
    # was not run) is not mistaken for "no Spark session available".
    spark
except NameError:
    # No pre-existing session: build one with the Delta Lake SQL extension
    # and catalog configured.
    spark = (
        SparkSession.builder
        .appName("Bronze_Ingestion")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .getOrCreate()
    )
    logger.info("Created new Spark session")
else:
    logger.info("Using existing Spark session")

# Configuration
DATA_PATH = "../data"  # Local development
# DATA_PATH = "/lakehouse/default/Files"  # Fabric Lakehouse path

BRONZE_PATH = "Tables/bronze"  # Bronze layer path

## Task A: Load Customers Data

Read the customers CSV file and create a Bronze Delta table.

In [None]:
# Ingest the raw customers extract: the first row supplies the column names
# and Spark infers the column types from the data.
logger.info("Reading customers.csv...")

customers_raw = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(f"{DATA_PATH}/customers.csv")
)

# Lineage metadata appended to every row: when it was ingested, which file it
# came from, and a generated row id.
# NOTE(review): DataFrames are lazy, so these expressions are presumably
# re-evaluated by each action (display, count, write); the values in the
# sample below may differ from what is later persisted — confirm if exact
# lineage values matter.
customers_bronze = (
    customers_raw
    .withColumn("ingestion_timestamp", F.current_timestamp())
    .withColumn("source_file", F.lit("customers.csv"))
    .withColumn("bronze_layer_id", F.monotonically_increasing_id())
)

# Record DataFrame statistics through the shared logging helper.
log_dataframe_stats(customers_bronze, "customers_bronze", logger)

# Preview the first few rows.
display(customers_bronze.limit(5))

In [None]:
# Persist the customers DataFrame as a Delta table under the Bronze path.
# "overwrite" mode replaces any existing data, and overwriteSchema lets the
# stored schema be replaced as well.
logger.info("Writing customers to Bronze Delta table...")

(
    customers_bronze.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f"{BRONZE_PATH}/customers")
)

logger.info("✓ Customers Bronze table created successfully")

## Task A: Load Orders Data

Read the orders CSV file and create a Bronze Delta table.

In [None]:
# Ingest the raw orders extract: the first row supplies the column names and
# Spark infers the column types from the data.
logger.info("Reading orders.csv...")

orders_raw = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(f"{DATA_PATH}/orders.csv")
)

# Lineage metadata appended to every row: when it was ingested, which file it
# came from, and a generated row id.
# NOTE(review): DataFrames are lazy, so these expressions are presumably
# re-evaluated by each action (display, count, write); the values in the
# sample below may differ from what is later persisted — confirm if exact
# lineage values matter.
orders_bronze = (
    orders_raw
    .withColumn("ingestion_timestamp", F.current_timestamp())
    .withColumn("source_file", F.lit("orders.csv"))
    .withColumn("bronze_layer_id", F.monotonically_increasing_id())
)

# Record DataFrame statistics through the shared logging helper.
log_dataframe_stats(orders_bronze, "orders_bronze", logger)

# Preview the key order columns, including the raw (un-normalised) status.
display(
    orders_bronze
    .select("order_id", "customer_id", "order_date", "status", "quantity", "price")
    .limit(10)
)

In [None]:
# Persist the orders DataFrame as a Delta table under the Bronze path.
# "overwrite" mode replaces any existing data, and overwriteSchema lets the
# stored schema be replaced as well.
logger.info("Writing orders to Bronze Delta table...")

(
    orders_bronze.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f"{BRONZE_PATH}/orders")
)

logger.info("✓ Orders Bronze table created successfully")

## Data Quality Summary

Quick analysis of data quality issues in the Bronze layer.

In [None]:
# Check status value variations (important for Task B)
logger.info("Analyzing order status values...")

# Row count per raw status value, exactly as ingested.
status_distribution = orders_bronze.groupBy("status").count().orderBy("status")
display(status_distribution)

# Derive the case inconsistencies from the data instead of asserting them:
# bucket the distinct spellings by their lower-cased form and keep only the
# buckets that hold more than one spelling. The result is collected to the
# driver, which assumes the number of distinct status values is small.
status_case_variants = (
    orders_bronze
    .where(F.col("status").isNotNull())
    .groupBy(F.lower(F.col("status")).alias("status_lower"))
    .agg(F.sort_array(F.collect_set("status")).alias("spellings"))
    .where(F.size("spellings") > 1)
    .orderBy("status_lower")
    .collect()
)

if status_case_variants:
    # One comma-separated group per status, groups separated by "; ".
    spellings_found = "; ".join(
        ", ".join(str(spelling) for spelling in row["spellings"])
        for row in status_case_variants
    )
    logger.info(f"⚠️ Note: Status values have different cases ({spellings_found})")
    logger.info("   This will be addressed in the Silver layer transformation")
else:
    logger.info("No case variations found in order status values")

In [None]:
# Final summary. Row counts are taken from the Bronze DataFrames built in this
# notebook (they are not read back from the written Delta tables).
total_customers = customers_bronze.count()
total_orders = orders_bronze.count()

# Assemble the report line by line; the leading and trailing empty strings
# produce the blank line before and after the banner.
banner = "=" * 50
summary_lines = [
    "",
    banner,
    "Bronze Layer Ingestion Complete",
    banner,
    f"Customers ingested: {total_customers}",
    f"Orders ingested: {total_orders}",
    "",
    "Next Steps:",
    "→ Run notebook 02_Silver_Transform.ipynb to clean and transform data",
    banner,
    "",
]
print("\n".join(summary_lines))