In [2]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from datetime import datetime
import sys
sys.path.append('..')

from src.logging_utils import setup_logger, log_dataframe_stats

# Setup logger
logger = setup_logger(__name__, level="INFO")
logger.info("Starting Bronze layer ingestion")

2026-02-10 12:31:16 - __main__ - INFO - Starting Bronze layer ingestion


In [3]:
# Initialize Spark session (simplified for local development)
try:
    spark
    logger.info("Using existing Spark session")
except NameError:
    spark = SparkSession.builder \
        .appName("Bronze_Ingestion") \
        .config("spark.sql.catalogImplementation", "hive") \
        .getOrCreate()
    logger.info("Created new Spark session")

# Configuration
DATA_PATH = "../data"  # Local development path
BRONZE_PATH = "Tables/bronze"  # Bronze layer path (uses Parquet format)

# Note: Using Parquet format for local development (compatible with Delta in Fabric)

26/02/10 12:31:20 WARN Utils: Your hostname, kdcllc-ThinkPad-P16-Gen-2 resolves to a loopback address: 127.0.1.1; using 192.168.86.233 instead (on interface wlp0s20f3)
26/02/10 12:31:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/10 12:31:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


2026-02-10 12:31:21 - __main__ - INFO - Created new Spark session


## Task A: Load Customers Data

Read the customers CSV file and create a Bronze Delta table.

In [4]:
# Read customers CSV
logger.info("Reading customers.csv...")

customers_raw = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(f"{DATA_PATH}/customers.csv")

# Add metadata columns for lineage
customers_bronze = customers_raw \
    .withColumn("ingestion_timestamp", F.current_timestamp()) \
    .withColumn("source_file", F.lit("customers.csv")) \
    .withColumn("bronze_layer_id", F.monotonically_increasing_id())

# Log statistics
log_dataframe_stats(customers_bronze, "customers_bronze", logger)

# Display sample
display(customers_bronze.limit(5))

2026-02-10 12:31:29 - __main__ - INFO - Reading customers.csv...
2026-02-10 12:31:33 - __main__ - INFO - DataFrame 'customers_bronze' statistics:
2026-02-10 12:31:33 - __main__ - INFO -   - Rows: 30
2026-02-10 12:31:33 - __main__ - INFO -   - Columns: 9
2026-02-10 12:31:33 - __main__ - INFO -   - Column names: customer_id, name, email, phone, signup_date, extra_info, ingestion_timestamp, source_file, bronze_layer_id
2026-02-10 12:31:33 - __main__ - INFO -   - Null counts:
2026-02-10 12:31:33 - __main__ - INFO -     customer_id: 1 (3.33%)
2026-02-10 12:31:33 - __main__ - INFO -     name: 1 (3.33%)
2026-02-10 12:31:33 - __main__ - INFO -     email: 1 (3.33%)
2026-02-10 12:31:33 - __main__ - INFO -     phone: 1 (3.33%)
2026-02-10 12:31:33 - __main__ - INFO -     signup_date: 1 (3.33%)
2026-02-10 12:31:33 - __main__ - INFO -     extra_info: 24 (80.00%)


DataFrame[customer_id: string, name: string, email: string, phone: string, signup_date: date, extra_info: string, ingestion_timestamp: timestamp, source_file: string, bronze_layer_id: bigint]

In [5]:
# Write to Parquet (Bronze layer)
# Note: In Azure Fabric, this would be Delta format
logger.info("Writing customers to Bronze table...")

customers_bronze.write \
    .format("parquet") \
    .mode("overwrite") \
    .save(f"{BRONZE_PATH}/customers")

logger.info("✓ Customers Bronze table created successfully")

2026-02-10 12:31:38 - __main__ - INFO - Writing customers to Bronze table...
2026-02-10 12:31:38 - __main__ - INFO - ✓ Customers Bronze table created successfully


## Task A: Load Orders Data

Read the orders CSV file and create a Bronze Delta table.

In [6]:
# Read orders CSV
logger.info("Reading orders.csv...")

orders_raw = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(f"{DATA_PATH}/orders.csv")

# Add metadata columns for lineage
orders_bronze = orders_raw \
    .withColumn("ingestion_timestamp", F.current_timestamp()) \
    .withColumn("source_file", F.lit("orders.csv")) \
    .withColumn("bronze_layer_id", F.monotonically_increasing_id())

# Log statistics
log_dataframe_stats(orders_bronze, "orders_bronze", logger)

# Display sample with various status cases
display(orders_bronze.select("order_id", "customer_id", "order_date", "status", "quantity", "price").limit(10))

2026-02-10 12:31:43 - __main__ - INFO - Reading orders.csv...
2026-02-10 12:31:43 - __main__ - INFO - DataFrame 'orders_bronze' statistics:
2026-02-10 12:31:43 - __main__ - INFO -   - Rows: 36
2026-02-10 12:31:43 - __main__ - INFO -   - Columns: 10
2026-02-10 12:31:43 - __main__ - INFO -   - Column names: order_id, order_date, customer_id, status, quantity, price, notes, ingestion_timestamp, source_file, bronze_layer_id
2026-02-10 12:31:44 - __main__ - INFO -   - Null counts:
2026-02-10 12:31:44 - __main__ - INFO -     order_date: 1 (2.78%)
2026-02-10 12:31:44 - __main__ - INFO -     customer_id: 1 (2.78%)
2026-02-10 12:31:44 - __main__ - INFO -     status: 1 (2.78%)
2026-02-10 12:31:44 - __main__ - INFO -     quantity: 1 (2.78%)
2026-02-10 12:31:44 - __main__ - INFO -     price: 1 (2.78%)
2026-02-10 12:31:44 - __main__ - INFO -     notes: 21 (58.33%)


DataFrame[order_id: int, customer_id: string, order_date: date, status: string, quantity: int, price: double]

In [7]:
# Write to Parquet (Bronze layer)
# Note: In Azure Fabric, this would be Delta format
logger.info("Writing orders to Bronze table...")

orders_bronze.write \
    .format("parquet") \
    .mode("overwrite") \
    .save(f"{BRONZE_PATH}/orders")

logger.info("✓ Orders Bronze table created successfully")

2026-02-10 12:31:55 - __main__ - INFO - Writing orders to Bronze table...
2026-02-10 12:31:55 - __main__ - INFO - ✓ Orders Bronze table created successfully


## Data Quality Summary

Quick analysis of data quality issues in the Bronze layer.

In [8]:
# Check status value variations (important for Task B)
logger.info("Analyzing order status values...")

status_distribution = orders_bronze.groupBy("status").count().orderBy("status")
display(status_distribution)

logger.info("⚠️ Note: Status values have different cases (Complete, COMPLETE, complete)")
logger.info("   This will be addressed in the Silver layer transformation")

2026-02-10 12:32:00 - __main__ - INFO - Analyzing order status values...


DataFrame[status: string, count: bigint]

2026-02-10 12:32:00 - __main__ - INFO - ⚠️ Note: Status values have different cases (Complete, COMPLETE, complete)
2026-02-10 12:32:00 - __main__ - INFO -    This will be addressed in the Silver layer transformation
2026-02-10 12:32:00 - __main__ - INFO -    This will be addressed in the Silver layer transformation


In [None]:
# Summary statistics
total_customers = customers_bronze.count()
total_orders = orders_bronze.count()

print(f"""\n{'='*50}
Bronze Layer Ingestion Complete
{'='*50}
Customers ingested: {total_customers}
Orders ingested: {total_orders}

Next Steps:
→ Run notebook 02_Silver_Transform.ipynb to clean and transform data
{'='*50}\n""")


Bronze Layer Ingestion Complete
Customers ingested: 30
Orders ingested: 36

Next Steps:
→ Run notebook 02_Silver_Transform.ipynb to clean and transform data



: 