In [1]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import sys
sys.path.append('..')

from src.logging_utils import setup_logger, log_dataframe_stats
from src.quality_checks import check_data_quality
from src.silver import clean_customer_data, clean_order_data, join_orders_with_customers, compute_monthly_revenue

# Notebook-wide logger; every later cell reports progress through it.
logger = setup_logger(
    __name__,
    level="INFO",
)
logger.info("Starting Silver layer transformation")

2026-02-10 12:42:26 - __main__ - INFO - Starting Silver layer transformation


In [2]:
# Reuse a `spark` session if one is already defined in this kernel; the bare
# name lookup raises NameError when it is not, in which case we build one.
try:
    spark
    logger.info("Using existing Spark session")
except NameError:
    spark = (
        SparkSession.builder
        .appName("Silver_Transform")
        .config("spark.sql.catalogImplementation", "hive")
        .getOrCreate()
    )
    logger.info("Created new Spark session")

# Locations of the Bronze (input) and Silver (output) tables
BRONZE_PATH = "Tables/bronze"
SILVER_PATH = "Tables/silver"

26/02/10 12:42:30 WARN Utils: Your hostname, kdcllc-ThinkPad-P16-Gen-2 resolves to a loopback address: 127.0.1.1; using 192.168.86.233 instead (on interface wlp0s20f3)
26/02/10 12:42:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/10 12:42:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


2026-02-10 12:42:31 - __main__ - INFO - Created new Spark session


## Step 1: Load Bronze Data

In [3]:
# Load both Bronze inputs from Parquet (the local-development storage format)
logger.info("Loading Bronze layer data...")

customers_bronze = spark.read.parquet(f"{BRONZE_PATH}/customers")
orders_bronze = spark.read.parquet(f"{BRONZE_PATH}/orders")

# Log stats for each input frame before any cleaning happens
log_dataframe_stats(customers_bronze, "customers_bronze", logger)
log_dataframe_stats(orders_bronze, "orders_bronze", logger)

2026-02-10 12:42:35 - __main__ - INFO - Loading Bronze layer data...
2026-02-10 12:42:37 - __main__ - INFO - DataFrame 'customers_bronze' statistics:
2026-02-10 12:42:37 - __main__ - INFO -   - Rows: 30
2026-02-10 12:42:37 - __main__ - INFO -   - Columns: 9
2026-02-10 12:42:37 - __main__ - INFO -   - Column names: customer_id, name, email, phone, signup_date, extra_info, ingestion_timestamp, source_file, bronze_layer_id
2026-02-10 12:42:38 - __main__ - INFO -   - Null counts:
2026-02-10 12:42:38 - __main__ - INFO -     customer_id: 1 (3.33%)
2026-02-10 12:42:38 - __main__ - INFO -     name: 1 (3.33%)
2026-02-10 12:42:38 - __main__ - INFO -     email: 1 (3.33%)
2026-02-10 12:42:38 - __main__ - INFO -     phone: 1 (3.33%)
2026-02-10 12:42:38 - __main__ - INFO -     signup_date: 1 (3.33%)
2026-02-10 12:42:38 - __main__ - INFO -     extra_info: 24 (80.00%)
2026-02-10 12:42:38 - __main__ - INFO - DataFrame 'orders_bronze' statistics:
2026-02-10 12:42:38 - __main__ - INFO -   - Rows: 36
2026

## Step 2: Data Quality Checks

Run quality checks to identify issues before transformation.

In [4]:
# Profile the raw customers table against its required columns
customer_quality = check_data_quality(
    customers_bronze, "customers", required_columns=["customer_id", "name", "email"]
)

# Headline score first, then one line per detected issue
print(f"\nCustomer Data Quality Score: {customer_quality.quality_score:.2f}%")
print(f"Total Issues: {len(customer_quality.issues)}")
for issue in customer_quality.issues:
    issue_type, issue_column, issue_rows = issue['type'], issue['column'], issue['count']
    print(f"  - {issue_type}: {issue_column} ({issue_rows} rows)")

Quality issue in customers.customer_id: NULL_VALUES (1 rows)
Quality issue in customers.name: NULL_VALUES (1 rows)
Quality issue in customers.email: NULL_VALUES (1 rows)



Customer Data Quality Score: 90.00%
Total Issues: 3
  - NULL_VALUES: customer_id (1 rows)
  - NULL_VALUES: name (1 rows)
  - NULL_VALUES: email (1 rows)


In [None]:
# Profile the raw orders table against its required columns
order_quality = check_data_quality(
    orders_bronze,
    "orders",
    required_columns=["order_id", "customer_id", "order_date", "quantity", "price"],
)

# Headline score first, then one line per detected issue (with its details text)
print(f"\nOrder Data Quality Score: {order_quality.quality_score:.2f}%")
print(f"Total Issues: {len(order_quality.issues)}")
for issue in order_quality.issues:
    issue_type, issue_column = issue['type'], issue['column']
    issue_rows, issue_details = issue['count'], issue['details']
    print(f"  - {issue_type}: {issue_column} ({issue_rows} rows) - {issue_details}")

## Step 3: Clean and Transform Data

Apply the cleaning logic from the `src/silver.py` module.

In [5]:
# Run the customer cleaning routine from src/silver.py on the Bronze frame
customers_clean = clean_customer_data(customers_bronze)
log_dataframe_stats(customers_clean, "customers_clean", logger)

# Show only a five-row sample rather than the whole table
customers_preview = customers_clean.limit(5)
display(customers_preview)

2026-02-10 12:42:49 - __main__ - INFO - DataFrame 'customers_clean' statistics:
2026-02-10 12:42:49 - __main__ - INFO -   - Rows: 28
2026-02-10 12:42:49 - __main__ - INFO -   - Columns: 11
2026-02-10 12:42:49 - __main__ - INFO -   - Column names: customer_id, name, email, phone, signup_date, extra_info, ingestion_timestamp, source_file, bronze_layer_id, has_email, has_phone
2026-02-10 12:42:49 - __main__ - INFO -   - Null counts:
2026-02-10 12:42:49 - __main__ - INFO -     email: 1 (3.57%)
2026-02-10 12:42:49 - __main__ - INFO -     phone: 1 (3.57%)
2026-02-10 12:42:49 - __main__ - INFO -     signup_date: 1 (3.57%)
2026-02-10 12:42:49 - __main__ - INFO -     extra_info: 24 (85.71%)


DataFrame[customer_id: string, name: string, email: string, phone: string, signup_date: date, extra_info: string, ingestion_timestamp: timestamp, source_file: string, bronze_layer_id: bigint, has_email: boolean, has_phone: boolean]

In [6]:
# Run the order cleaning routine from src/silver.py on the Bronze frame
orders_clean = clean_order_data(orders_bronze)
# NOTE(review): the recorded run logged 0 rows here and then raised a TypeError
# ('>' between NoneType and int) — presumably the stats helper on an empty
# frame; confirm in src/ before relying on downstream cells.
log_dataframe_stats(orders_clean, "orders_clean", logger)

# Note: Status is now normalized to lowercase
preview_columns = [
    "order_id",
    "customer_id",
    "order_date",
    "status",
    "quantity",
    "price",
    "line_total",
]
display(orders_clean.select(*preview_columns).limit(10))

2026-02-10 12:42:51 - __main__ - INFO - DataFrame 'orders_clean' statistics:
2026-02-10 12:42:51 - __main__ - INFO -   - Rows: 0
2026-02-10 12:42:51 - __main__ - INFO -   - Columns: 11
2026-02-10 12:42:51 - __main__ - INFO -   - Column names: order_id, order_date, customer_id, status, quantity, price, notes, ingestion_timestamp, source_file, bronze_layer_id, line_total


TypeError: '>' not supported between instances of 'NoneType' and 'int'

## Step 4: Enrich Orders with Customer Data

In [7]:
# Attach customer attributes to each cleaned order
# NOTE(review): the recorded run failed here with an AnalysisException because
# `city`/`state` are not columns of customers_clean — presumably selected inside
# join_orders_with_customers; confirm in src/silver.py.
orders_enriched = join_orders_with_customers(orders_clean, customers_clean)
log_dataframe_stats(orders_enriched, "orders_enriched", logger)

# Show a ten-row sample of the joined result
orders_preview = orders_enriched.limit(10)
display(orders_preview)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `city` cannot be resolved. Did you mean one of the following? [`email`, `name`, `phone`, `has_email`, `customer_id`].;
'Project [customer_id#0, name#1, email#271, 'city, 'state]
+- Project [customer_id#0, name#1, email#271, phone#3, signup_date#4, extra_info#5, ingestion_timestamp#6, source_file#7, bronze_layer_id#8L, has_email#281, (isnotnull(phone#3) AND NOT (phone#3 = )) AS has_phone#292]
   +- Project [customer_id#0, name#1, email#271, phone#3, signup_date#4, extra_info#5, ingestion_timestamp#6, source_file#7, bronze_layer_id#8L, (isnotnull(email#271) AND NOT (email#271 = )) AS has_email#281]
      +- Deduplicate [customer_id#0]
         +- Project [customer_id#0, name#1, lower(trim(email#2, None)) AS email#271, phone#3, signup_date#4, extra_info#5, ingestion_timestamp#6, source_file#7, bronze_layer_id#8L]
            +- Filter (isnotnull(name#1) AND NOT (name#1 = ))
               +- Filter (isnotnull(customer_id#0) AND NOT (customer_id#0 = ))
                  +- Relation [customer_id#0,name#1,email#2,phone#3,signup_date#4,extra_info#5,ingestion_timestamp#6,source_file#7,bronze_layer_id#8L] parquet


## Step 5: Write Silver Tables

In [None]:
# Write cleaned customers to Silver layer (Parquet format)
logger.info("Writing customers to Silver layer...")

# "overwrite" replaces any previous output at this path, so re-running the
# cell does not accumulate duplicate data.
(
    customers_clean.write
    .format("parquet")
    .mode("overwrite")
    .save(f"{SILVER_PATH}/customers")
)

# Check mark restored from mis-decoded UTF-8 bytes in the original message
logger.info("✓ Customers Silver table created")

In [None]:
# Write enriched orders to Silver layer (Parquet format)
logger.info("Writing orders to Silver layer...")

# "overwrite" replaces any previous output at this path, so re-running the
# cell does not accumulate duplicate data.
(
    orders_enriched.write
    .format("parquet")
    .mode("overwrite")
    .save(f"{SILVER_PATH}/orders")
)

# Check mark restored from mis-decoded UTF-8 bytes in the original message
logger.info("✓ Orders Silver table created")

## 🐛 TASK B: Debug the Monthly Revenue Calculation

The `compute_monthly_revenue()` function has bugs! 

**Expected behavior**: Calculate total revenue per month from completed orders.

**Known issues**:
1. Case-sensitive status filter missing valid orders
2. Wrong revenue calculation (not accounting for quantity)

**Your task**: 
1. Run the cell below and observe incorrect results
2. Examine the function in `src/silver.py`
3. Fix the bugs
4. Re-run and verify correct output

In [None]:
# ⚠️ THIS WILL PRODUCE INCORRECT RESULTS - DEBUG IT!
# Intentionally feeds the raw Bronze orders (mixed-case status values) into the
# function so the bugs described in the task text are observable.
logger.info("Computing monthly revenue (BUGGY VERSION)...")

monthly_revenue = compute_monthly_revenue(orders_bronze)

print("\n⚠️ Monthly Revenue (contains bugs):")
display(monthly_revenue)

# A global aggregate always returns exactly one row, but SUM over zero rows is
# NULL (None in Python). Fall back to 0.0 so the currency format below does not
# raise TypeError when the status filter matches nothing.
total_revenue = monthly_revenue.select(F.sum("revenue")).collect()[0][0]
if total_revenue is None:
    total_revenue = 0.0
print(f"\nTotal Revenue: ${total_revenue:,.2f}")
print("\n❌ This is INCORRECT! The function has bugs that need to be fixed.")
print("\nHints:")
print("1. Check how status is being filtered (case sensitivity)")
print("2. Verify revenue calculation includes quantity")
print("3. Look at the Bronze data - status values have mixed cases!")

In [None]:
# After fixing the bugs, test with cleaned data
# The cleaned orders have normalized status to lowercase
logger.info("Testing with cleaned Silver data...")

monthly_revenue_clean = compute_monthly_revenue(orders_clean)

print("\n✓ Monthly Revenue (from Silver layer):")
display(monthly_revenue_clean)

# SUM over zero rows is NULL (None in Python); this matters here because an
# empty orders_clean would otherwise make the format below raise TypeError.
total_revenue_clean = monthly_revenue_clean.select(F.sum("revenue")).collect()[0][0]
if total_revenue_clean is None:
    total_revenue_clean = 0.0
print(f"\nTotal Revenue: ${total_revenue_clean:,.2f}")

## Summary

In [None]:
# Row counts of the two Silver outputs for the closing report
total_customers = customers_clean.count()
total_orders = orders_enriched.count()

# The arrows in "Next Steps" are restored from mis-decoded UTF-8 bytes.
print(f"""\n{'='*50}
Silver Layer Transformation Complete
{'='*50}
Customers (cleaned): {total_customers}
Orders (enriched): {total_orders}

Data Quality:
  Customer Score: {customer_quality.quality_score:.2f}%
  Order Score: {order_quality.quality_score:.2f}%

Next Steps:
→ Fix bugs in compute_monthly_revenue()
→ Run notebook 03_Gold_Aggregates.ipynb for analytics
{'='*50}\n""")