In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import sys
sys.path.append('..')

from src.logging_utils import setup_logger, log_dataframe_stats
from src.quality_checks import check_data_quality
from src.silver import clean_customer_data, clean_order_data, join_orders_with_customers, compute_monthly_revenue

# Setup logger
# Build the notebook-wide logger through the project helper, requesting the
# "INFO" level; the resulting `logger` name is used by the later cells.
logger = setup_logger(__name__, level="INFO")
# Mark the start of the Silver run in the log output.
logger.info("Starting Silver layer transformation")

In [None]:
# Initialize Spark session
# Referencing the bare name `spark` raises NameError when no session variable
# has been defined yet (a hosted runtime may have injected one already); only
# in that case is a Delta-enabled session built here.
try:
    spark
except NameError:
    spark = (
        SparkSession.builder
        .appName("Silver_Transform")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .getOrCreate()
    )
    logger.info("Created new Spark session")
else:
    logger.info("Using existing Spark session")

# Configuration
# Root locations of the Bronze (input) and Silver (output) Delta tables.
BRONZE_PATH = "Tables/bronze"
SILVER_PATH = "Tables/silver"

## Step 1: Load Bronze Data

In [None]:
# Read Bronze tables
logger.info("Loading Bronze layer data...")

# Resolve both Delta table locations under the Bronze root first.
customers_bronze_path = f"{BRONZE_PATH}/customers"
orders_bronze_path = f"{BRONZE_PATH}/orders"

customers_bronze = spark.read.format("delta").load(customers_bronze_path)
orders_bronze = spark.read.format("delta").load(orders_bronze_path)

# Log stats for each raw input under its own label.
log_dataframe_stats(customers_bronze, "customers_bronze", logger)
log_dataframe_stats(orders_bronze, "orders_bronze", logger)

## Step 2: Data Quality Checks

Run quality checks to identify issues before transformation.

In [None]:
# Quality check on customers
# Columns handed to the checker as required for the customers table.
customer_required_columns = ["customer_id", "name", "email"]
customer_quality = check_data_quality(
    customers_bronze, "customers", required_columns=customer_required_columns
)

# Report the score, the number of issues, then one line per issue
# (each issue is indexed by 'type', 'column' and 'count').
print(f"\nCustomer Data Quality Score: {customer_quality.quality_score:.2f}%")
customer_issue_total = len(customer_quality.issues)
print(f"Total Issues: {customer_issue_total}")
for issue in customer_quality.issues:
    print(f"  - {issue['type']}: {issue['column']} ({issue['count']} rows)")

In [None]:
# Quality check on orders
# Columns handed to the checker as required for the orders table.
order_required_columns = ["order_id", "customer_id", "order_date", "quantity", "price"]
order_quality = check_data_quality(
    orders_bronze, "orders", required_columns=order_required_columns
)

# Report the score, the number of issues, then one line per issue
# (each issue is indexed by 'type', 'column', 'count' and 'details').
print(f"\nOrder Data Quality Score: {order_quality.quality_score:.2f}%")
order_issue_total = len(order_quality.issues)
print(f"Total Issues: {order_issue_total}")
for issue in order_quality.issues:
    print(f"  - {issue['type']}: {issue['column']} ({issue['count']} rows) - {issue['details']}")

## Step 3: Clean and Transform Data

Apply the cleaning logic from the `src/silver.py` module.

In [None]:
# Clean customers data
# The cleaned frame is reused below by the join step and the Silver write.
customers_clean = clean_customer_data(customers_bronze)
log_dataframe_stats(customers_clean, "customers_clean", logger)

# Show a five-row sample of the cleaned output.
customers_preview = customers_clean.limit(5)
display(customers_preview)

In [None]:
# Clean orders data
orders_clean = clean_order_data(orders_bronze)
log_dataframe_stats(orders_clean, "orders_clean", logger)

# Note: Status is now normalized to lowercase
# Preview only the columns of interest, limited to ten rows.
orders_preview_columns = [
    "order_id",
    "customer_id",
    "order_date",
    "status",
    "quantity",
    "price",
    "line_total",
]
orders_preview = orders_clean.select(*orders_preview_columns).limit(10)
display(orders_preview)

## Step 4: Enrich Orders with Customer Data

In [None]:
# Join orders with customer information
# Both inputs are the cleaned frames; the enriched result is what gets
# written out as the Silver orders table.
orders_enriched = join_orders_with_customers(orders_clean, customers_clean)
log_dataframe_stats(orders_enriched, "orders_enriched", logger)

# Show a ten-row sample of the joined output.
orders_enriched_preview = orders_enriched.limit(10)
display(orders_enriched_preview)

## Step 5: Write Silver Tables

In [None]:
# Write cleaned customers to Silver layer
logger.info("Writing customers to Silver layer...")

# Full refresh: "overwrite" mode replaces the existing table data, and the
# overwriteSchema option allows the stored schema to be replaced with it.
(
    customers_clean.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f"{SILVER_PATH}/customers")
)

# The check mark in this message had been stored mis-encoded; emit the
# intended character so the log line is readable.
logger.info("✓ Customers Silver table created")

In [None]:
# Write enriched orders to Silver layer
logger.info("Writing orders to Silver layer...")

# Full refresh: "overwrite" mode replaces the existing table data, and the
# overwriteSchema option allows the stored schema to be replaced with it.
(
    orders_enriched.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f"{SILVER_PATH}/orders")
)

# The check mark in this message had been stored mis-encoded; emit the
# intended character so the log line is readable.
logger.info("✓ Orders Silver table created")

## 🐛 TASK B: Debug the Monthly Revenue Calculation

The `compute_monthly_revenue()` function has bugs! 

**Expected behavior**: Calculate total revenue per month from completed orders.

**Known issues**:
1. The status filter is case-sensitive, so it misses valid orders
2. The revenue calculation is wrong: it does not account for quantity

**Your task**: 
1. Run the cell below and observe incorrect results
2. Examine the function in `src/silver.py`
3. Fix the bugs
4. Re-run and verify correct output

In [None]:
# ⚠️ THIS WILL PRODUCE INCORRECT RESULTS - DEBUG IT!
logger.info("Computing monthly revenue (BUGGY VERSION)...")

# Run on the raw Bronze orders (mixed-case status values, per the hints
# printed below) so the function's bugs show up in the output.
monthly_revenue = compute_monthly_revenue(orders_bronze)

print("\n⚠️ Monthly Revenue (contains bugs):")
display(monthly_revenue)

# SUM over zero rows (or only NULL values) yields NULL, which arrives here as
# None; formatting None with `:,.2f` raises TypeError, so report 0.0 instead.
total_revenue = monthly_revenue.select(F.sum("revenue")).collect()[0][0]
if total_revenue is None:
    total_revenue = 0.0
print(f"\nTotal Revenue: ${total_revenue:,.2f}")
print("\n❌ This is INCORRECT! The function has bugs that need to be fixed.")
print("\nHints:")
print("1. Check how status is being filtered (case sensitivity)")
print("2. Verify revenue calculation includes quantity")
print("3. Look at the Bronze data - status values have mixed cases!")

In [None]:
# After fixing the bugs, test with cleaned data
# The cleaned orders have normalized status to lowercase
logger.info("Testing with cleaned Silver data...")

monthly_revenue_clean = compute_monthly_revenue(orders_clean)

print("\n✓ Monthly Revenue (from Silver layer):")
display(monthly_revenue_clean)

# SUM over zero rows (or only NULL values) yields NULL, which arrives here as
# None; formatting None with `:,.2f` raises TypeError, so report 0.0 instead.
total_revenue_clean = monthly_revenue_clean.select(F.sum("revenue")).collect()[0][0]
if total_revenue_clean is None:
    total_revenue_clean = 0.0
print(f"\nTotal Revenue: ${total_revenue_clean:,.2f}")

## Summary

In [None]:
# Row counts for the two Silver outputs; count() is a Spark action, so each
# call triggers a computation of the corresponding frame.
total_customers = customers_clean.count()
total_orders = orders_enriched.count()

# Final report: counts, the quality scores computed earlier in the notebook,
# and the follow-up steps. The arrows in "Next Steps" had been stored
# mis-encoded and are restored here.
print(f"""\n{'='*50}
Silver Layer Transformation Complete
{'='*50}
Customers (cleaned): {total_customers}
Orders (enriched): {total_orders}

Data Quality:
  Customer Score: {customer_quality.quality_score:.2f}%
  Order Score: {order_quality.quality_score:.2f}%

Next Steps:
→ Fix bugs in compute_monthly_revenue()
→ Run notebook 03_Gold_Aggregates.ipynb for analytics
{'='*50}\n""")