# Silver Layer: Slot Telemetry Cleansing & Validation

**Notebook:** `01_silver_slot_cleansing`  
**Layer:** Silver (Cleansed)  
**Purpose:** Clean, validate, and standardize slot telemetry data

---

## Overview

The Silver layer transforms raw Bronze data into clean, validated, and standardized records. This notebook implements:

- Data type enforcement
- Null handling and default values
- Deduplication
- Business rule validation
- Data quality scoring and filtering (records scoring below `MIN_QUALITY_SCORE` are excluded from the Silver table)

In [None]:
# Configuration
# Source (Bronze) and destination (Silver) table names used by the read and
# write cells of this notebook.
BRONZE_TABLE = "bronze_slot_telemetry"
SILVER_TABLE = "silver_slot_telemetry"
# NOTE(review): SILVER_LAKEHOUSE is not referenced anywhere else in the visible
# notebook; the write cell saves to SILVER_TABLE unqualified. Confirm whether
# the lakehouse name should be part of the target.
SILVER_LAKEHOUSE = "lh_silver"

# Processing parameters
# NOTE(review): DEDUP_WINDOW_HOURS is not referenced in the visible notebook;
# deduplication currently considers every row regardless of age. Confirm intent.
DEDUP_WINDOW_HOURS = 24
# Minimum quality_score a record needs to be kept in Silver. The score is the
# fraction of five checks that pass, so 0.8 means at least four of five.
MIN_QUALITY_SCORE = 0.8

In [None]:
from pyspark.sql import SparkSession
# NOTE: this wildcard import rebinds several Python builtins (for example
# sum, min, max, abs and round) to Spark column functions for the rest of the
# notebook, so those names no longer operate on plain Python values.
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
from delta.tables import DeltaTable
from datetime import datetime, timedelta

# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Read from Bronze
# df_bronze is the input to the whole cleansing chain below.
df_bronze = spark.table(BRONZE_TABLE)
# count() is a Spark action: it scans the table to produce the row total.
print(f"Bronze records: {df_bronze.count()}")
# Show the incoming column names and types before any casting is applied.
df_bronze.printSchema()

## Data Type Enforcement

In [None]:
def enforce_data_types(df):
    """
    Add typed ``*_clean`` columns next to the raw input columns.

    ``event_timestamp`` is parsed with ``to_timestamp``; ``bet_amount``,
    ``win_amount`` and ``denomination`` are cast to double; ``credits_wagered``
    and ``credits_won`` are cast to integer. The source columns are kept.

    Args:
        df: DataFrame containing the raw columns named above.

    Returns:
        The input DataFrame with six additional ``*_clean`` columns.
    """
    # (target column, source column, Spark type) for every plain cast,
    # listed in the order the columns are appended.
    cast_plan = (
        ("bet_amount_clean", "bet_amount", DoubleType()),
        ("win_amount_clean", "win_amount", DoubleType()),
        ("denomination_clean", "denomination", DoubleType()),
        ("credits_wagered_clean", "credits_wagered", IntegerType()),
        ("credits_won_clean", "credits_won", IntegerType()),
    )

    # The timestamp needs parsing rather than a cast, so it is handled first.
    typed = df.withColumn(
        "event_timestamp_clean", to_timestamp(col("event_timestamp"))
    )
    for target_name, source_name, spark_type in cast_plan:
        typed = typed.withColumn(target_name, col(source_name).cast(spark_type))
    return typed


df_typed = enforce_data_types(df_bronze)
print("Data types enforced")

## Null Handling

In [None]:
def handle_nulls(df):
    """
    Replace nulls with fixed default values.

    The numeric ``*_clean`` columns (which must already exist on ``df``) are
    overwritten in place with 0 / 0.0 where null. ``player_id``, ``casino_id``
    and ``floor_location`` are copied into new ``*_clean`` columns with a
    placeholder string where null.

    Args:
        df: DataFrame that already carries the typed ``*_clean`` columns.

    Returns:
        DataFrame with the defaults applied.
    """
    # (target column, source column, default literal), in application order.
    default_plan = (
        ("bet_amount_clean", "bet_amount_clean", 0.0),
        ("win_amount_clean", "win_amount_clean", 0.0),
        ("credits_wagered_clean", "credits_wagered_clean", 0),
        ("credits_won_clean", "credits_won_clean", 0),
        ("player_id_clean", "player_id", "ANONYMOUS"),
        ("casino_id_clean", "casino_id", "UNKNOWN"),
        ("floor_location_clean", "floor_location", "UNKNOWN"),
    )

    filled = df
    for target_name, source_name, fallback in default_plan:
        filled = filled.withColumn(
            target_name, coalesce(col(source_name), lit(fallback))
        )
    return filled


df_nulls_handled = handle_nulls(df_typed)
print("Nulls handled")

## Deduplication

In [None]:
def deduplicate_records(df):
    """
    Remove duplicate records based on event_id, keeping the latest.

    Rows that share an ``event_id`` are ranked by ``_ingestion_timestamp``
    (newest first) and only the top-ranked row is kept. Rows whose
    ``event_id`` is null cannot be identified as duplicates of one another,
    so they are all passed through instead of being collapsed into a single
    row.

    Args:
        df: DataFrame with ``event_id`` and ``_ingestion_timestamp`` columns.

    Returns:
        DataFrame with at most one row per non-null ``event_id`` plus every
        row that has a null ``event_id``.
    """
    newest_first = Window.partitionBy("event_id").orderBy(
        col("_ingestion_timestamp").desc()
    )
    ranked = df.withColumn("_row_num", row_number().over(newest_first))

    # partitionBy places every null event_id in the same partition, so a bare
    # "_row_num == 1" filter would silently discard all but one id-less row.
    # NOTE: rows with the same event_id and the same _ingestion_timestamp are
    # tie-broken arbitrarily by row_number().
    keep_row = col("event_id").isNull() | (col("_row_num") == 1)

    return ranked.filter(keep_row).drop("_row_num")


df_deduped = deduplicate_records(df_nulls_handled)
print(f"After deduplication: {df_deduped.count()} records")

## Business Rule Validation

In [None]:
def apply_business_rules(df):
    """
    Attach boolean validation flags to every record.

    Adds four columns:
      - ``is_valid_bet``: bet amount is between 0 and 10000 inclusive.
      - ``is_valid_win``: win amount is not negative.
      - ``is_valid_timestamp``: event timestamp is present and not later
        than the current timestamp.
      - ``is_large_win``: win amount is at least 1200.

    Args:
        df: DataFrame carrying ``bet_amount_clean``, ``win_amount_clean`` and
            ``event_timestamp_clean``.

    Returns:
        The input DataFrame with the four flag columns appended.
    """
    bet = col("bet_amount_clean")
    win = col("win_amount_clean")
    event_ts = col("event_timestamp_clean")

    bet_in_range = (bet >= 0) & (bet <= 10000)
    win_not_negative = win >= 0
    timestamp_ok = event_ts.isNotNull() & (event_ts <= current_timestamp())
    large_win = win >= 1200  # W-2G threshold

    flagged = df.withColumn("is_valid_bet", bet_in_range)
    flagged = flagged.withColumn("is_valid_win", win_not_negative)
    flagged = flagged.withColumn("is_valid_timestamp", timestamp_ok)
    flagged = flagged.withColumn("is_large_win", large_win)
    return flagged


df_validated = apply_business_rules(df_deduped)
print("Business rules applied")

## Data Quality Scoring

In [None]:
def calculate_quality_score(df):
    """
    Add a ``quality_score`` column: the fraction of five checks that pass.

    The checks are the three validity flags (``is_valid_bet``,
    ``is_valid_win``, ``is_valid_timestamp``) plus the presence of
    ``machine_id`` and ``event_type``. Each contributes 1 when satisfied, and
    the total is divided by 5.0.

    Args:
        df: DataFrame that already carries the three validity flags.

    Returns:
        The input DataFrame with ``quality_score`` appended.
    """
    # Each term is 1 when its check passes and 0 otherwise.
    bet_point = col("is_valid_bet").cast("int")
    win_point = col("is_valid_win").cast("int")
    timestamp_point = col("is_valid_timestamp").cast("int")
    machine_point = when(col("machine_id").isNotNull(), 1).otherwise(0)
    event_type_point = when(col("event_type").isNotNull(), 1).otherwise(0)

    points_earned = (
        bet_point + win_point + timestamp_point + machine_point + event_type_point
    )
    return df.withColumn("quality_score", points_earned / 5.0)


df_scored = calculate_quality_score(df_validated)
print("Quality scores calculated")
df_scored.select("quality_score").describe().show()

## Create Silver Output

In [None]:
# Select and rename columns for Silver schema
# Pass-through columns are selected by name; cleaned columns are renamed back
# to the name of the raw column they were derived from.
df_silver = df_scored.select(
    "event_id",
    "machine_id",
    col("casino_id_clean").alias("casino_id"),
    col("floor_location_clean").alias("floor_location"),
    col("event_timestamp_clean").alias("event_timestamp"),
    "event_type",
    col("denomination_clean").alias("denomination"),
    col("bet_amount_clean").alias("bet_amount"),
    col("win_amount_clean").alias("win_amount"),
    "jackpot_contribution",
    col("credits_wagered_clean").alias("credits_wagered"),
    col("credits_won_clean").alias("credits_won"),
    col("player_id_clean").alias("player_id"),
    "session_id",
    "is_bonus_round",
    "game_outcome",
    "is_valid_bet",
    "is_valid_win",
    "is_large_win",
    "quality_score",
    # Lineage timestamps: when Bronze ingested the row and when Silver ran.
    col("_ingestion_timestamp").alias("bronze_ingestion_timestamp"),
    current_timestamp().alias("silver_processed_timestamp"),
    # Date parts of the cleaned event timestamp, used as partition columns.
    year(col("event_timestamp_clean")).alias("year"),
    month(col("event_timestamp_clean")).alias("month"),
    dayofmonth(col("event_timestamp_clean")).alias("day"),
)

# Filter by quality score
df_silver_filtered = df_silver.where(col("quality_score") >= MIN_QUALITY_SCORE)

print(f"Silver records (quality >= {MIN_QUALITY_SCORE}): {df_silver_filtered.count()}")

In [None]:
# Write to Silver lakehouse
# Replace the Silver table (data and schema) with this run's output,
# partitioned by the event date parts.
# NOTE(review): SILVER_LAKEHOUSE is not used here; the table name is
# unqualified, so it lands wherever the session resolves it. Confirm.
(
    df_silver_filtered.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("year", "month", "day")
    .option("overwriteSchema", "true")
    .saveAsTable(SILVER_TABLE)
)

print(f"Wrote to {SILVER_TABLE}")

In [None]:
# Quality report
# Run every Spark count exactly once. Each count() is a separate action that
# re-executes the whole lineage, so repeating it is slow and can make the
# figures below disagree with each other (the timestamp validity check is
# evaluated against the current time of each action).
report_bronze_count = df_bronze.count()
report_deduped_count = df_deduped.count()
report_silver_count = df_silver_filtered.count()
report_large_win_count = df_silver_filtered.filter(col("is_large_win")).count()

report_rule = "=" * 50

print("\n" + report_rule)
print("SILVER LAYER QUALITY REPORT")
print(report_rule)
print(f"Input records (Bronze): {report_bronze_count}")
print(f"After deduplication: {report_deduped_count}")
print(f"Output records (Silver): {report_silver_count}")
# Derived from the two counts printed above so the report always adds up.
print(f"Records filtered by quality: {report_deduped_count - report_silver_count}")
print(f"\nLarge wins (>=$1,200): {report_large_win_count}")
print(report_rule)