In [None]:
# Notebook: 01_bronze_to_silver
# Purpose: Clean and transform Bronze data to Silver layer

from pyspark.sql.functions import *
import os

# Configuration
storage_account = "dota2lakehousenew"
container = "data"

# The account key is read from the environment so it is never stored in the notebook.
storage_account_key = os.environ.get("AZURE_STORAGE_KEY")
if not storage_account_key:
    # Fail fast with an actionable message: handing None (or an empty key) to
    # spark.conf.set would otherwise surface as an obscure error here or as an
    # authentication failure on the first read.
    raise RuntimeError(
        "AZURE_STORAGE_KEY environment variable is not set; "
        f"cannot authenticate to storage account '{storage_account}'."
    )

# Register the key for this storage account's ABFS (dfs) endpoint.
spark.conf.set(
    f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
    storage_account_key
)

# Root folders of the two lakehouse layers this notebook reads from / writes to.
BRONZE_PATH = f"abfss://{container}@{storage_account}.dfs.core.windows.net/bronze"
SILVER_PATH = f"abfss://{container}@{storage_account}.dfs.core.windows.net/silver"

print("=" * 60)
print("BRONZE TO SILVER DATA PROCESSING")
print("=" * 60)


In [None]:
# 1. LOAD RAW DATA FROM BRONZE
print("\n[1/6] Loading data from Bronze layer...")


def _read_bronze_csv(file_name):
    """Read one CSV from the Bronze folder, using its header row and inferred column types."""
    return spark.read.csv(f"{BRONZE_PATH}/{file_name}", header=True, inferSchema=True)


df_matches_raw = _read_bronze_csv("main_metadata.csv")
df_players_raw = _read_bronze_csv("players_reduced.csv")
df_picks_bans_raw = _read_bronze_csv("picks_bans.csv")
df_teams_raw = _read_bronze_csv("teams.csv")

# Report the size of every raw table; count() is a Spark action, so each
# line below runs a job over the corresponding DataFrame.
for _label, _frame in (
    ("Matches", df_matches_raw),
    ("Players", df_players_raw),
    ("Picks/Bans", df_picks_bans_raw),
    ("Teams", df_teams_raw),
):
    print(f"   {_label} loaded: {_frame.count():,} rows, {len(_frame.columns)} columns")

In [None]:
# 2. DATA QUALITY ASSESSMENT
print("\n[2/6] Data Quality Assessment...")

def analyze_nulls(df, name):
    """Print and return per-column null counts for a Spark DataFrame.

    The total row count and every column's null count are gathered in a
    single aggregation, so the DataFrame is evaluated once instead of once
    per column.

    Args:
        df: Spark DataFrame to profile.
        name: Label printed in the section header.

    Returns:
        A list of ``(column_name, null_count, percent_of_rows)`` tuples, in
        DataFrame column order, containing only columns with at least one null.
    """
    # Local import keeps this helper independent of whatever names the
    # notebook's star import happens to expose.
    from pyspark.sql import functions as F

    print(f"\n  --- {name} ---")
    column_names = df.columns

    # count() skips nulls, and when() without otherwise() yields null for
    # non-matching rows, so each expression counts the rows where the column
    # IS null. Aliases are positional to stay safe with unusual column names.
    aggregates = [F.count(F.lit(1)).alias("total_rows")] + [
        F.count(F.when(F.col(column_name).isNull(), 1)).alias(f"nulls_{position}")
        for position, column_name in enumerate(column_names)
    ]
    result_row = df.agg(*aggregates).first()
    total_rows = result_row[0]

    null_cols = []
    for col_name, null_count in zip(column_names, result_row[1:]):
        # null_count > 0 implies total_rows > 0, so the division is safe.
        if null_count > 0:
            pct = (null_count / total_rows) * 100
            null_cols.append((col_name, null_count, pct))
    
    if null_cols:
        print(f"  Columns with nulls (top 10):")
        # Show the ten columns with the most nulls first.
        for c, cnt, pct in sorted(null_cols, key=lambda x: -x[1])[:10]:
            print(f"    {c}: {cnt:,} ({pct:.1f}%)")
    else:
        print(f"  No null values found!")
    return null_cols

# Null profile of the raw matches and players tables.
matches_nulls = analyze_nulls(df_matches_raw, "Matches")
players_nulls = analyze_nulls(df_players_raw, "Players")

# Duplicates = total rows minus the rows left after de-duplicating on the key.
matches_total = df_matches_raw.count()
matches_unique = df_matches_raw.dropDuplicates(["match_id"]).count()
dup_matches = matches_total - matches_unique

players_total = df_players_raw.count()
players_unique = df_players_raw.dropDuplicates(["match_id", "player_slot"]).count()
dup_players = players_total - players_unique

print(f"\n  Duplicate matches: {dup_matches}")
print(f"  Duplicate player records: {dup_players}")

In [None]:
# 3. CLEAN MATCHES DATA
print("\n[3/6] Cleaning Matches data...")

# Accepted match length in seconds (both bounds exclusive).
MIN_MATCH_DURATION_S = 300
MAX_MATCH_DURATION_S = 7200

# Validity filters run BEFORE de-duplication: dropDuplicates keeps an
# arbitrary row per match_id, so de-duplicating first could keep an invalid
# copy (e.g. null duration) and then lose the match entirely in the filters.
df_matches_clean = (
    df_matches_raw
    .drop("Unnamed: 0")
    .filter(col("match_id").isNotNull())
    .filter(col("duration").isNotNull())
    .filter(
        (col("duration") > MIN_MATCH_DURATION_S)
        & (col("duration") < MAX_MATCH_DURATION_S)
    )
    .dropDuplicates(["match_id"])
)

# Derived columns. `round` and `abs` here are the Spark column functions
# brought in by the notebook's pyspark.sql.functions star import, not the
# Python builtins.
df_matches_clean = df_matches_clean \
    .withColumn("duration_minutes", round(col("duration") / 60, 2)) \
    .withColumn("radiant_win", col("radiant_win").cast("boolean")) \
    .withColumn("match_date", to_date(col("start_date_time"))) \
    .withColumn("match_hour", hour(col("start_date_time"))) \
    .withColumn("match_day_of_week", dayofweek(col("start_date_time"))) \
    .withColumn("total_kills", col("radiant_score") + col("dire_score")) \
    .withColumn("kill_difference", abs(col("radiant_score") - col("dire_score"))) \
    .withColumn("is_stomp",
        # A stomp is either flagged by the source data or implied by a
        # kill gap of more than 30.
        when(col("stomp") == 1.0, True)
        .when(col("kill_difference") > 30, True)
        .otherwise(False)
    ) \
    .withColumn("ingestion_timestamp", current_timestamp())

# Columns published to Silver, in output order.
matches_silver_cols = [
    "match_id", "duration", "duration_minutes", "radiant_win",
    "radiant_score", "dire_score", "total_kills", "kill_difference",
    "first_blood_time", "game_mode", "lobby_type", "leagueid",
    "match_date", "match_hour", "match_day_of_week",
    "radiant_team_id", "dire_team_id", "region", "patch",
    "is_stomp", "throw", "comeback", "ingestion_timestamp"
]

# Only select the columns that actually exist in this extract.
existing_cols = [c for c in matches_silver_cols if c in df_matches_clean.columns]
df_matches_silver = df_matches_clean.select(existing_cols)

# Each count() is a full Spark job, so evaluate each frame once and reuse it.
matches_before = df_matches_raw.count()
matches_after = df_matches_silver.count()
print(f"   Matches before cleaning: {matches_before:,}")
print(f"   Matches after cleaning: {matches_after:,}")
print(f"   Removed: {matches_before - matches_after:,} rows")

In [None]:
# 4. CLEAN PLAYERS DATA
print("\n[4/6] Cleaning Players data...")

# Columns carried over to Silver; anything else in the raw extract is dropped.
player_cols_to_keep = [
    "match_id", "player_slot", "account_id", "hero_id",
    "kills", "deaths", "assists", "kda",
    "gold_per_min", "xp_per_min", "net_worth", "total_gold", "total_xp",
    "hero_damage", "hero_healing", "tower_damage",
    "last_hits", "denies", "level",
    "isRadiant", "win", "lose",
    "lane", "lane_role", "is_roaming",
    "camps_stacked", "stuns", "teamfight_participation",
    "observer_kills", "sentry_kills",
    "personaname", "rank_tier"
]

# Only select the columns that actually exist in this extract.
existing_player_cols = [c for c in player_cols_to_keep if c in df_players_raw.columns]

# The explicit select already excludes the "Unnamed: 0" index column, so no
# separate drop is needed.
df_players_clean = df_players_raw \
    .select(existing_player_cols) \
    .dropDuplicates(["match_id", "player_slot"]) \
    .filter(col("match_id").isNotNull())

# Target Spark type for each numeric column that is normalised.
player_casts = {
    "kills": "int",
    "deaths": "int",
    "assists": "int",
    "kda": "double",
    "gold_per_min": "double",
    "xp_per_min": "double",
    "hero_damage": "long",
    "tower_damage": "long",
    "win": "int",
}
for cast_column, cast_type in player_casts.items():
    df_players_clean = df_players_clean.withColumn(
        cast_column, col(cast_column).cast(cast_type)
    )

# Missing combat stats are treated as zero.
df_players_clean = df_players_clean.fillna(
    {"kills": 0, "deaths": 0, "assists": 0, "kda": 0}
)

# Recompute KDA from its parts; with zero deaths the ratio is undefined, so
# kills + assists is used as-is. `round` is the Spark column function from
# the notebook's pyspark.sql.functions star import.
df_players_clean = df_players_clean \
    .withColumn("kda_calculated",
        when(col("deaths") == 0, col("kills") + col("assists"))
        .otherwise(round((col("kills") + col("assists")) / col("deaths"), 2))
    )

df_players_silver = df_players_clean \
    .withColumn("ingestion_timestamp", current_timestamp())

print(f"   Players before cleaning: {df_players_raw.count():,}")
print(f"   Players after cleaning: {df_players_silver.count():,}")
# The arrow was previously mis-encoded (UTF-8 bytes decoded as cp1252).
print(f"   Columns reduced: {len(df_players_raw.columns)} → {len(df_players_silver.columns)}")

In [None]:
# 5. CLEAN PICKS/BANS DATA
print("\n[5/6] Cleaning Picks/Bans data...")

# Keep only rows that identify both the match and the hero.
has_match_and_hero = col("match_id").isNotNull() & col("hero_id").isNotNull()
picks_bans_typed = df_picks_bans_raw.drop("Unnamed: 0").where(has_match_and_hero)

# Normalise column types in place (same column names, new types).
for column_name, spark_type in (
    ("is_pick", "boolean"),
    ("hero_id", "int"),
    ("team", "int"),
):
    picks_bans_typed = picks_bans_typed.withColumn(
        column_name, col(column_name).cast(spark_type)
    )

# "order" is republished as the integer column "pick_order"; the source
# "order" and "ord" columns are then removed.
df_picks_bans_clean = (
    picks_bans_typed
    .withColumn("pick_order", col("order").cast("int"))
    .drop("order", "ord")
    .withColumn("ingestion_timestamp", current_timestamp())
)

print(f"   Picks/Bans cleaned: {df_picks_bans_clean.count():,} rows")

In [None]:
# 6. OUTLIER DETECTION
print("\n[5.5/6] Outlier Detection...")

# A value more than this many standard deviations above the mean is an outlier.
OUTLIER_SIGMA = 3

# One aggregation pass for the mean and standard deviation of each metric.
stats = df_players_silver.select(
    mean("kills").alias("mean_kills"),
    stddev("kills").alias("std_kills"),
    mean("deaths").alias("mean_deaths"),
    stddev("deaths").alias("std_deaths"),
    mean("gold_per_min").alias("mean_gpm"),
    stddev("gold_per_min").alias("std_gpm")
).collect()[0]


def _format_stat(value):
    """Format a statistic to two decimals; Spark yields None when it is undefined."""
    return "n/a" if value is None else f"{value:.2f}"


def _upper_bound(mean_key, std_key):
    """Return mean + OUTLIER_SIGMA * std from `stats`, or None if either part is missing."""
    mean_value, std_value = stats[mean_key], stats[std_key]
    if mean_value is None or std_value is None:
        return None
    return mean_value + OUTLIER_SIGMA * std_value


print(f"  Player Statistics:")
print(f"    Kills: mean={_format_stat(stats['mean_kills'])}, std={_format_stat(stats['std_kills'])}")
print(f"    Deaths: mean={_format_stat(stats['mean_deaths'])}, std={_format_stat(stats['std_deaths'])}")
print(f"    GPM: mean={_format_stat(stats['mean_gpm'])}, std={_format_stat(stats['std_gpm'])}")

# Upper threshold per metric. A metric whose statistics are undefined
# (empty or single-row input) contributes no outliers instead of raising.
outlier_bounds = {
    "kills": _upper_bound("mean_kills", "std_kills"),
    "deaths": _upper_bound("mean_deaths", "std_deaths"),
    "gold_per_min": _upper_bound("mean_gpm", "std_gpm"),
}

# OR together one "value > threshold" test per metric. Starting from a false
# literal leaves the result unchanged when every threshold is available.
outlier_condition = lit(False)
for metric_column, bound in outlier_bounds.items():
    if bound is not None:
        outlier_condition = outlier_condition | (col(metric_column) > bound)

df_players_silver = df_players_silver.withColumn(
    "is_outlier",
    # A null comparison (null metric) falls through to otherwise(False).
    when(outlier_condition, True).otherwise(False)
)

outlier_count = df_players_silver.filter(col("is_outlier") == True).count()
total_players = df_players_silver.count()
# Guard the percentage against an empty table.
outlier_pct = (outlier_count / total_players * 100) if total_players else 0.0
print(f"   Outliers detected: {outlier_count:,} ({outlier_pct:.2f}%)")

In [None]:
# 7. WRITE TO SILVER LAYER AS DELTA
print("\n[6/6] Writing to Silver Layer...")

# (label shown in the log, DataFrame to persist, folder name under SILVER_PATH)
silver_outputs = [
    ("Matches", df_matches_silver, "cleaned_matches"),
    ("Players", df_players_silver, "cleaned_players"),
    ("Picks/Bans", df_picks_bans_clean, "cleaned_picks_bans"),
]

# Each table replaces whatever was previously stored at its target path.
for table_label, table_df, folder_name in silver_outputs:
    target_path = f"{SILVER_PATH}/{folder_name}"
    table_df.write.format("delta").mode("overwrite").save(target_path)
    print(f"   {table_label} written to: {target_path}")

banner = "=" * 60
print("\n" + banner)
print("SILVER LAYER COMPLETE!")
print(banner)

In [None]:
print("\n[7/7] Exporting CSV Samples from Silver Layer...")

from pyspark.sql.functions import rand
from datetime import datetime

SAMPLE_SIZE_SILVER = 1000
SAMPLES_PATH = f"{SILVER_PATH}/samples"
timestamp = datetime.now().strftime("%Y%m%d")


def _export_silver_sample(source_df, description, table_name):
    """Write a seeded random sample of `source_df` as a single-partition CSV.

    Args:
        source_df: Silver DataFrame to sample from.
        description: Wording used in the progress message.
        table_name: Table identifier embedded in the output file name.

    Returns:
        Tuple of (sampled DataFrame, output path).
    """
    print(f"\n  Exporting {SAMPLE_SIZE_SILVER} {description}...")
    # Shuffle with a fixed seed, then keep the first SAMPLE_SIZE_SILVER rows.
    sample_df = source_df.orderBy(rand(seed=42)).limit(SAMPLE_SIZE_SILVER)
    target = (
        f"{SAMPLES_PATH}/silver_{table_name}_sample_"
        f"{SAMPLE_SIZE_SILVER}rows_{timestamp}.csv"
    )
    sample_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(target)
    print(f"    Written to: {target}")
    return sample_df, target


matches_sample, matches_sample_file = _export_silver_sample(
    df_matches_silver, "matches", "cleaned_matches"
)
players_sample, players_sample_file = _export_silver_sample(
    df_players_silver, "player records", "cleaned_players"
)
picks_bans_sample, picks_bans_sample_file = _export_silver_sample(
    df_picks_bans_clean, "picks/bans records", "cleaned_picks_bans"
)

print("\n  Silver samples exported successfully!")
print(f"  Location: {SAMPLES_PATH}")

In [None]:
print("\n[Verification] Reading back from Silver...")

# Re-read the Delta tables from storage to confirm the writes landed.
df_verify_matches = spark.read.format("delta").load(f"{SILVER_PATH}/cleaned_matches")
df_verify_players = spark.read.format("delta").load(f"{SILVER_PATH}/cleaned_players")

print(f"  Silver Matches: {df_verify_matches.count():,} rows")
print(f"  Silver Players: {df_verify_players.count():,} rows")

print("\n  Sample Matches:")
df_verify_matches.select("match_id", "duration_minutes", "radiant_win", "total_kills", "match_date").show(5)

print("\n  Sample Players:")
df_verify_players.select("match_id", "account_id", "hero_id", "kills", "deaths", "assists", "kda", "win").show(5)

print("\n  CSV Samples created:")
try:
    sample_files = dbutils.fs.ls(f"{SILVER_PATH}/samples")
except Exception as e:
    # Listing is best-effort, but report what actually went wrong: a missing
    # folder, an authentication failure and an undefined `dbutils` all end up
    # here and should not be reported as the same thing.
    first_line = (str(e).splitlines() or [""])[0]
    print(f"    (Could not list samples directory: {type(e).__name__}: {first_line})")
else:
    for sample_entry in sample_files:
        # Only entries whose name ends in ".csv/" are listed, i.e. the
        # sample outputs, which appear as directories.
        if sample_entry.name.endswith('.csv/'):
            print(f"    {sample_entry.name}")