# Bronze Layer: Player Profiles Ingestion

**Notebook:** `02_bronze_player_profiles`  
**Layer:** Bronze (Raw)  
**Purpose:** Ingest player profile data with PII handling considerations

---

## Overview

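This notebook ingests player profile data into the Bronze Lakehouse. Player data contains PII (Personally Identifiable Information) and requires special handling for compliance: when `HASH_PII` is enabled, SHA-256 hash columns are added for the email, phone, SSN last four digits, and full name, while the original values are still stored alongside them. The records ingested here are randomly generated sample data.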

In [None]:
# Configuration
# NOTE(review): BRONZE_LAKEHOUSE and SOURCE_PATH are not referenced by any of
# the cells visible in this notebook — confirm whether they are still needed.
BRONZE_LAKEHOUSE = "lh_bronze"
TABLE_NAME = "bronze_player_profiles"  # Table name used for the Delta write and the verification read
SOURCE_PATH = "Files/raw/player_profiles/"
HASH_PII = True  # Hash sensitive fields at Bronze layer

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
import uuid
import random

# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Player profile schema
# Each entry is (column name, Spark type class, nullable). Only player_id is
# declared non-nullable; dates and timestamps are carried as plain strings.
PLAYER_SCHEMA = StructType([
    StructField(field_name, field_type(), is_nullable)
    for field_name, field_type, is_nullable in (
        ("player_id", StringType, False),
        ("first_name", StringType, True),
        ("last_name", StringType, True),
        ("email", StringType, True),
        ("phone", StringType, True),
        ("date_of_birth", StringType, True),
        ("ssn_last_four", StringType, True),
        ("address_line1", StringType, True),
        ("address_line2", StringType, True),
        ("city", StringType, True),
        ("state", StringType, True),
        ("zip_code", StringType, True),
        ("country", StringType, True),
        ("loyalty_tier", StringType, True),
        ("loyalty_points", IntegerType, True),
        ("enrollment_date", StringType, True),
        ("preferred_property", StringType, True),
        ("marketing_opt_in", BooleanType, True),
        ("self_exclusion_status", BooleanType, True),
        ("responsible_gaming_limit", DoubleType, True),
        ("vip_host_id", StringType, True),
        ("created_at", StringType, True),
        ("updated_at", StringType, True),
    )
])

In [None]:
# Generate sample player data
def generate_sample_players(num_records: int = 5000):
    """Build a Spark DataFrame of synthetic player profiles.

    Every value is either randomly generated or derived from the row index;
    nothing is read from storage. Rows are created against ``PLAYER_SCHEMA``.

    Args:
        num_records: Number of player rows to generate. Player ids are
            ``PLY`` followed by the zero-padded row index, starting at 0.

    Returns:
        A DataFrame with ``num_records`` rows and the ``PLAYER_SCHEMA`` columns.
    """
    first_names = ["James", "Mary", "John", "Patricia", "Robert", "Jennifer", "Michael", "Linda"]
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis"]
    tiers = ["Bronze", "Silver", "Gold", "Platinum", "Diamond"]
    states = ["NV", "CA", "AZ", "UT", "CO", "NM", "TX", "FL"]
    # responsible_gaming_limit is a DoubleType column, and Spark's schema
    # verification rejects Python ints for DoubleType, so the candidate
    # limits must be floats (or None for "no limit set").
    gaming_limits = [None, 500.0, 1000.0, 2500.0, 5000.0, 10000.0]

    # Take the timestamp once per batch so created_at and updated_at agree
    # for these freshly generated rows.
    generated_at = datetime.now().isoformat()

    def build_player(i):
        """Return the dict for the player at row index ``i``."""
        return {
            "player_id": f"PLY{i:08d}",
            "first_name": random.choice(first_names),
            "last_name": random.choice(last_names),
            "email": f"player{i}@email.com",
            "phone": f"+1{random.randint(2000000000, 9999999999)}",
            # Day of month is capped at 28 so every year/month combination is a valid date.
            "date_of_birth": f"{random.randint(1950, 2003)}-{random.randint(1,12):02d}-{random.randint(1,28):02d}",
            "ssn_last_four": f"{random.randint(1000, 9999)}",
            "address_line1": f"{random.randint(100, 9999)} Main St",
            # Roughly 30% of players get a second address line.
            "address_line2": f"Apt {random.randint(1, 500)}" if random.random() > 0.7 else None,
            "city": "Las Vegas",
            "state": random.choice(states),
            "zip_code": f"{random.randint(10000, 99999)}",
            "country": "USA",
            "loyalty_tier": random.choice(tiers),
            "loyalty_points": random.randint(0, 500000),
            "enrollment_date": f"{random.randint(2015, 2024)}-{random.randint(1,12):02d}-{random.randint(1,28):02d}",
            "preferred_property": f"CAS{random.randint(1, 5):03d}",
            "marketing_opt_in": random.random() > 0.3,
            "self_exclusion_status": random.random() < 0.02,
            "responsible_gaming_limit": random.choice(gaming_limits),
            # Roughly 20% of players are assigned a VIP host.
            "vip_host_id": f"HOST{random.randint(1, 50):03d}" if random.random() > 0.8 else None,
            "created_at": generated_at,
            "updated_at": generated_at,
        }

    data = [build_player(i) for i in range(num_records)]

    return spark.createDataFrame(data, schema=PLAYER_SCHEMA)

# Build the sample batch; the explicit argument overrides the function's
# default of 5000 rows.
df_raw = generate_sample_players(10000)
# count() is a Spark action, so this line runs a job to compute the row total.
print(f"Generated {df_raw.count()} player records")

In [None]:
# Hash PII fields for Bronze layer security
def hash_pii_fields(df, salt=None):
    """Add SHA-256 hash columns for the sensitive PII fields.

    Four columns are appended: ``email_hash``, ``phone_hash``, ``ssn_hash``
    and ``name_hash`` (first and last name joined by a single space). The
    original PII columns are kept unchanged in the result, so the returned
    DataFrame still contains raw PII.

    Args:
        df: DataFrame with ``email``, ``phone``, ``ssn_last_four``,
            ``first_name`` and ``last_name`` columns.
        salt: Optional secret string prepended to every value before hashing.
            Without it, hashes of low-entropy values (a four-digit SSN
            suffix, a phone number) can be reversed by hashing every
            candidate value. The default ``None`` produces the plain,
            unsalted hashes.

    Returns:
        A new DataFrame with the four hash columns appended.
    """
    def _digest(value):
        # concat() returns NULL when any input is NULL, so a NULL source
        # value still produces a NULL hash with or without a salt.
        salted = value if salt is None else concat(lit(salt), value)
        return sha2(salted, 256)

    # concat_ws skips NULL parts, so a missing first or last name does not
    # turn the whole name into NULL.
    full_name = concat_ws(" ", col("first_name"), col("last_name"))

    df_hashed = (
        df
        .withColumn("email_hash", _digest(col("email")))
        .withColumn("phone_hash", _digest(col("phone")))
        .withColumn("ssn_hash", _digest(col("ssn_last_four")))
        .withColumn("name_hash", _digest(full_name))
    )

    return df_hashed

# Add the PII hash columns only when the HASH_PII switch is on.
df_processed = hash_pii_fields(df_raw) if HASH_PII else df_raw

# Add metadata
# _batch_id is formatted once on the driver, so every row written by this run
# carries the same value.
# NOTE(review): _record_hash is derived from player_id alone, so it does not
# change when other attributes of the record change — confirm this is intended.
df_bronze = (
    df_processed
    .withColumn("_ingestion_timestamp", current_timestamp())
    .withColumn("_batch_id", lit(datetime.now().strftime("%Y%m%d%H%M%S")))
    .withColumn("_record_hash", sha2(col("player_id"), 256))
)

# Report what actually happened instead of always claiming hashing ran.
print("PII hashing complete" if HASH_PII else "PII hashing skipped (HASH_PII is False)")

In [None]:
# Write to Bronze
# Append this batch to the Delta table; mergeSchema lets columns that are new
# in this batch be added to the existing table schema.
(
    df_bronze.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(TABLE_NAME)
)

# The count is computed from the in-memory batch, not read back from the table.
print(f"Wrote {df_bronze.count()} records to {TABLE_NAME}")

In [None]:
# Verify
# Read the table back and print five rows of a few columns, untruncated.
(
    spark.table(TABLE_NAME)
    .select(
        "player_id",
        "loyalty_tier",
        "loyalty_points",
        "email_hash",
        "_ingestion_timestamp",
    )
    .show(5, truncate=False)
)