# ML: Player Churn Prediction Model

**Notebook:** `01_player_churn_prediction`  
**Type:** Machine Learning  
**Purpose:** Predict player churn risk using historical behavior patterns

---

## Overview

This notebook demonstrates building a player churn prediction model using Fabric's ML capabilities. The model identifies players at risk of becoming inactive, enabling proactive retention efforts.

### Business Value
- Early identification of at-risk players
- Targeted retention campaigns
- Improved player lifetime value
- Optimized marketing spend

In [None]:
# Notebook-wide settings
CHURN_DAYS_THRESHOLD = 30  # 30 or more days without activity counts as churned
MODEL_NAME = "player_churn_model"  # name of the churn model
GOLD_LAKEHOUSE = "lh_gold"  # Gold-layer lakehouse identifier

In [None]:
# Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
)
from pyspark.ml.classification import (
    RandomForestClassifier, LogisticRegression, GBTClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import mlflow
import mlflow.spark
from datetime import datetime, timedelta

# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()

# Enable MLflow autologging
# Must run after the session above exists.
# NOTE(review): per the MLflow documentation, mlflow.spark.autolog() records
# Spark datasource reads (path/version/format), not pyspark.ml estimator
# parameters - confirm this is the autolog flavor intended here.
mlflow.spark.autolog()

## Feature Engineering

In [None]:
def create_player_features(snapshot_date: str = None):
    """
    Create the feature dataset for churn prediction.

    Reads SPIN events of non-anonymous players from ``silver_slot_telemetry``,
    keeps only events dated on or before ``snapshot_date``, aggregates them
    per ``player_id`` and joins the current ``dim_player`` attributes.

    Features include:
    - Recency: days between the snapshot date and the last event
    - Frequency: event count and distinct active days in the last 90 days
    - Monetary: total bet (coin-in) and win (coin-out) amounts in the last 90 days
    - Behavioral: win rate, average bet, bet volatility, spins per session,
      distinct machines and sessions
    - Tenure: days between the first and last event

    Args:
        snapshot_date: Reference date as "YYYY-MM-DD". Defaults to today's
            date taken from ``datetime.now()``.

    Returns:
        Spark DataFrame with the aggregated features, ``loyalty_tier``,
        ``is_vip`` and the ``churned`` label (1 when
        ``days_since_last_visit >= CHURN_DAYS_THRESHOLD``, otherwise 0).
    """
    if snapshot_date is None:
        snapshot_date = datetime.now().strftime("%Y-%m-%d")

    snapshot = lit(snapshot_date)
    # Condition selecting events inside the trailing 90-day window.
    in_last_90d = col("event_timestamp") >= date_sub(snapshot, 90)

    # Read slot telemetry.
    # The last filter applies the snapshot as an upper bound: without it a
    # historical snapshot_date would still aggregate later events, producing
    # negative recency values and inflated 90-day metrics. Events with a null
    # timestamp cannot be placed relative to the snapshot and are dropped too.
    df_slots = spark.table("silver_slot_telemetry") \
        .filter(col("player_id") != "ANONYMOUS") \
        .filter(col("event_type") == "SPIN") \
        .filter(datediff(snapshot, col("event_timestamp")) >= 0)

    # Read player dimension (current records only)
    df_player = spark.table("dim_player") \
        .filter(col("is_current") == True)

    # Calculate player-level metrics.
    # NOTE: max/min/sum/count below are the Spark SQL functions brought in by
    # the star import, not the Python builtins.
    df_features = df_slots.groupBy("player_id").agg(
        # Recency
        datediff(snapshot, max("event_timestamp")).alias("days_since_last_visit"),

        # Frequency (last 90 days)
        count(when(in_last_90d, 1)).alias("visits_90d"),
        countDistinct(when(in_last_90d, to_date("event_timestamp"))).alias("active_days_90d"),

        # Monetary (events outside the window contribute 0)
        sum(when(in_last_90d, col("bet_amount")).otherwise(0)).alias("coin_in_90d"),
        sum(when(in_last_90d, col("win_amount")).otherwise(0)).alias("coin_out_90d"),

        # Behavioral (over all retained events, not just the 90-day window)
        avg("bet_amount").alias("avg_bet_amount"),
        stddev("bet_amount").alias("bet_volatility"),
        count(when(col("win_amount") > 0, 1)).alias("win_count"),
        count("*").alias("total_spins"),
        countDistinct("machine_id").alias("unique_machines"),
        countDistinct("session_id").alias("total_sessions"),

        # Time patterns
        min("event_timestamp").alias("first_visit"),
        max("event_timestamp").alias("last_visit")
    )

    # Calculate derived features
    df_features = df_features \
        .withColumn("win_rate", col("win_count") / col("total_spins")) \
        .withColumn("net_result_90d", col("coin_out_90d") - col("coin_in_90d")) \
        .withColumn("avg_spins_per_session", col("total_spins") / col("total_sessions")) \
        .withColumn("tenure_days", datediff(col("last_visit"), col("first_visit"))) \
        .withColumn("visit_frequency", col("active_days_90d") / 90.0)

    # Join with player dimension; a left join keeps players that have
    # telemetry but no current dimension row (their attributes become null).
    df_features = df_features.join(
        df_player.select("player_id", "loyalty_tier", "is_vip"),
        "player_id",
        "left"
    )

    # Create churn label (1 = churned, 0 = active)
    df_features = df_features.withColumn(
        "churned",
        when(col("days_since_last_visit") >= CHURN_DAYS_THRESHOLD, 1).otherwise(0)
    )

    return df_features

# Build the feature set as of today and report its size and schema
df_features = create_player_features()
player_total = df_features.count()
print(f"Created features for {player_total} players")
df_features.printSchema()

In [None]:
# Label distribution: number of players per value of "churned"
label_counts = df_features.groupBy("churned").count()
label_counts.show()

# Summary statistics for a handful of key features
stat_columns = [
    "days_since_last_visit",
    "visits_90d",
    "coin_in_90d",
    "win_rate",
    "avg_bet_amount",
    "tenure_days",
]
df_features.select(*stat_columns).describe().show()

## Model Training

In [None]:
# Prepare data for ML

# Feature columns
# "days_since_last_visit" is deliberately NOT a model input: the "churned"
# label is defined as days_since_last_visit >= CHURN_DAYS_THRESHOLD, so the
# model could reproduce the label from that single column (target leakage).
# The column stays in the DataFrame for reporting.
# NOTE(review): the 90-day metrics still overlap the period that defines the
# label; a stricter design would compute features as of a cutoff that
# precedes the label window.
numeric_features = [
    "visits_90d", "active_days_90d",
    "coin_in_90d", "coin_out_90d", "avg_bet_amount", "bet_volatility",
    "win_rate", "total_spins", "unique_machines", "total_sessions",
    "avg_spins_per_session", "tenure_days", "visit_frequency"
]

categorical_features = ["loyalty_tier"]

# Handle nulls
# VectorAssembler raises on null inputs by default, so every numeric model
# input is filled with 0 (e.g. bet_volatility for single-spin players, or
# avg_spins_per_session when no session id was recorded), not just one column.
df_ml = df_features.na.fill(0, subset=numeric_features).na.fill({
    "loyalty_tier": "Unknown",
    "is_vip": False
})

# Train/test split (80/20, fixed seed for reproducibility)
train_df, test_df = df_ml.randomSplit([0.8, 0.2], seed=42)
print(f"Training set: {train_df.count()} records")
print(f"Test set: {test_df.count()} records")

In [None]:
# Assemble the ML pipeline: index -> one-hot -> assemble -> scale -> classify

# Map loyalty tier strings to numeric indices; unseen values get their own bucket
tier_indexer = StringIndexer(inputCol="loyalty_tier", outputCol="loyalty_tier_idx", handleInvalid="keep")

# One-hot encode the tier index
tier_encoder = OneHotEncoder(inputCol="loyalty_tier_idx", outputCol="loyalty_tier_vec")

# Combine the numeric columns and the encoded tier into a single vector
assembler_inputs = numeric_features + ["loyalty_tier_vec"]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features_raw")

# Standardize the assembled vector (zero mean, unit standard deviation)
scaler = StandardScaler(inputCol="features_raw", outputCol="features", withStd=True, withMean=True)

# Random Forest classifier trained against the "churned" label
rf = RandomForestClassifier(labelCol="churned", featuresCol="features", numTrees=100, maxDepth=10, seed=42)

# Chain the stages in execution order
pipeline_stages = [tier_indexer, tier_encoder, assembler, scaler, rf]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Train model with MLflow tracking
with mlflow.start_run(run_name="player_churn_rf"):
    # Log parameters. Hyperparameters are read back from the estimator so the
    # logged values cannot drift from what the pipeline actually trains with.
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())
    mlflow.log_param("churn_threshold_days", CHURN_DAYS_THRESHOLD)

    # Train
    print("Training model...")
    model = pipeline.fit(train_df)

    # Predict on the held-out test set (reused by the evaluation cells)
    predictions = model.transform(test_df)

    # Evaluate with area under the ROC curve
    evaluator = BinaryClassificationEvaluator(
        labelCol="churned",
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC"
    )

    auc = evaluator.evaluate(predictions)
    print(f"AUC-ROC: {auc:.4f}")

    # Log metrics
    mlflow.log_metric("auc_roc", auc)

    # Log the fitted pipeline and register it under the configured model name
    # so the run produces a named, versioned model.
    mlflow.spark.log_model(model, "model", registered_model_name=MODEL_NAME)

    print("Model logged to MLflow")

## Model Evaluation

In [None]:
# Confusion matrix
confusion = predictions.groupBy("churned", "prediction").count() \
    .orderBy("churned", "prediction")
confusion.show()

# Calculate precision, recall
# The four cell counts are read from the aggregated matrix instead of running
# a separate filter().count() job per cell over the uncached predictions.
# A (label, prediction) combination that never occurs has no row, hence the 0 default.
cells = {
    (int(row["churned"]), int(row["prediction"])): row["count"]
    for row in confusion.collect()
}
tp = cells.get((1, 1), 0)
fp = cells.get((0, 1), 0)
fn = cells.get((1, 0), 0)
tn = cells.get((0, 0), 0)

# Guard each ratio against an empty denominator
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
# Feature importance
rf_model = model.stages[-1]  # the fitted Random Forest is the last pipeline stage
importances = rf_model.featureImportances.toArray()

# The assembled vector holds one slot per numeric feature followed by the
# one-hot slots of loyalty_tier. Zipping the names directly against the vector
# would label only the first one-hot slot and silently drop the rest, so the
# one-hot slots are summed into a single "loyalty_tier" entry.
n_numeric = len(numeric_features)
feature_importance = [
    (name, float(score))
    for name, score in zip(numeric_features, importances[:n_numeric])
]
# ndarray.sum() is used because the star import from pyspark.sql.functions
# shadows the builtin sum().
feature_importance.append(("loyalty_tier", float(importances[n_numeric:].sum())))
feature_importance.sort(key=lambda x: x[1], reverse=True)

print("\nTop 10 Feature Importance:")
for feature, importance in feature_importance[:10]:
    print(f"  {feature}: {importance:.4f}")

## Score Players

In [None]:
# Run the fitted pipeline over the full prepared dataset
df_scored = model.transform(df_ml)

# Extract churn probability
from pyspark.ml.functions import vector_to_array

# Element 1 of the probability vector is the probability of label 1 (churned)
churn_prob = vector_to_array("probability")[1].alias("churn_probability")

# Bucket the probability into four tiers, checked from highest to lowest
risk_tier_expr = (
    when(col("churn_probability") >= 0.8, "Critical")
    .when(col("churn_probability") >= 0.6, "High")
    .when(col("churn_probability") >= 0.4, "Medium")
    .otherwise("Low")
)

score_columns = [
    "player_id",
    "loyalty_tier",
    "days_since_last_visit",
    "coin_in_90d",
    "prediction",
    churn_prob,
]
df_scores = df_scored.select(*score_columns)
df_scores = df_scores.withColumn("risk_tier", risk_tier_expr)

# Display the 20 riskiest players among the two highest tiers
print("High-Risk Players:")
high_risk = df_scores.filter(col("risk_tier").isin(["Critical", "High"]))
high_risk.orderBy(col("churn_probability").desc()).show(20)

In [None]:
# Persist the scores as a Delta table, replacing any previous contents
# NOTE(review): the target is hard-coded as "gold" while the configuration
# cell defines GOLD_LAKEHOUSE = "lh_gold" - confirm which name is intended.
scores_writer = df_scores.write.format("delta").mode("overwrite")
scores_writer.saveAsTable("gold.player_churn_scores")

print("Churn scores saved to gold.player_churn_scores")

## Summary

In [None]:
# Aggregate player count, 90-day coin-in and mean churn probability per risk
# tier, highest average probability first
risk_groups = df_scores.groupBy("risk_tier")
summary = risk_groups.agg(
    count("*").alias("player_count"),
    sum("coin_in_90d").alias("total_coin_in_90d"),
    avg("churn_probability").alias("avg_churn_prob"),
)
summary = summary.orderBy(col("avg_churn_prob").desc())

banner = "=" * 50
print("\n" + banner)
print("CHURN RISK SUMMARY")
print(banner)
summary.show()