In [1]:
import os
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
import mlflow
import pandas as pd
from datetime import datetime


## Spark Initialization

In [2]:
spark = (
    SparkSession.builder
    .appName("churn_inference_notebook")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", "4")  # faster for local debugging
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")
print("âœ… Spark started")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/05 02:31:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


âœ… Spark started


## Load features from feature store

In [9]:
def load_features(spark, snapshot_date_str):
    """Load inference features for a given snapshot date."""
    snapshot_date = datetime.strptime(snapshot_date_str, "%Y-%m-%d").date()

    path = "/app/datamart/gold/feature_store/"
    print(f"ðŸ“‚ Loading feature store: {path}")

    df = (
        spark.read.parquet(path)
        .filter(F.col("snapshot_date") == F.lit(snapshot_date))
    )

    print(f"âœ… Loaded features: {df.count()} rows for snapshot_date={snapshot_date}")
    return df


## Load MLflow Model

def load_mlflow_model(model_name):
    """
    Load an MLflow model by registered name or full URI.
    """
    mlflow.set_tracking_uri("http://mlflow:5000")

    if model_name.startswith("models:/"):
        model_uri = model_name
    else:
        model_uri = f"models:/{model_name}/Production"

    logger.info(f"Loading MLflow model from: {model_uri}")

    try:
        model = mlflow.sklearn.load_model(model_uri)
        logger.info("âœ… MLflow model loaded successfully")
        return model
    except Exception as e:
        logger.error(f"Failed to load MLflow model: {e}")
        raise


In [4]:
def load_mlflow_model(model_name_or_uri):
    """Load MLflow model by registered name or full model URI."""
    
    print(f"ðŸ“¦ Loading MLflow model: {model_name_or_uri}")

    # model = mlflow.pyfunc.load_model(model_name_or_uri)
    model = mlflow.sklearn.load_model(model_uri)

    print("âœ… Model loaded successfully!")
    return model


25/11/05 02:31:40 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Save predictions to datamart

def save_predictions(spark, df_predictions, model_name, snapshot_date_str):
    """
    Save predictions to parquet under:
    datamart/gold/model_predictions/<model_name>/
    """
    base_dir = f"datamart/gold/model_predictions/{model_name}/"
    os.makedirs(base_dir, exist_ok=True)

    filename = f"{model_name}_predictions_{snapshot_date_str.replace('-', '_')}.parquet"
    filepath = os.path.join(base_dir, filename)

    (
        spark.createDataFrame(df_predictions)
            .write.mode("overwrite")
            .parquet(filepath)
    )

    logger.info(f"âœ… Predictions saved: {filepath}")


In [5]:
def save_predictions(spark, predictions_pdf, model_name, snapshot_date_str):
    """Save inference prediction results to parquet."""
    
    output_dir = "/app/datamart/gold/inference_output/"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(
        output_dir,
        f"{model_name}_predictions_{snapshot_date_str.replace('-', '')}.parquet"
    )

    # Convert pandas â†’ spark
    preds_sdf = spark.createDataFrame(predictions_pdf)

    preds_sdf.write.mode("overwrite").parquet(output_path)

    print(f"âœ… Predictions saved to: {output_path}")


## Main Inference Pipeline

def main(snapshot_date_str, model_name):

    logger.info("=== Starting Model Inference Job ===")

    # Spark session
    spark = pyspark.sql.SparkSession.builder \
        .appName("inference") \
        .master("local[*]") \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    # Load features
    features_sdf = load_features(spark, snapshot_date_str)

    logger.info("Feature schema:")
    features_sdf.printSchema()

    # Convert to pandas
    features_pdf = features_sdf.toPandas()
    logger.info(f"Converted Spark â†’ Pandas: shape={features_pdf.shape}")

    # Extract feature columns
    # feature_cols = [c for c in features_pdf.columns if c.startswith("fe_")]
    feature_cols = ['tenure_days_at_snapshot', 'registered_via', 'city_clean', 
                'sum_secs_w30', 'active_days_w30', 'complete_rate_w30', 
                'sum_secs_w7', 'engagement_ratio_7_30', 'days_since_last_play', 
                'trend_secs_w30', 'auto_renew_share', 'last_is_auto_renew']
    X_inference = features_pdf[feature_cols]

    # Load MLflow model
    model = load_mlflow_model(model_name)

    # Predict
    y_proba = model.predict_proba(X_inference)[:, 1]

    # Output dataframe
    output = features_pdf[["msno", "snapshot_date"]].copy()
    output["model_name"] = model_name
    output["model_predictions"] = y_proba

    # Save
    save_predictions(spark, output, model_name, snapshot_date_str)

    spark.stop()
    logger.info("=== Inference Job Completed ===")


In [7]:
def run_inference(snapshot_date_str, model_uri):
    """
    Full inference pipeline:
    - load features
    - convert to pandas
    - load MLflow model
    - predict_proba
    - save output parquet
    """

    # --- Load features ---
    features_sdf = load_features(spark, snapshot_date_str)

    # Convert to pandas for ML model
    features_pdf = features_sdf.toPandas()

    # Identify feature columns (assuming fe_ prefix)
    # feature_cols = [c for c in features_pdf.columns if c.startswith("fe_")]
    feature_cols = ['tenure_days_at_snapshot',
                'registered_via',
                'city_clean', 
                'sum_secs_w30',
                'active_days_w30',
                'complete_rate_w30',
                'sum_secs_w7',
                'engagement_ratio_7_30',
                'days_since_last_play',
                'trend_secs_w30',
                'auto_renew_share',
                'last_is_auto_renew']
    X = features_pdf[feature_cols]

    # --- Load MLflow model ---
    model = load_mlflow_model(model_uri)

    # --- Inference ---
    preds = model.predict(X)
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X)[:, 1]
    else:
        proba = preds  # fallback for regressors / non-proba models

    # --- Build output dataframe ---
    output = features_pdf[["msno", "snapshot_date"]].copy()
    output["model_name"] = model_uri
    output["prediction"] = preds
    output["probability"] = proba

    # --- Save to parquet ---
    save_predictions(spark, output, model_uri.replace("/", "_"), snapshot_date_str)

    return output


## Data Preparation: Handle Missing Values & Encoding

**Strategy**:
1. Create missing value indicators
2. Fill missing values with 0
3. One-hot encode for Logistic Regression (with scaling)
4. Keep original encoding for tree-based models

In [7]:
from sklearn.preprocessing import StandardScaler

# Identify categorical and numerical columns
categorical_cols = ['registered_via', 'city_clean']
numerical_cols = [col for col in feature_cols if col not in categorical_cols]

# Define feature groups for missing indicators
activity_features = ['sum_secs_w30', 'active_days_w30', 'complete_rate_w30', 
                     'sum_secs_w7', 'engagement_ratio_7_30', 'days_since_last_play', 
                     'trend_secs_w30']
demo_features = ['tenure_days_at_snapshot', 'registered_via', 'city_clean']

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

Categorical columns: ['registered_via', 'city_clean']
Numerical columns: ['tenure_days_at_snapshot', 'sum_secs_w30', 'active_days_w30', 'complete_rate_w30', 'sum_secs_w7', 'engagement_ratio_7_30', 'days_since_last_play', 'trend_secs_w30', 'auto_renew_share', 'last_is_auto_renew']


### Step 1: Create Missing Value Indicators

In [8]:
print("\n[STEP 1] Creating missing value indicator features...")

# Create indicator features for ALL splits
for df in [X_train, X_val, X_test, X_oot]:
    # Indicator for missing activity features
    df['is_missing_activity'] = df['sum_secs_w30'].isnull().astype(int)
    
    # Indicator for missing demographic features
    df['is_missing_demo'] = df['tenure_days_at_snapshot'].isnull().astype(int)

print(f"  âœ“ Created 'is_missing_activity' indicator")
print(f"  âœ“ Created 'is_missing_demo' indicator")
print(f"  Train - Missing activity: {X_train['is_missing_activity'].sum():,} ({X_train['is_missing_activity'].mean():.1%})")
print(f"  Train - Missing demo: {X_train['is_missing_demo'].sum():,} ({X_train['is_missing_demo'].mean():.1%})")


[STEP 1] Creating missing value indicator features...
  âœ“ Created 'is_missing_activity' indicator
  âœ“ Created 'is_missing_demo' indicator
  Train - Missing activity: 268,550 (18.8%)
  Train - Missing demo: 140,268 (9.8%)


### Step 2: Fill Missing Values

In [9]:
print("\n[STEP 2] Filling missing values with 0...")

# Fill missing values with 0 (after creating indicators)
for df in [X_train, X_val, X_test, X_oot]:
    df.fillna(0, inplace=True)

print("  âœ“ All missing values filled with 0")

# Verify no missing values remain
print(f"  Train missing values: {X_train.isnull().sum().sum()}")
print(f"  Val missing values: {X_val.isnull().sum().sum()}")
print(f"  Test missing values: {X_test.isnull().sum().sum()}")
print(f"  OOT missing values: {X_oot.isnull().sum().sum()}")


[STEP 2] Filling missing values with 0...
  âœ“ All missing values filled with 0
  Train missing values: 0
  Val missing values: 0
  Test missing values: 0
  OOT missing values: 0


### Step 3: Prepare Data for Logistic Regression (One-Hot Encoding + Scaling)

In [10]:
print("\n[STEP 3] One-hot encoding categorical features for Logistic Regression...")

# Get dummies for registered_via and city_clean
X_train_lr = pd.get_dummies(X_train, columns=['registered_via', 'city_clean'], 
                             drop_first=True, dtype=int)
X_val_lr = pd.get_dummies(X_val, columns=['registered_via', 'city_clean'], 
                           drop_first=True, dtype=int)
X_test_lr = pd.get_dummies(X_test, columns=['registered_via', 'city_clean'], 
                            drop_first=True, dtype=int)
X_oot_lr = pd.get_dummies(X_oot, columns=['registered_via', 'city_clean'], 
                           drop_first=True, dtype=int)

# Align columns across all datasets (handle unseen categories)
all_columns = X_train_lr.columns
for df in [X_val_lr, X_test_lr, X_oot_lr]:
    # Add missing columns
    for col in all_columns:
        if col not in df.columns:
            df[col] = 0

# Reassign to ensure column alignment
X_val_lr = X_val_lr[all_columns]
X_test_lr = X_test_lr[all_columns]
X_oot_lr = X_oot_lr[all_columns]

print(f"  âœ“ One-hot encoded 'registered_via' and 'city_clean'")
print(f"  âœ“ Total features after encoding: {X_train_lr.shape[1]}")


[STEP 3] One-hot encoding categorical features for Logistic Regression...
  âœ“ One-hot encoded 'registered_via' and 'city_clean'
  âœ“ Total features after encoding: 37


### Step 4: Feature Scaling for Logistic Regression

In [11]:
print("\n[STEP 4] Scaling numeric features with StandardScaler...")

# Identify numeric columns (exclude one-hot encoded columns and binary indicators)
numeric_cols = [col for col in X_train_lr.columns 
                if not col.startswith('registered_via_') 
                and not col.startswith('city_clean_')
                and col not in ['is_missing_activity', 'is_missing_demo', 'last_is_auto_renew']]

print(f"  Scaling {len(numeric_cols)} numeric features: {numeric_cols}")

# Initialize scaler
scaler = StandardScaler()

# Fit on training data only
X_train_lr[numeric_cols] = scaler.fit_transform(X_train_lr[numeric_cols])

# Transform validation, test, and OOT using the same scaler
X_val_lr[numeric_cols] = scaler.transform(X_val_lr[numeric_cols])
X_test_lr[numeric_cols] = scaler.transform(X_test_lr[numeric_cols])
X_oot_lr[numeric_cols] = scaler.transform(X_oot_lr[numeric_cols])

print("  âœ“ Features scaled (mean=0, std=1)")
print(f"  âœ“ Logistic Regression data ready: {X_train_lr.shape}")


[STEP 4] Scaling numeric features with StandardScaler...
  Scaling 9 numeric features: ['tenure_days_at_snapshot', 'sum_secs_w30', 'active_days_w30', 'complete_rate_w30', 'sum_secs_w7', 'engagement_ratio_7_30', 'days_since_last_play', 'trend_secs_w30', 'auto_renew_share']
  âœ“ Features scaled (mean=0, std=1)
  âœ“ Logistic Regression data ready: (1430517, 37)


### Step 5: Prepare Data for Tree-Based Models (Original Encoding, No Scaling)

print("\n[STEP 5] Preparing data for tree-based models (XGBoost, Random Forest)...")

# For tree-based models, use the data after missing value handling but before one-hot encoding
# Tree models can handle label-encoded categoricals and don't need scaling
X_train_tree = X_train.copy()
X_val_tree = X_val.copy()
X_test_tree = X_test.copy()
X_oot_tree = X_oot.copy()

# Ensure categorical columns are integer type (safe now after fillna)
for col in categorical_cols:
    X_train_tree[col] = X_train_tree[col].astype(int)
    X_val_tree[col] = X_val_tree[col].astype(int)
    X_test_tree[col] = X_test_tree[col].astype(int)
    X_oot_tree[col] = X_oot_tree[col].astype(int)

print(f"  âœ“ Tree-based model data ready: {X_train_tree.shape}")
print(f"  âœ“ No scaling applied (tree models don't need it)")

## Entry Point

In [10]:
snapshot_date = "2016-05-01"

# Choose model from MLflow registry
model_uri = "models:/LogisticRegression/1"   # or /XGBoost/1, /RandomForest/1

results = run_inference(snapshot_date, model_uri)
results.head()


ðŸ“‚ Loading feature store: /app/datamart/gold/feature_store/


                                                                                

âœ… Loaded features: 26864 rows for snapshot_date=2016-05-01


                                                                                

ðŸ“¦ Loading MLflow model: models:/LogisticRegression/1
âœ… Model loaded successfully!


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- city_clean
- registered_via
Feature names seen at fit time, yet now missing:
- city_clean_1.0
- city_clean_10.0
- city_clean_11.0
- city_clean_12.0
- city_clean_13.0
- ...
