In [14]:
import mlflow
import mlflow.xgboost
import mlflow.sklearn
import mlflow.shap  # For SHAP logging
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, classification_report
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Set up MLflow
# The tracking URI must be configured first: set_experiment() looks up (or
# creates) the experiment on whichever tracking store is active at call time,
# so calling it before set_tracking_uri() would target the default store
# instead of the server below.
mlflow.set_tracking_uri("http://127.0.0.1:5001")
mlflow.set_experiment("Telco Churn Multi-Model")


2025/10/21 15:56:33 INFO mlflow.tracking.fluent: Experiment with name 'Telco Churn Multi-Model' does not exist. Creating a new experiment.


# Utility functions

In [15]:
def preprocess_data(file_path):
    """
    Read the churn CSV at ``file_path`` and build model-ready data splits.

    Steps, in order:
      1. Coerce ``TotalCharges`` to numeric and fill the resulting gaps with
         ``MonthlyCharges * tenure``.
      2. Split stratified on ``Churn``: 20% test, then 25% of the remainder as
         validation (i.e. 60/20/20 overall), both with ``random_state=42``.
      3. Label-encode the target, one-hot encode the categorical columns and
         standardise the numeric columns -- every transformer is fitted on the
         training split only.
      4. Write a short summary to ``data_info.txt`` in the working directory.

    Returns a 12-tuple:
        (X_train, X_val, X_test, y_train_en, y_val_en, y_test_en,
         categorical_variables, numeric_variables, ohe, scaler, le,
         scale_pos_weight)
    """
    categorical_variables = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                             'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                             'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                             'Contract', 'PaperlessBilling', 'PaymentMethod']
    numeric_variables = ['tenure', 'MonthlyCharges', 'TotalCharges']

    df = pd.read_csv(file_path)

    # Non-numeric TotalCharges entries become NaN, then are replaced by
    # MonthlyCharges * tenure.
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'] = total_charges.fillna(df['MonthlyCharges'] * df['tenure'])

    features = df[categorical_variables + numeric_variables]
    labels = df['Churn']

    # Hold out the test set first, then carve validation out of what is left.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=42, stratify=y_rest
    )

    # Target -> integer codes (classes learned from the training labels).
    le = LabelEncoder()
    y_train_en = le.fit_transform(y_train)
    y_val_en, y_test_en = le.transform(y_val), le.transform(y_test)

    # Categorical columns -> dense dummy columns (encoder fitted on train).
    ohe = OneHotEncoder(handle_unknown='ignore', drop='first')
    train_dummies = ohe.fit_transform(X_train[categorical_variables]).toarray()
    val_dummies = ohe.transform(X_val[categorical_variables]).toarray()
    test_dummies = ohe.transform(X_test[categorical_variables]).toarray()
    dummy_names = ohe.get_feature_names_out()

    def assemble(raw_part, dummies):
        # Numeric columns first, then the dummy columns, aligned on the row index.
        dummy_frame = pd.DataFrame(dummies, columns=dummy_names, index=raw_part.index)
        return pd.concat([raw_part[numeric_variables], dummy_frame], axis=1)

    X_train = assemble(X_train, train_dummies)
    X_val = assemble(X_val, val_dummies)
    X_test = assemble(X_test, test_dummies)

    # Standardise the numeric columns using training-split statistics.
    scaler = StandardScaler()
    X_train[numeric_variables] = scaler.fit_transform(X_train[numeric_variables])
    for split in (X_val, X_test):
        split[numeric_variables] = scaler.transform(split[numeric_variables])

    # Ratio of class-0 to class-1 rows in the training labels.
    negatives = sum(y_train_en == 0)
    positives = sum(y_train_en == 1)
    scale_pos_weight = negatives / positives

    # Summary file written to the working directory; this function itself does
    # not send it to MLflow.
    summary_lines = [
        f"Dataset shape: {df.shape}\n",
        f"Class balance: {np.bincount(y_train_en)}\n",
        f"Scale pos weight: {scale_pos_weight}\n",
    ]
    with open("data_info.txt", "w") as f:
        f.writelines(summary_lines)

    return (X_train, X_val, X_test, y_train_en, y_val_en, y_test_en,
            categorical_variables, numeric_variables, ohe, scaler, le, scale_pos_weight)



In [16]:
def train_and_log_model(model_class, model_name, param_grid, X_train, y_train_en, X_val, y_val_en, X_test, y_test_en, 
                        num_vars, scale_pos_weight, ohe, scaler, le, cat_vars=None):
    """
    Grid-search one model family, evaluate it, and record everything in an MLflow run.

    Parameters
    ----------
    model_class : estimator class to instantiate (e.g. ``xgb.XGBClassifier``).
    model_name : run name; also used as a prefix for the logged hyperparameters.
    param_grid : dict passed to ``GridSearchCV``.
    X_train, y_train_en : training features / encoded labels used for the search.
    X_val, y_val_en : validation features / labels (AUC-ROC is logged).
    X_test, y_test_en : test features / labels (AUC-ROC and classification report).
    num_vars : list of numeric column names (only its length is logged).
    scale_pos_weight : class-imbalance weight; logged for every model, but only
        passed to the estimator when ``model_class`` is ``xgb.XGBClassifier``.
    ohe : fitted one-hot encoder; used to derive the categorical column count
        when ``cat_vars`` is not given.
    scaler : not used by this function; kept for signature compatibility.
    le : fitted label encoder; its ``classes_`` are logged.
    cat_vars : optional list of categorical column names. When omitted, the
        count is taken from ``ohe.n_features_in_`` so the function does not
        depend on a notebook-level global.

    Returns
    -------
    (best_model, test_auc) : the refitted best estimator from the grid search
        and its AUC-ROC on the test split.
    """
    # Resolve this before opening the run so a bad input does not leave a
    # half-logged run behind. Previously the body read a global `cat_vars`,
    # which raised NameError unless the workflow cell had already been executed.
    if cat_vars is not None:
        cat_vars_count = len(cat_vars)
    else:
        cat_vars_count = ohe.n_features_in_

    is_xgboost = model_class == xgb.XGBClassifier

    with mlflow.start_run(run_name=model_name):
        # Log preprocessing params (shared across models)
        mlflow.log_param("scale_pos_weight", scale_pos_weight)
        mlflow.log_param("num_features", X_train.shape[1])
        mlflow.log_param("cat_vars_count", cat_vars_count)
        mlflow.log_param("num_vars_count", len(num_vars))
        
        # Initialize base model
        if is_xgboost:
            base_model = model_class(objective='binary:logistic', scale_pos_weight=scale_pos_weight, 
                                     random_state=42, eval_metric='aucpr')
        else:
            base_model = model_class(random_state=42)
        
        # Grid search: 5-fold CV on the training split, selecting by ROC AUC
        grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train_en)
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        
        # Log the winning hyperparameters, prefixed with the model name
        for param, value in best_params.items():
            mlflow.log_param(f"{model_name}_{param}", value)
        
        # Predictions and metrics (column 1 of predict_proba is the class-1 probability)
        y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]
        y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
        y_test_pred = best_model.predict(X_test)
        
        val_auc = roc_auc_score(y_val_en, y_val_pred_proba)
        test_auc = roc_auc_score(y_test_en, y_test_pred_proba)
        
        mlflow.log_metric("val_auc_roc", val_auc)
        mlflow.log_metric("test_auc_roc", test_auc)
        
        # Log classification report as artifact (written to the working
        # directory first, then uploaded)
        report = classification_report(y_test_en, y_test_pred, output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        report_df.to_csv("classification_report.csv")
        mlflow.log_artifact("classification_report.csv")
        
        # Log model with the flavor matching the estimator type
        if is_xgboost:
            mlflow.xgboost.log_model(best_model, "model")
        else:
            mlflow.sklearn.log_model(best_model, "model")
        
        # Log model details
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("target_encoder_classes", list(le.classes_))
        
        print(f"{model_name} Test AUC-ROC: {test_auc:.3f}")
        return best_model, test_auc


# Workflow

In [17]:
# Run the preprocessing pipeline once; `data` keeps the full 12-element result.
data = preprocess_data('../data/customer_churn_telecom_services.csv')

# Unpack the result group by group into the names used by the training cell.
X_train, X_val, X_test = data[0:3]
y_train_en, y_val_en, y_test_en = data[3:6]
cat_vars, num_vars = data[6:8]
ohe, scaler, le, scale_pos_weight = data[8:]


In [18]:
# Hyperparameter grids searched for each model family
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01],
}
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}
lr_params = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# Run name -> (estimator class, hyperparameter grid)
models_dict = dict(
    XGBoost=(xgb.XGBClassifier, xgb_params),
    RandomForest=(RandomForestClassifier, rf_params),
    LogisticRegression=(LogisticRegression, lr_params),
)

# Train every registered model and collect its test AUC-ROC by name
results = {}
for model_name in models_dict:
    model_class, param_grid = models_dict[model_name]
    model, auc = train_and_log_model(
        model_class, model_name, param_grid,
        X_train, y_train_en,
        X_val, y_val_en,
        X_test, y_test_en,
        num_vars, scale_pos_weight,
        ohe, scaler, le,
    )
    results[model_name] = auc

print("Training complete. Results:", results)


  self.get_booster().save_model(fname)


XGBoost Test AUC-ROC: 0.839
🏃 View run XGBoost at: http://127.0.0.1:5001/#/experiments/387869399784757540/runs/c203d34ec0cf4acc871184228ad0b414
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/387869399784757540




RandomForest Test AUC-ROC: 0.840
🏃 View run RandomForest at: http://127.0.0.1:5001/#/experiments/387869399784757540/runs/55f63e5f75174e68a77fad42663d5637
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/387869399784757540




LogisticRegression Test AUC-ROC: 0.843
🏃 View run LogisticRegression at: http://127.0.0.1:5001/#/experiments/387869399784757540/runs/4d52b09cb8284993a449f887728cad5a
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/387869399784757540
Training complete. Results: {'XGBoost': 0.8385414244749283, 'RandomForest': 0.8396832777906946, 'LogisticRegression': 0.8426438296003514}
