In [1]:
import os
import mlflow
import joblib
from sklearn.metrics import roc_auc_score, classification_report, recall_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Set up MLflow.
# Point the client at the tracking server BEFORE selecting the experiment:
# set_experiment() looks up (or creates) the experiment in whichever tracking
# store is configured at the moment it is called.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Telco Churn Multi-Model")


# Utility functions

In [2]:
def preprocess_data(file_path, output_dir = 'preprocessors'):
    """
    Load the raw churn CSV and turn it into model-ready train/val/test sets.

    Steps: coerce ``TotalCharges`` to numeric (filling gaps with
    ``MonthlyCharges * tenure``), make a stratified 60/20/20 train/val/test
    split, label-encode the ``Churn`` target, one-hot encode the categorical
    columns and standardise the numeric ones. The encoders and the scaler are
    fitted on the training split only.

    Side effects: writes the *fitted* ``ohe.joblib`` and ``scaler.joblib``
    into ``output_dir`` and a short summary to ``data_info.txt`` in the
    current working directory.

    parameters:
    'file_path' : path of the original CSV data file
    'output_dir' : target directory for the preprocessing artifacts
                   (created if it does not exist)

    returns:
    tuple of (X_train, X_val, X_test, y_train_en, y_val_en, y_test_en,
    categorical_variables, numeric_variables, ohe, scaler, le, scale_pos_weight)
    """

    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(file_path)

    # Non-numeric TotalCharges entries become NaN, then are filled with
    # MonthlyCharges * tenure
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure'])

    categorical_variables = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                             'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                             'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                             'Contract', 'PaperlessBilling', 'PaymentMethod']
    numeric_variables = ['tenure', 'MonthlyCharges', 'TotalCharges']

    variables = df[categorical_variables + numeric_variables]
    target = df['Churn']

    # Train-val-test split: 20% test, then 25% of the remainder as validation
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        variables, target, test_size=0.2, random_state=42, stratify=target
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
    )

    # Label encode target
    le = LabelEncoder()
    y_train_en = le.fit_transform(y_train)
    y_val_en = le.transform(y_val)
    y_test_en = le.transform(y_test)

    # One-hot encode categoricals (fit on the training split only)
    ohe = OneHotEncoder(handle_unknown='ignore', drop='first')
    ohe.fit(X_train[categorical_variables])

    # Persist the encoder only AFTER fitting, so the saved artifact can be
    # loaded and used to transform new data
    joblib.dump(ohe, os.path.join(output_dir, 'ohe.joblib'))
    print(f'OneHotEncoder saved to {output_dir}/ohe.joblib')

    ohe_columns = ohe.get_feature_names_out()

    def _one_hot(frame):
        # Dense DataFrame of the encoded categoricals, aligned to frame's index
        return pd.DataFrame(
            ohe.transform(frame[categorical_variables]).toarray(),
            columns=ohe_columns, index=frame.index
        )

    # Combine numeric and encoded
    X_train = pd.concat([X_train[numeric_variables], _one_hot(X_train)], axis=1)
    X_val = pd.concat([X_val[numeric_variables], _one_hot(X_val)], axis=1)
    X_test = pd.concat([X_test[numeric_variables], _one_hot(X_test)], axis=1)

    # Scale numerics (statistics come from the training split only)
    scaler = StandardScaler()
    X_train[numeric_variables] = scaler.fit_transform(X_train[numeric_variables])

    # save the fitted scaler
    joblib.dump(scaler, os.path.join(output_dir, 'scaler.joblib'))
    print(f'StandardScaler saved to {output_dir}/scaler.joblib')

    # Transform val and test dataset
    X_val[numeric_variables] = scaler.transform(X_val[numeric_variables])
    X_test[numeric_variables] = scaler.transform(X_test[numeric_variables])

    # Imbalance handling: scale_pos_weight = (#negative) / (#positive) in train
    class_counts = np.bincount(y_train_en)
    scale_pos_weight = class_counts[0] / class_counts[1]

    # Write a small data summary to data_info.txt in the working directory
    # (this function does not itself send the file to MLflow)
    with open("data_info.txt", "w") as f:
        f.write(f"Dataset shape: {df.shape}\n")
        f.write(f"Class balance: {class_counts}\n")
        f.write(f"Scale pos weight: {scale_pos_weight}\n")

    return (X_train, X_val, X_test, y_train_en, y_val_en, y_test_en,
            categorical_variables, numeric_variables, ohe, scaler, le, scale_pos_weight)



In [3]:
def train_and_log_model(model_class, model_name, param_grid, X_train, y_train_en, X_val, y_val_en, X_test, y_test_en, 
                        num_vars, scale_pos_weight, ohe, scaler, le, cat_vars=None):
    """
    Grid-search one model class, evaluate the best estimator and log it to MLflow.

    Opens an MLflow run named ``model_name``, runs a 5-fold ``GridSearchCV``
    (scoring ``roc_auc``) over ``param_grid`` on the training data, then logs
    the best hyperparameters, validation/test AUC-ROC, test recall/precision
    for class 1, the classification report (as ``classification_report.csv``)
    and the fitted model itself.

    parameters:
    'model_class' : estimator class (not an instance); ``xgb.XGBClassifier``
                    gets XGBoost-specific constructor arguments
    'model_name' : run name, also used as prefix for logged hyperparameters
    'param_grid' : parameter grid passed to ``GridSearchCV``
    'X_train', 'y_train_en' : training features / encoded target
    'X_val', 'y_val_en' : validation features / encoded target
    'X_test', 'y_test_en' : test features / encoded target
    'num_vars' : list of numeric feature names (only its length is logged)
    'scale_pos_weight' : class-imbalance weight, logged and passed to XGBoost
    'ohe' : fitted OneHotEncoder; used to infer the categorical column count
            when ``cat_vars`` is not given
    'scaler' : fitted scaler (accepted for interface symmetry; not used here)
    'le' : fitted LabelEncoder whose ``classes_`` are logged
    'cat_vars' : optional list of categorical feature names; when omitted the
                 count is taken from ``ohe.n_features_in_`` instead of relying
                 on a notebook-level global

    returns:
    (best_model, test_auc)
    """
    # Resolve the categorical-column count from the arguments only, so the
    # function does not depend on a `cat_vars` variable existing in the kernel
    if cat_vars is not None:
        cat_vars_count = len(cat_vars)
    else:
        cat_vars_count = ohe.n_features_in_

    is_xgb = model_class is xgb.XGBClassifier

    with mlflow.start_run(run_name=model_name):
        # Log preprocessing params (shared across models)
        mlflow.log_param("scale_pos_weight", scale_pos_weight)
        mlflow.log_param("num_features", X_train.shape[1])
        mlflow.log_param("cat_vars_count", cat_vars_count)
        mlflow.log_param("num_vars_count", len(num_vars))

        # Initialize base model
        if is_xgb:
            base_model = model_class(objective='binary:logistic', scale_pos_weight=scale_pos_weight,
                                     random_state=42, eval_metric='aucpr')
        else:
            base_model = model_class(random_state=42)

        # Grid search
        grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train_en)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Log hyperparams
        for param, value in best_params.items():
            mlflow.log_param(f"{model_name}_{param}", value)

        # Predictions and metrics
        y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]
        y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
        y_test_pred = best_model.predict(X_test)

        val_auc = roc_auc_score(y_val_en, y_val_pred_proba)
        test_auc = roc_auc_score(y_test_en, y_test_pred_proba)

        # Compute recall and precision for positive class (churn = 1)
        test_recall = recall_score(y_test_en, y_test_pred, pos_label=1)
        test_precision = precision_score(y_test_en, y_test_pred, pos_label=1)

        mlflow.log_metric("val_auc_roc", val_auc)
        mlflow.log_metric("test_auc_roc", test_auc)
        mlflow.log_metric("test_recall", test_recall)
        mlflow.log_metric("test_precision", test_precision)

        # Log classification report as artifact
        report = classification_report(y_test_en, y_test_pred, output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        report_df.to_csv("classification_report.csv")
        mlflow.log_artifact("classification_report.csv")

        # Log model with the flavor matching the estimator
        if is_xgb:
            mlflow.xgboost.log_model(best_model, "model")
        else:
            mlflow.sklearn.log_model(best_model, "model")

        # Log model details
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("target_encoder_classes", list(le.classes_))

        print(f"{model_name} Test AUC-ROC: {test_auc:.3f}, Recall: {test_recall:.3f}, Precision: {test_precision:.3f}")
        return best_model, test_auc


# Workflow

In [4]:
# Run the preprocessing pipeline, then unpack its 12-element result by group
data = preprocess_data('../data/customer_churn_telecom_services.csv')

X_train, X_val, X_test = data[:3]                # feature frames
y_train_en, y_val_en, y_test_en = data[3:6]      # label-encoded targets
cat_vars, num_vars = data[6:8]                   # column-name lists
ohe, scaler, le, scale_pos_weight = data[8:]     # fitted transformers + class weight


OneHotEncoder saved to preprocessors/ohe.joblib
StandardScaler saved to preprocessors/scaler.joblib


In [7]:
# Three baseline models to be trained (XGBoost, RandomForest, LogisticRegression).
# Each entry is (run name, unfitted estimator, (X_train, y_train), (X_test, y_test)).
# random_state is fixed on every estimator so the baselines are reproducible,
# matching the seed used for the data splits.

models = [
    (
        "XGBoost",
        xgb.XGBClassifier(n_estimators = 100, max_depth = 3, learning_rate = 0.1, random_state = 42),
        (X_train, y_train_en),
        (X_test, y_test_en)
    ),
    (
        'randomforest', 
        RandomForestClassifier(n_estimators = 100, max_depth = 10, random_state = 42),
        (X_train, y_train_en),
        (X_test, y_test_en)
    ),
    (
        'LogisticRegression',
        LogisticRegression(C = 1, penalty = 'l1', solver = 'liblinear', random_state = 42),
        (X_train, y_train_en),
        (X_test, y_test_en)
    )
]

In [8]:
# Fit every baseline on its training pair and collect one classification
# report (as a dict) per model, in the same order as `models`
reports = []

for model_name, model, (X_train, y_train_en), (X_test, y_test_en) in models:
    model.fit(X_train, y_train_en)
    y_pred = model.predict(X_test)

    report = classification_report(y_test_en, y_pred, output_dict=True)
    reports.append(report)

In [9]:
# `report` is the variable left over from the final iteration of the training
# loop, i.e. the report for the last entry in `models` (LogisticRegression);
# the reports for all models are in `reports`
report

{'0': {'precision': 0.8472095150960659,
  'recall': 0.8946859903381642,
  'f1-score': 0.8703007518796992,
  'support': 1035.0},
 '1': {'precision': 0.6550632911392406,
  'recall': 0.553475935828877,
  'f1-score': 0.6,
  'support': 374.0},
 'accuracy': 0.8041163946061036,
 'macro avg': {'precision': 0.7511364031176533,
  'recall': 0.7240809630835205,
  'f1-score': 0.7351503759398497,
  'support': 1409.0},
 'weighted avg': {'precision': 0.7962068978073131,
  'recall': 0.8041163946061036,
  'f1-score': 0.7985530718207868,
  'support': 1409.0}}

In [12]:
# Initialize MLflow.
# The tracking URI is set first so that set_experiment() looks up / creates
# the experiment on this server rather than on whatever store was active before.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("Churn detection multi model")

2025/12/01 11:04:45 INFO mlflow.tracking.fluent: Experiment with name 'Churn detection multi model' does not exist. Creating a new experiment.


In [13]:
# One MLflow run per baseline: models and reports are paired by position
for idx, (model_name, model, *_datasets) in enumerate(models):
    report = reports[idx]

    with mlflow.start_run(run_name = model_name):
        mlflow.log_param("model", model_name)
        mlflow.log_params(model.get_params())

        # Metric name -> value pulled from the classification report dict
        metric_values = {
            'accuracy': report['accuracy'],
            'recall_class_1': report['1']['recall'],
            'precision_class_1': report['1']['precision'],
            'recall_class_0': report['0']['recall'],
            'precision_class_0': report['0']['precision'],
            'f1_score_macro': report['macro avg']['f1-score'],
        }
        for metric_name, metric_value in metric_values.items():
            mlflow.log_metric(metric_name, metric_value)

        # Pick the MLflow flavor from the run name, then store the fitted model
        log_model = mlflow.xgboost.log_model if "XGB" in model_name else mlflow.sklearn.log_model
        log_model(model, f'{model_name} model')

  self.get_booster().save_model(fname)


🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/9b2614061b524922be7f40c82b8df46a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641




🏃 View run randomforest at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/886339c1e25346209bf46181230a08ac
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641




🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/bdfbbbc70dda41a7ab8dc81667f40eec
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641


# Grid search for the best model, logged with MLflow

In [25]:
# Re-run preprocessing so the grid-search cells start from freshly built
# splits, then unpack the 12-element result by group
data = preprocess_data('../data/customer_churn_telecom_services.csv')

X_train, X_val, X_test = data[:3]                # feature frames
y_train_en, y_val_en, y_test_en = data[3:6]      # label-encoded targets
cat_vars, num_vars = data[6:8]                   # column-name lists
ohe, scaler, le, scale_pos_weight = data[8:]     # fitted transformers + class weight

In [29]:
# Hyperparameter grids searched for each model family

# XGBoost: also sweeps the positive-class weight at 1x, 1.5x and 2x the train ratio
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01],
    'min_child_weight': [1, 3],
    'scale_pos_weight': [scale_pos_weight, scale_pos_weight*1.5, scale_pos_weight*2],
}

# Random forest: both class-weighting strategies are tried
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample'],
}

# Logistic regression: liblinear solver with L1 and L2 penalties
lr_params = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'class_weight': ['balanced', None],
}


In [32]:
# Train models with GridSearch (model selection by cross-validated ROC AUC).
# For each model family: run a 5-fold grid search on the training split, then
# log the best hyperparameters, the tuned estimator's full parameter set,
# validation/test AUC-ROC, test-set report metrics and the fitted model.
models_dict = {
    "XGBoost": (xgb.XGBClassifier, xgb_params),
    "RandomForest": (RandomForestClassifier, rf_params),
    "LogisticRegression": (LogisticRegression, lr_params)
}

for model_name, (model_class, param_grid) in models_dict.items():
    with mlflow.start_run(run_name=f"{model_name}_GridSearch"):

        is_xgb = model_class is xgb.XGBClassifier

        # Initialize model
        if is_xgb:
            base_model = model_class(objective='binary:logistic', scale_pos_weight=scale_pos_weight,
                                     random_state=42, eval_metric='aucpr')
        else:
            base_model = model_class(random_state=42)

        grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='roc_auc', n_jobs=1)

        print(f"Training {model_name}...") # Add print to confirm code is running
        grid_search.fit(X_train, y_train_en)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Log params
        for param, value in best_params.items():
            mlflow.log_param(f'{model_name}_{param}', value)

        # Predictions and metrics
        y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]
        y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
        y_test_pred = best_model.predict(X_test)

        report = classification_report(y_test_en, y_test_pred, output_dict=True)

        print(report)

        mlflow.log_param("model", model_name)
        # Log the tuned estimator's parameters. This must be `best_model`: a
        # bare `model` would be whatever an earlier cell left in the kernel
        # namespace, i.e. the parameters of an unrelated estimator.
        mlflow.log_params(best_model.get_params())
        mlflow.log_metric('val_auc_roc', roc_auc_score(y_val_en, y_val_pred_proba))
        mlflow.log_metric('test_auc_roc', roc_auc_score(y_test_en, y_test_pred_proba))
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('precision_class_1', report['1']['precision'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('precision_class_0', report['0']['precision'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

        # Log model
        if is_xgb:
            mlflow.xgboost.log_model(best_model, 'model')
        else:
            mlflow.sklearn.log_model(best_model, 'model')


print("Grid search training complete.")


Training XGBoost...
{'0': {'precision': 0.938622754491018, 'recall': 0.6057971014492753, 'f1-score': 0.7363476218438051, 'support': 1035.0}, '1': {'precision': 0.4493927125506073, 'recall': 0.8903743315508021, 'f1-score': 0.5973094170403588, 'support': 374.0}, 'accuracy': 0.681334279630944, 'macro avg': {'precision': 0.6940077335208126, 'recall': 0.7480857165000387, 'f1-score': 0.666828519442082, 'support': 1409.0}, 'weighted avg': {'precision': 0.8087632543592127, 'recall': 0.681334279630944, 'f1-score': 0.6994418102068364, 'support': 1409.0}}


  self.get_booster().save_model(fname)


🏃 View run XGBoost_GridSearch at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/d83660fa9985453f9d21ccd8acaa0bb9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641
Training RandomForest...
{'0': {'precision': 0.8955732122587968, 'recall': 0.7623188405797101, 'f1-score': 0.8235908141962421, 'support': 1035.0}, '1': {'precision': 0.5340909090909091, 'recall': 0.7540106951871658, 'f1-score': 0.6252771618625277, 'support': 374.0}, 'accuracy': 0.7601135557132718, 'macro avg': {'precision': 0.7148320606748529, 'recall': 0.758164767883438, 'f1-score': 0.7244339880293849, 'support': 1409.0}, 'weighted avg': {'precision': 0.7996226222057166, 'recall': 0.7601135557132718, 'f1-score': 0.7709511364298766, 'support': 1409.0}}




🏃 View run RandomForest_GridSearch at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/b2eee67ffdb046eb9407ffd725aad78c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641
Training LogisticRegression...
{'0': {'precision': 0.844954128440367, 'recall': 0.8898550724637682, 'f1-score': 0.8668235294117647, 'support': 1035.0}, '1': {'precision': 0.6426332288401254, 'recall': 0.5481283422459893, 'f1-score': 0.5916305916305916, 'support': 374.0}, 'accuracy': 0.7991483321504613, 'macro avg': {'precision': 0.7437936786402461, 'recall': 0.7189917073548787, 'f1-score': 0.7292270605211781, 'support': 1409.0}, 'weighted avg': {'precision': 0.7912507810659949, 'recall': 0.7991483321504613, 'f1-score': 0.7937772847487706, 'support': 1409.0}}




🏃 View run LogisticRegression_GridSearch at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/a4f906f6ea424d098f9e556406451d02
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641
Grid search training complete.


# Grid search with recall as the scoring metric

In [None]:
# Train models with GridSearch (model selection by cross-validated recall).
# For each model family: run a 5-fold grid search on the training split, then
# log the best hyperparameters, the tuned estimator's full parameter set,
# validation/test AUC-ROC, test-set report metrics and the fitted model.
models_dict = {
    "XGBoost": (xgb.XGBClassifier, xgb_params),
    "RandomForest": (RandomForestClassifier, rf_params),
    "LogisticRegression": (LogisticRegression, lr_params)
}

for model_name, (model_class, param_grid) in models_dict.items():
    with mlflow.start_run(run_name=f"{model_name}_GridSearch_recall_1"):

        is_xgb = model_class is xgb.XGBClassifier

        # Initialize model
        if is_xgb:
            base_model = model_class(objective='binary:logistic', scale_pos_weight=scale_pos_weight,
                                     random_state=42, eval_metric='aucpr')
        else:
            base_model = model_class(random_state=42)

        grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='recall', n_jobs=1)

        print(f"Training {model_name}...") # Add print to confirm code is running
        grid_search.fit(X_train, y_train_en)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Log params
        for param, value in best_params.items():
            mlflow.log_param(f'{model_name}_{param}', value)

        # Predictions and metrics
        y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]
        y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
        y_test_pred = best_model.predict(X_test)

        report = classification_report(y_test_en, y_test_pred, output_dict=True)

        print(report)

        mlflow.log_param("model", model_name)
        # Log the tuned estimator's parameters. This must be `best_model`: a
        # bare `model` would be whatever an earlier cell left in the kernel
        # namespace, i.e. the parameters of an unrelated estimator.
        mlflow.log_params(best_model.get_params())
        mlflow.log_metric('val_auc_roc', roc_auc_score(y_val_en, y_val_pred_proba))
        mlflow.log_metric('test_auc_roc', roc_auc_score(y_test_en, y_test_pred_proba))
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('precision_class_1', report['1']['precision'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('precision_class_0', report['0']['precision'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

        # Log model
        if is_xgb:
            mlflow.xgboost.log_model(best_model, 'model')
        else:
            mlflow.sklearn.log_model(best_model, 'model')


print("Grid search training complete.")


Training XGBoost...
{'0': {'precision': 0.9637096774193549, 'recall': 0.4618357487922705, 'f1-score': 0.6244284781188766, 'support': 1035.0}, '1': {'precision': 0.3899233296823658, 'recall': 0.9518716577540107, 'f1-score': 0.5532245532245532, 'support': 374.0}, 'accuracy': 0.5919091554293825, 'macro avg': {'precision': 0.6768165035508603, 'recall': 0.7068537032731406, 'f1-score': 0.588826515671715, 'support': 1409.0}, 'weighted avg': {'precision': 0.811405849134306, 'recall': 0.5919091554293825, 'f1-score': 0.6055283589489143, 'support': 1409.0}}


  self.get_booster().save_model(fname)


🏃 View run XGBoost_GridSearch_recall_1 at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/3701745c7b364cedbe5a0d86df8836de
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641
Training RandomForest...
{'0': {'precision': 0.8910112359550562, 'recall': 0.7661835748792271, 'f1-score': 0.8238961038961039, 'support': 1035.0}, '1': {'precision': 0.5337186897880539, 'recall': 0.7406417112299465, 'f1-score': 0.620380739081747, 'support': 374.0}, 'accuracy': 0.759403832505323, 'macro avg': {'precision': 0.712364962871555, 'recall': 0.7534126430545868, 'f1-score': 0.7221384214889255, 'support': 1409.0}, 'weighted avg': {'precision': 0.7961727602513948, 'recall': 0.759403832505323, 'f1-score': 0.7698757018800859, 'support': 1409.0}}




🏃 View run RandomForest_GridSearch_recall_1 at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/a52a5dc0000542f38799a93d4003ea38
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641
Training LogisticRegression...
{'0': {'precision': 0.9012048192771084, 'recall': 0.7227053140096619, 'f1-score': 0.8021447721179624, 'support': 1035.0}, '1': {'precision': 0.5043177892918825, 'recall': 0.7807486631016043, 'f1-score': 0.6128016789087093, 'support': 374.0}, 'accuracy': 0.7381121362668559, 'macro avg': {'precision': 0.7027613042844955, 'recall': 0.751726988555633, 'f1-score': 0.7074732255133358, 'support': 1409.0}, 'weighted avg': {'precision': 0.7958565231703132, 'recall': 0.7381121362668559, 'f1-score': 0.7518862079871884, 'support': 1409.0}}




🏃 View run LogisticRegression_GridSearch_recall_1 at: http://127.0.0.1:5000/#/experiments/332465250823010641/runs/1a99e18c620b490ba04307fb157f6b3b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/332465250823010641
Grid search training complete.
