### Importing libraries


In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### Evaluation Metrics


In [22]:
# Evaluation
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

### Loading the Preprocessed Data

In [23]:
# Load feature and label datasets
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

# Flatten y arrays to avoid shape warnings in sklearn
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

### Training Logistic Regression

In [24]:
# Convert labels to integers if they are float
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Fit the model again
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)


LogisticRegression(max_iter=1000, random_state=42)

### Evaluating Logistic Regression

In [25]:
# Predict churn for the test set
y_pred_lr = log_model.predict(X_test)
y_proba_lr = log_model.predict_proba(X_test)[:, 1]  # Probabilities for ROC AUC

# Print performance metrics
print("🔍 Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba_lr))


🔍 Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.65      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.74      1409
weighted avg       0.80      0.80      0.80      1409

Confusion Matrix:
 [[925 110]
 [166 208]]
ROC AUC Score: 0.8416905629181844


### Training Random Forest Model

In [29]:
# Train Random Forest
rnfr_model = RandomForestClassifier(n_estimators=100, random_state=42)
rnfr_model.fit(X_train, y_train)


RandomForestClassifier(random_state=42)

### Training XG Boost Model

In [27]:
# Train XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='logloss', gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)

### Evaluation Function for All Models

In [30]:
# Evaluation function to compare all models
def evaluate_model(name, model):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    return {
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba)
    }


In [31]:
# Compare all models
results = []

for name, model in [
    ('Logistic Regression', log_model),
    ('Random Forest', rnfr_model),
    ('XGBoost', xgb_model)
]:
    results.append(evaluate_model(name, model))

# Create DataFrame to display results
results_df = pd.DataFrame(results)
results_df.sort_values(by='ROC AUC', ascending=False, inplace=True)

results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,Logistic Regression,0.804116,0.654088,0.55615,0.601156,0.841691
1,Random Forest,0.788502,0.628378,0.497326,0.555224,0.82522
2,XGBoost,0.786373,0.618893,0.508021,0.558003,0.824089


### Saving the Best Model

Saving the best model among the models, will help later when we have to use if for deployment. 

In [32]:
import joblib

# Save XGBoost as best model
joblib.dump(xgb_model, 'best_model_xgb.pkl')


['best_model_xgb.pkl']