### Model Training

#### Importing relevant libraries

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import (
    accuracy_score, classification_report, recall_score, confusion_matrix,
    roc_auc_score, precision_score, f1_score, roc_curve, auc
)
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('Dataset/Telco Churn Data.csv')
data = df.copy()

In [3]:
# Convert TotalCharges to numeric, filling NaN values
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['tenure'] * df['MonthlyCharges'], inplace=True)

In [4]:
# Check for missing values in the columns
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
df =  df.drop(columns=['customerID'])


In [6]:
# Convert SeniorCitizen to object
df['SeniorCitizen'] = df['SeniorCitizen'].astype(object)

In [7]:
# Replace 'No phone service' and 'No internet service' with 'No' for certain columns
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')
columns_to_replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
for column in columns_to_replace:
    df[column] = df[column].replace('No internet service', 'No')


In [8]:
# Convert 'Churn' categorical variable to numeric
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   object 
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


### Create the StratifiedShuffleSplit object

In [10]:
X = df.drop(columns=['Churn'])
y = df['Churn']

In [11]:
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features)
               
    ]
)

In [12]:
X_scaled = preprocessor.fit_transform(X)

In [13]:
X_scaled.shape

(7043, 36)

In [14]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=64)

for train_index, test_index in sss.split(X_scaled, y):
    X_train_scaled, X_test_scaled = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [15]:
X_train_scaled.shape

(5634, 36)

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

def evaluate_model(y_true, y_pred):
    """
    Evaluate the performance of a model on churn prediction.
    
    Parameters:
    y_true (array): True labels
    y_pred (array): Predicted labels
    
    Returns:
    dict: Evaluation metrics
    """
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'Classification Report': classification_report(y_true, y_pred),
        'Confusion Matrix': confusion_matrix(y_true, y_pred)
    }
    return metrics

In [17]:
#Define different models
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
models = {
    "Logistic Regression": LogisticRegression(),
    "CatBoost": CatBoostClassifier(),
    "XGBoost": xgb.XGBClassifier()
}

In [18]:
for name, model in models.items():
    if name == "CatBoost":
        model.fit(X_train_scaled, y_train, verbose=0)
    else:
        model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    metrics = evaluate_model(y_test, y_pred)
    
    # Remove "learn" lines from metrics
    if 'Confusion Matrix' in metrics:
        metrics['Confusion Matrix'] = str(metrics['Confusion Matrix']).replace('\n', '')
    
    # Print organized output
    print(f"**{name}**")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print(f"Classification Report:\n{metrics['Classification Report']}")
    print(f"Confusion Matrix:\n{metrics['Confusion Matrix']}\n")

**Logistic Regression**
Accuracy: 0.8112
Precision: 0.6875
Recall: 0.5294
F1 Score: 0.5982
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1035
           1       0.69      0.53      0.60       374

    accuracy                           0.81      1409
   macro avg       0.77      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Confusion Matrix:
[[945  90] [176 198]]

**CatBoost**
Accuracy: 0.7828
Precision: 0.6126
Recall: 0.4947
F1 Score: 0.5473
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.61      0.49      0.55       374

    accuracy                           0.78      1409
   macro avg       0.72      0.69      0.70      1409
weighted avg       0.77      0.78      0.77      1409

Confusion Matrix:
[[918 117] [189 185]]

**XGBoost**
Accuracy: 0.7686
Precision: 0.5741


In [19]:
import pickle


In [21]:
with open('logistic_regression_model.pkl', 'wb') as file:
    pickle.dump(LogisticRegression, file)

print("Model saved successfully!")

Model saved successfully!
