# User Churn Prediction System - Modelling

In [26]:
# Importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


df = pd.read_csv("data/Telco_churn_after_DC.csv")
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

df.head()

X = df.drop("Churn", axis=1)
y = df["Churn"]

cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()


# preprocessing pipelines for both numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[

        # Numerical pipeline
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),

        # Categorical pipeline
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols)

    ]
)


#train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)


#model defining and evaluation
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True)
}

results = {}

print("\n MODEL PERFORMANCE \n")

for name, model in models.items():

    clf = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    acc = accuracy_score(y_test, preds)
    results[name] = acc

    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))
    print("-"*60)



print("\nFINAL ACCURACY COMPARISON\n")
for name, score in results.items():
    print(f"{name}: {score:.4f}")



 MODEL PERFORMANCE 

Model: Logistic Regression
Accuracy: 0.8219
              precision    recall  f1-score   support

          No       0.86      0.90      0.88      1036
         Yes       0.69      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409

------------------------------------------------------------
Model: Random Forest
Accuracy: 0.7935
              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1036
         Yes       0.66      0.45      0.53       373

    accuracy                           0.79      1409
   macro avg       0.74      0.68      0.70      1409
weighted avg       0.78      0.79      0.78      1409

------------------------------------------------------------
Model: Gradient Boosting
Accuracy: 0.8155
              precision    recall  f1-score   support

          No       0.84