<a href="https://colab.research.google.com/github/danai-coder/Customer-Churn-Analysis-Model/blob/ML_Comparisons/ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from  sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import warnings
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Silence every warning category for the rest of the session.
# Be aware that this is a blanket filter: warnings raised by pandas and
# scikit-learn in the cells below are hidden as well.
warnings.simplefilter("ignore")

In [3]:
# Load the cleaned churn dataset and drop every row whose target is missing:
# a row without a 'Churn' label cannot be used for training or evaluation.
# The drop is chained onto the read (instead of `inplace=True`) so this cell
# gives the same result every time it is re-run.
df = pd.read_csv('/content/cleaned_customer_churn_dataset.csv').dropna(subset=['Churn'])

# Target, kept as a single-column DataFrame.
y = df[['Churn']]

# Features. An explicit copy makes `x` an independent frame, so encoding its
# columns later never writes through a slice of `df` (which is what triggers
# pandas' chained-assignment warning).
x = df[['Age', 'Gender', 'Tenure', 'Payment Delay', 'Subscription Type', 'Contract Length']].copy()

In [4]:
# Encode categorical features.
# The encoded frame is rebuilt from the untouched columns of `df` on every
# run. Overwriting `x` with a mapping applied to `x` itself is not idempotent:
# on a second execution the already-encoded integers match none of the labels
# and every encoded column collapses to 0.
# Any value not listed in a mapping (including a missing value) becomes 0.
subscription_codes = {'Basic': 1, 'Premium': 2}
contract_codes = {'Monthly': 1, 'Yearly': 2}

features_raw = df[x.columns]
x = features_raw.assign(**{
    'Gender': features_raw['Gender'].eq('Male').astype('int64'),
    'Subscription Type': features_raw['Subscription Type'].map(subscription_codes).fillna(0).astype('int64'),
    'Contract Length': features_raw['Contract Length'].map(contract_codes).fillna(0).astype('int64'),
})

# Train-test split: 80% train / 20% test, seeded so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Unseeded alternative (gives a different split on every run):
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [5]:
# Standardize features.
# The scaler learns its statistics from the training split only; the same
# fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)


In [16]:
# One result table per metric, each keyed by model name
results_accuracy, results_precision, results_recall, results_f1 = {}, {}, {}, {}

# Logistic Regression: fit on the training split, score on the test split
log_model = LogisticRegression().fit(x_train, y_train)
predictions = log_model.predict(x_test)
results_accuracy.update({"Logistic Regression": accuracy_score(y_test, predictions)})
results_precision.update({"Logistic Regression": precision_score(y_test, predictions)})
results_recall.update({"Logistic Regression": recall_score(y_test, predictions)})
results_f1.update({"Logistic Regression": f1_score(y_test, predictions)})

In [17]:
# Random Forest
# A fixed seed makes the forest's random choices repeatable, so the metrics
# stored below are the same on every run. 42 matches the seed already used for
# the train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
predictions = rf.predict(x_test)
results_accuracy["Random Forest"] = accuracy_score(y_test, predictions)
results_precision["Random Forest"] = precision_score(y_test, predictions)
results_recall["Random Forest"] = recall_score(y_test, predictions)
results_f1["Random Forest"] = f1_score(y_test, predictions)


In [8]:
#SVC
# SVM with GridSearchCV
# NOTE(review): this whole cell is commented out, so it contributes nothing to
# the reported results (presumably disabled because of run time -- confirm).
# It cannot be re-enabled as written:
#   - `results` is not defined in the visible cells (metrics are stored in
#     `results_accuracy` and friends), so the last two lines would raise
#     NameError;
#   - `grd_svm = SVC()` replaces the GridSearchCV object, and a plain SVC has
#     no `best_params_` attribute for the final line to read.
#param_grid_svm = {"C": [0.1, 1, 10, 100], "kernel": ["linear", "rbf", "poly"]}
#grd_svm = GridSearchCV(SVC(), param_grid_svm, cv=5)
#grd_svm = SVC()
#grd_svm.fit(x_train, y_train)
#predictions = grd_svm.predict(x_test)
#results["SVM"] = accuracy_score(y_test, predictions)
#results["SVM (Best params: {})".format(grd_svm.best_params_)] = accuracy_score(y_test, predictions)


In [9]:
# KNN with GridSearchCV
# NOTE(review): this whole cell is commented out, so it contributes nothing to
# the reported results (presumably disabled because of run time -- confirm).
# Before re-enabling it, note that `results` is not defined in the visible
# cells (metrics are stored in `results_accuracy` and friends), so the last
# line would raise NameError, and that only accuracy is recorded here.
# Define the parameter grid for KNN
#param_grid_knn = {"n_neighbors": [3, 5, 7, 9], "weights": ["uniform", "distance"]}
# Create GridSearchCV object
#grd_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5)
# Fit the model on the training data
#grd_knn.fit(x_train, y_train.values.ravel())  # .ravel() ensures y is 1D
# Make predictions on the test set
#predictions = grd_knn.predict(x_test)
# Store accuracy with a clear label in the results dictionary
#results[f"KNN (Best params: {grd_knn.best_params_})"] = accuracy_score(y_test, predictions)

In [18]:
# Decision Tree
# A fixed seed makes the tree's random tie-breaking between equally good
# splits repeatable, so the metrics stored below are the same on every run.
# 42 matches the seed already used for the train/test split.
Dt = DecisionTreeClassifier(random_state=42)
Dt.fit(x_train, y_train)
predictions = Dt.predict(x_test)
results_accuracy["Decision Tree"] = accuracy_score(y_test, predictions)
results_precision["Decision Tree"] = precision_score(y_test, predictions)
results_recall["Decision Tree"] = recall_score(y_test, predictions)
results_f1["Decision Tree"] = f1_score(y_test, predictions)

In [19]:
# Gradient Boosting
# A fixed seed is passed down to the underlying tree estimators so the fitted
# ensemble, and therefore the metrics stored below, are the same on every run.
# 42 matches the seed already used for the train/test split.
Gb = GradientBoostingClassifier(random_state=42)
Gb.fit(x_train, y_train)
predictions = Gb.predict(x_test)
results_accuracy["Gradient Boosting"] = accuracy_score(y_test, predictions)
results_precision["Gradient Boosting"] = precision_score(y_test, predictions)
results_recall["Gradient Boosting"] = recall_score(y_test, predictions)
results_f1["Gradient Boosting"] = f1_score(y_test, predictions)


In [20]:
# `x_train` / `x_test` were already standardized right after the train/test
# split, so they are reused as-is under the `*_scaled` names. Calling
# `scaler.fit_transform` on them again would leave the values numerically
# unchanged, but it would re-fit the shared `scaler` on already-scaled data
# and discard the statistics learned from the raw training features.
x_train_scaled = x_train
x_test_scaled = x_test

# 2. Fast Linear SVM
# A linear-kernel SVM; max_iter is raised to give the solver room to converge
# and the seed is fixed for repeatable results.
linear_svm = LinearSVC(max_iter=10000, random_state=42)
linear_svm.fit(x_train_scaled, y_train)
pred_svm = linear_svm.predict(x_test_scaled)
results_accuracy["LinearSVM"] = accuracy_score(y_test, pred_svm)
results_precision["LinearSVM"] = precision_score(y_test, pred_svm)
results_recall["LinearSVM"] = recall_score(y_test, pred_svm)
results_f1["LinearSVM"] = f1_score(y_test, pred_svm)


In [21]:
# 3. Fast KNN with KD-tree
# 5 nearest neighbours, KD-tree index for the neighbour search, and all CPU
# cores (n_jobs=-1).
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='kd_tree',
    n_jobs=-1,
)
knn.fit(x_train_scaled, y_train)
pred_knn = knn.predict(x_test_scaled)

# Record the four test-set metrics under the "KNN" key
results_accuracy["KNN"] = accuracy_score(y_true=y_test, y_pred=pred_knn)
results_precision["KNN"] = precision_score(y_true=y_test, y_pred=pred_knn)
results_recall["KNN"] = recall_score(y_true=y_test, y_pred=pred_knn)
results_f1["KNN"] = f1_score(y_true=y_test, y_pred=pred_knn)

In [22]:
# Show the four metric tables in order: accuracy, precision, recall, F1
display(results_accuracy, results_precision, results_recall, results_f1)

{'Logistic Regression': 0.7485340320074404,
 'Random Forest': 0.8065035670942643,
 'Decision Tree': 0.7986321412773487,
 'Gradient Boosting': 0.8461442489820454,
 'LinearSVM': 0.7492599271836401,
 'KNN': 0.8155659146846326}

{'Logistic Regression': 0.8013203598627862,
 'Random Forest': 0.8575880876114622,
 'Decision Tree': 0.8413167334880777,
 'Gradient Boosting': 0.9781465099845587,
 'LinearSVM': 0.8042027946193798,
 'KNN': 0.8891162876261871}

{'Logistic Regression': 0.7412980999520996,
 'Random Forest': 0.7908350630688169,
 'Decision Tree': 0.7957448507105221,
 'Gradient Boosting': 0.7459284687849274,
 'LinearSVM': 0.7386037042950663,
 'KNN': 0.7716948746607057}

{'Logistic Regression': 0.770141516769478,
 'Random Forest': 0.822859990862649,
 'Decision Tree': 0.8178964859375961,
 'Gradient Boosting': 0.8463986049618969,
 'LinearSVM': 0.7700086349496988,
 'KNN': 0.8262546612387943}

The table below compares the models side by side on accuracy, precision, recall and F1 score, all measured on the held-out 20% test split.

In [33]:
# Combine the per-metric dictionaries into one frame (rows = models,
# columns = metrics), then show it transposed and rounded to two decimals.
metric_tables = {
    "Accuracy": results_accuracy,
    "Precision": results_precision,
    "Recall": results_recall,
    "F1 Score": results_f1,
}
results_df = pd.DataFrame(metric_tables)
results_df.transpose().round(2)


Unnamed: 0,Logistic Regression,Random Forest,Decision Tree,Gradient Boosting,LinearSVM,KNN
Accuracy,0.75,0.81,0.8,0.85,0.75,0.82
Precision,0.8,0.86,0.84,0.98,0.8,0.89
Recall,0.74,0.79,0.8,0.75,0.74,0.77
F1 Score,0.77,0.82,0.82,0.85,0.77,0.83
