<a href="https://colab.research.google.com/github/danai-coder/Customer-Churn-Analysis-Model/blob/ML_Comparisons/ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from  sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import warnings
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Suppress warnings
warnings.filterwarnings("ignore")

In [3]:
# Load data
# NOTE(review): the '/content/...' path presumably points at the Colab working
# directory - adjust it when running the notebook elsewhere.
df = pd.read_csv('/content/cleaned_customer_churn_dataset.csv')

# Drop rows with a missing target BEFORE selecting features/target, so x and y
# are built exactly once and always cover the same rows.
df = df.dropna(subset=['Churn'])

# Features and target
# y is kept as a single-column DataFrame because the cells below use it that way.
y = df[['Churn']]
# .copy() makes x an independent frame: the encoding cell assigns columns into
# it, which on a plain slice of df is a chained assignment.
x = df[['Age', 'Gender', 'Tenure', 'Payment Delay', 'Subscription Type', 'Contract Length']].copy()

In [4]:
# Encode categorical features as integers.
#   Gender:            'Male' -> 1, anything else -> 0
#   Subscription Type: 'Basic' -> 1, 'Premium' -> 2, anything else -> 0
#   Contract Length:   'Monthly' -> 1, 'Yearly' -> 2, anything else -> 0
# The encoded columns go into a NEW frame (x_encoded) instead of overwriting x:
# re-running this cell on an already-encoded x would otherwise map every value
# to 0, and assigning columns into x (a selection taken from df) is a chained
# assignment.
# NOTE(review): the integer codes impose an ordering on the categories; confirm
# that is intended, otherwise one-hot encoding would be the safer choice.
x_encoded = x.assign(**{
    'Gender': x['Gender'].eq('Male').astype(int),
    'Subscription Type': x['Subscription Type'].map({'Basic': 1, 'Premium': 2}).fillna(0).astype(int),
    'Contract Length': x['Contract Length'].map({'Monthly': 1, 'Yearly': 2}).fillna(0).astype(int),
})

# Train-test split: 80/20, with a fixed random_state so the split (and every
# accuracy computed from it) is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x_encoded, y, test_size=0.2, random_state=42)

In [5]:
# Standardize features.
# The scaler learns its statistics from the training split only; the same
# fitted transform is then applied to both splits so no test-set information
# leaks into the scaling.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)


In [6]:
# Dictionary to store results: model name -> accuracy on the test split.
results = {}


# Logistic Regression
log_model = LogisticRegression()
# y_train is a single-column DataFrame; scikit-learn estimators expect a 1-D
# target, so flatten it rather than relying on the implicit conversion (which
# emits a DataConversionWarning).
log_model.fit(x_train, y_train.values.ravel())
predictions = log_model.predict(x_test)
results["Logistic Regression"] = accuracy_score(y_test, predictions)

In [7]:
# Random Forest
# random_state is pinned (same seed as the train/test split) so the bootstrap
# samples and feature sub-sampling, and therefore the reported accuracy, are
# reproducible across runs.
rf = RandomForestClassifier(random_state=42)
# Flatten the single-column target to the 1-D shape scikit-learn expects.
rf.fit(x_train, y_train.values.ravel())
predictions = rf.predict(x_test)
results["Random Forest"] = accuracy_score(y_test, predictions)


In [None]:
#SVC
# SVM with GridSearchCV
#param_grid_svm = {"C": [0.1, 1, 10, 100], "kernel": ["linear", "rbf", "poly"]}
#grd_svm = GridSearchCV(SVC(), param_grid_svm, cv=5)
#grd_svm = SVC()
#grd_svm.fit(x_train, y_train)
#predictions = grd_svm.predict(x_test)
#results["SVM"] = accuracy_score(y_test, predictions)
#results["SVM (Best params: {})".format(grd_svm.best_params_)] = accuracy_score(y_test, predictions)


In [None]:
# KNN with GridSearchCV
# Define the parameter grid for KNN
#param_grid_knn = {"n_neighbors": [3, 5, 7, 9], "weights": ["uniform", "distance"]}
# Create GridSearchCV object
#grd_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5)
# Fit the model on the training data
#grd_knn.fit(x_train, y_train.values.ravel())  # .ravel() ensures y is 1D
# Make predictions on the test set
#predictions = grd_knn.predict(x_test)
# Store accuracy with a clear label in the results dictionary
#results[f"KNN (Best params: {grd_knn.best_params_})"] = accuracy_score(y_test, predictions)

In [8]:
# Decision Tree
# random_state is pinned (same seed as the train/test split) so tie-breaking
# between equally good splits is deterministic and the accuracy is reproducible.
Dt = DecisionTreeClassifier(random_state=42)
# Flatten the single-column target to the 1-D shape scikit-learn expects.
Dt.fit(x_train, y_train.values.ravel())
predictions = Dt.predict(x_test)
results["Decision Tree"] = accuracy_score(y_test, predictions)

In [9]:
# Gradient Boosting
# random_state is pinned (same seed as the train/test split) so the fitted
# ensemble and its reported accuracy are reproducible across runs.
Gb = GradientBoostingClassifier(random_state=42)
# Flatten the single-column target to the 1-D shape scikit-learn expects.
Gb.fit(x_train, y_train.values.ravel())
predictions = Gb.predict(x_test)
results["Gradient Boosting"] = accuracy_score(y_test, predictions)

In [10]:
# x_train / x_test were already standardized by the "Standardize features"
# cell, so they are reused as-is. Calling scaler.fit_transform here again would
# re-fit the shared scaler on already-scaled data, overwriting the statistics
# it learned from the raw training features.
x_train_scaled = x_train
x_test_scaled = x_test

# Fast Linear SVM
linear_svm = LinearSVC(max_iter=10000, random_state=42)
# Flatten the single-column target to the 1-D shape scikit-learn expects.
linear_svm.fit(x_train_scaled, y_train.values.ravel())
pred_svm = linear_svm.predict(x_test_scaled)
results["LinearSVM"] = accuracy_score(y_test, pred_svm)


In [11]:
# Fast KNN with KD-tree
# 5 nearest neighbours, KD-tree index for neighbour lookup; n_jobs=-1 uses all CPU cores.
knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', n_jobs=-1)
# Flatten the single-column target to the 1-D shape scikit-learn expects;
# passing the DataFrame directly triggers a DataConversionWarning.
knn.fit(x_train_scaled, y_train.values.ravel())
pred_knn = knn.predict(x_test_scaled)
results["KNN"] = accuracy_score(y_test, pred_knn)

In [12]:
# Display results: the model-name -> test-accuracy mapping filled in by the
# model cells above. `display` is not imported in this notebook; it is the
# rich-display helper that Jupyter/IPython kernels provide.
display(results)

{'Logistic Regression': 0.7485340320074404,
 'Random Forest': 0.8067530935610829,
 'Decision Tree': 0.7983712726984019,
 'Gradient Boosting': 0.8461442489820454,
 'LinearSVM': 0.7492599271836401,
 'KNN': 0.8155659146846326}

In [13]:
# Tabulate the accuracies, best model first.
# The table is bound to a new name so `results` stays a dict: rebinding it to a
# DataFrame made this cell non-repeatable and made any re-run model cell
# (`results["..."] = ...`) add a column instead of recording a score.
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
results_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy
3,Gradient Boosting,0.846144
5,KNN,0.815566
1,Random Forest,0.806753
2,Decision Tree,0.798371
4,LinearSVM,0.74926
0,Logistic Regression,0.748534
