# Import libraries

In [2]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the dataset

In [5]:
# Read the churn dataset from a CSV located one directory above the working directory.
df = pd.read_csv('../Churn_Modelling.csv')

In [7]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Data Preprocessing
Drop columns that are not predictive

In [11]:
# Remove the identifier-like columns that are treated as non-predictive.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

# Encode categorical variables: Geography and Gender

In [14]:
# One-hot encode the categorical columns. drop_first=True emits k-1 indicator
# columns for a k-level feature by omitting the first level.
categorical_cols = ['Geography', 'Gender']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split dataset into features and target variable

In [17]:
# 'Exited' is the label; every remaining column is used as a feature.
target_col = 'Exited'
y = df[target_col]
X = df.drop(columns=[target_col])

# Split into training and testing sets

In [20]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature scaling (especially important for SVM and KNN)

In [23]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test split never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Define a dictionary of models to compare

In [26]:
# Candidate classifiers keyed by the display name used in the results table.
# Estimators that accept a seed all share the same one for repeatable runs.
seed = 42
models = {
    'Logistic Regression': LogisticRegression(random_state=seed, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=seed),
    'Random Forest': RandomForestClassifier(random_state=seed),
    'Gradient Boosting': GradientBoostingClassifier(random_state=seed),
    'SVM': SVC(random_state=seed),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
}


# Function to evaluate models

In [29]:
def evaluate_model(model, X_tr, X_te, y_tr, y_te):
    """Fit ``model`` on the training data and score its test predictions.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_tr, y_tr : training features and labels handed to ``fit``.
    X_te, y_te : test features, and the true labels the predictions are scored against.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed from the test predictions.
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    # Same metric order as the returned tuple: accuracy, precision, recall, F1.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_te, y_pred) for scorer in scorers)

# Evaluate each model and store the results

In [32]:
# Fit and score every candidate model. All of them — not only the
# scale-sensitive SVM and KNN — are trained and tested on the standardized
# features, so no per-model branching is needed.
# NOTE(review): these scores come from the held-out test split, and the same
# scores are later used to pick the best model, so the winner's reported
# metrics are optimistic; consider cross-validating on the training split.
results = {}
for name, model in models.items():
    acc, prec, rec, f1 = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    results[name] = {'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1 Score': f1}

# Display the results

In [35]:
# Pivot the nested results dict so each model is a row and each metric a column,
# then print the table under a heading.
results_df = pd.DataFrame(results).transpose()
print("Model Performance Comparison:", results_df, sep="\n")

Model Performance Comparison:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression    0.8110   0.552448  0.201018  0.294776
Decision Tree          0.7810   0.448980  0.503817  0.474820
Random Forest          0.8665   0.762500  0.465649  0.578199
Gradient Boosting      0.8675   0.750000  0.488550  0.591680
SVM                    0.8560   0.769231  0.381679  0.510204
KNN                    0.8300   0.610879  0.371501  0.462025
Naive Bayes            0.8335   0.635135  0.358779  0.458537


# Choose the best model based on F1 Score (you can choose another metric if desired)

In [40]:
# Select the model with the highest F1 score in the comparison table
# (idxmax returns the first label when several rows tie).
f1_scores = results_df['F1 Score']
best_model_name = f1_scores.idxmax()
print(f"\nBest Model: {best_model_name}")



Best Model: Gradient Boosting


# Retrain the best model on the entire training set (using scaled features)

In [43]:
# Look up the winning estimator and fit it on the scaled training split.
# This is the same estimator instance that evaluate_model() already fitted on
# this data during the comparison loop, so this call refits it on identical input.
best_model = models[best_model_name]
best_model.fit(X_train_scaled, y_train)

# Save the scaler and the best model together in a single pickle file (so both can be reloaded at prediction time)

In [46]:
# Persist the fitted model together with the scaler it was trained with, so
# prediction code can apply the identical preprocessing before calling predict.
model_path = Path('../model/best_model.pkl')
# Create the target directory on first run; open() alone would raise
# FileNotFoundError if '../model' does not exist yet.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump({'model': best_model, 'scaler': scaler}, f)

print("Best model saved successfully!")

Best model saved successfully!
