# Import required libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import joblib

# Load the dataset

In [2]:
dataset = pd.read_csv('Churn_Modelling.csv')


# List of columns to drop
columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']

# Drop the specified columns
dataset = dataset.drop(columns=columns_to_drop)




X = dataset.drop('Exited', axis=1)
y = dataset['Exited']

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [4]:
dataset.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,delhi,Female,42,2,0.0,1,1,1,101348.88,1
1,608,bangalore,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,delhi,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,delhi,Female,39,1,0.0,2,0,0,93826.63,0
4,850,bangalore,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,delhi,Female,42,2,0.00,1,1,1,101348.88
1,608,bangalore,Female,41,1,83807.86,1,0,1,112542.58
2,502,delhi,Female,42,8,159660.80,3,1,0,113931.57
3,699,delhi,Female,39,1,0.00,2,0,0,93826.63
4,850,bangalore,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,delhi,Male,39,5,0.00,2,1,0,96270.64
9996,516,delhi,Male,35,10,57369.61,1,1,1,101699.77
9997,709,delhi,Female,36,7,0.00,1,0,1,42085.58
9998,772,mumbai,Male,42,3,75075.31,2,1,0,92888.52


# Define preprocessing steps

In [6]:
numerical_features = ['CreditScore', 'Age','Tenure','Balance','NumOfProducts',
                     'HasCrCard','IsActiveMember','EstimatedSalary']  
categorical_features = ['Geography', 'Gender']  

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [7]:
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,delhi,Female,42,2,0.00,1,1,1,101348.88
1,608,bangalore,Female,41,1,83807.86,1,0,1,112542.58
2,502,delhi,Female,42,8,159660.80,3,1,0,113931.57
3,699,delhi,Female,39,1,0.00,2,0,0,93826.63
4,850,bangalore,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,delhi,Male,39,5,0.00,2,1,0,96270.64
9996,516,delhi,Male,35,10,57369.61,1,1,1,101699.77
9997,709,delhi,Female,36,7,0.00,1,0,1,42085.58
9998,772,mumbai,Male,42,3,75075.31,2,1,0,92888.52


# Define the classification models

In [8]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

# Evaluate models

In [9]:
results = {}
for name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    results[name] = scores
    print(f'{name}: {scores.mean():.4f} (+/- {scores.std():.4f})')


Logistic Regression: 0.8097 (+/- 0.0050)
Decision Tree: 0.7917 (+/- 0.0046)
SVC: 0.8562 (+/- 0.0062)
Random Forest: 0.8640 (+/- 0.0043)
Gradient Boosting: 0.8642 (+/- 0.0069)
XGBoost: 0.8540 (+/- 0.0040)


## Choose the best one

In [10]:
# Choose the best model
best_model_name = max(results, key=lambda k: results[k].mean())
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')


Best model: Gradient Boosting


# Train the best model on the entire training data

In [11]:
best_clf = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('classifier', best_model)])
best_clf.fit(X, y)

# Evaluate the best model on the test set

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42)
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.74      0.48      0.58       393

    accuracy                           0.86      2000
   macro avg       0.81      0.72      0.75      2000
weighted avg       0.85      0.86      0.85      2000



# Save the best model using joblib

In [14]:
joblib.dump(best_clf, 'model.joblib')

['model.joblib']

In [16]:
import joblib
import pandas as pd

# Load the saved model from the joblib file
best_clf = joblib.load('model.joblib')

# Example test data, replace with your actual test data
test_data = {
    'CreditScore': [619, 608, 502],
    'Geography': ['delhi', 'banglore', 'mumbai'],
    'Gender': ['Female', 'Male', 'Female'],
    'Age': [42, 44, 43],
    'Tenure':[10,8,7],
    'Balance':[83807.86,159660.80,0.00],
    'NumOfProducts':[2,3,5],
    'HasCrCard':[0,1,1],
    'IsActiveMember':[1,1,0],
    'EstimatedSalary':[101348.88,112542.58,113931.57]
    
}
X_test = pd.DataFrame(test_data)

# Make predictions
y_pred = best_clf.predict(X_test)
print("Predictions:", y_pred)


Predictions: [0 1 1]


In [15]:
dataset

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,delhi,Female,42,2,0.00,1,1,1,101348.88,1
1,608,bangalore,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,delhi,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,delhi,Female,39,1,0.00,2,0,0,93826.63,0
4,850,bangalore,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,delhi,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,delhi,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,delhi,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,mumbai,Male,42,3,75075.31,2,1,0,92888.52,1
