# Customer Churn Prediction
This notebook covers the task of predicting customer churn for a subscription-based service.
We will use the Telco Customer Churn dataset.
The notebook includes dataset download, preprocessing, model training, and evaluation.

In [None]:
# Install necessary libraries
!pip install -q pandas scikit-learn numpy


In [2]:
!pip install -q kaggle
import os
import shutil

# Create .kaggle directory
kaggle_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

# Assuming kaggle.json is uploaded to the current directory
# Copy kaggle.json to .kaggle directory
kaggle_json_path = 'kaggle.json'
destination_path = os.path.join(kaggle_dir, kaggle_json_path)
if os.path.exists(kaggle_json_path):
    shutil.copy(kaggle_json_path, destination_path)
    # Set permissions (readable only by the owner)
    os.chmod(destination_path, 0o600)
    print("Kaggle API key configured successfully.")
else:
    print("kaggle.json not found. Please upload it.")


Kaggle API key configured successfully.


In [8]:
import os
import zipfile

# Step 1: Kaggle API setup (upload your kaggle.json first manually or through code)
from google.colab import files
files.upload()  # Upload kaggle.json when prompted

# Step 2: Move kaggle.json to the correct location
os.makedirs('/root/.kaggle', exist_ok=True)
os.rename('kaggle.json', '/root/.kaggle/kaggle.json')
os.chmod('/root/.kaggle/kaggle.json', 0o600)

# Step 3: Download the Kaggle dataset
!kaggle datasets download -d shantanudhakadd/bank-customer-churn-prediction

# Step 4: Unzip the dataset
with zipfile.ZipFile("bank-customer-churn-prediction.zip", 'r') as zip_ref:
    zip_ref.extractall(".")

print("Dataset downloaded and extracted.")


Saving kaggle.json to kaggle (1).json
Dataset URL: https://www.kaggle.com/datasets/shantanudhakadd/bank-customer-churn-prediction
License(s): other
Downloading bank-customer-churn-prediction.zip to /content
  0% 0.00/262k [00:00<?, ?B/s]
100% 262k/262k [00:00<00:00, 485MB/s]
Dataset downloaded and extracted.


In [13]:
# Load dataset
import pandas as pd

df = pd.read_csv('/content/Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Data Preprocessing
- Handle missing values
- Convert categorical variables using one-hot encoding
- Encode target variable


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Drop irrelevant columns ('RowNumber', 'CustomerId', 'Surname')
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

# Encode target variable 'Exited'
label_encoder = LabelEncoder()
df['Exited'] = label_encoder.fit_transform(df['Exited'])

# One-hot encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# Split data
X = df.drop('Exited', axis=1)
y = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print('Accuracy:', accuracy_score(y_test, y_pred))
print('\nClassification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.8675

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.49      0.59       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000



In [12]:
import os
print(os.listdir('/content/'))

['.config', 'bank-customer-churn-prediction.zip', 'Churn_Modelling.csv', 'kaggle (1).json', 'sample_data']


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Train Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=1000) # Increased max_iter for convergence
lr_model.fit(X_train, y_train)

# Train Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Evaluate Gradient Boosting Model
print("Gradient Boosting Model Evaluation:")
y_pred_gb = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_gb))
print('\nClassification Report:\n', classification_report(y_test, y_pred_gb))
y_prob_gb = model.predict_proba(X_test)[:, 1]
print('AUC:', roc_auc_score(y_test, y_prob_gb))
print("-" * 50)

# Evaluate Logistic Regression Model
print("Logistic Regression Model Evaluation:")
y_pred_lr = lr_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_lr))
print('\nClassification Report:\n', classification_report(y_test, y_pred_lr))
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]
print('AUC:', roc_auc_score(y_test, y_prob_lr))
print("-" * 50)

# Evaluate Random Forest Model
print("Random Forest Model Evaluation:")
y_pred_rf = rf_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print('\nClassification Report:\n', classification_report(y_test, y_pred_rf))
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]
print('AUC:', roc_auc_score(y_test, y_prob_rf))
print("-" * 50)

Gradient Boosting Model Evaluation:
Accuracy: 0.8675

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.49      0.59       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000

AUC: 0.8728962506590917
--------------------------------------------------
Logistic Regression Model Evaluation:
Accuracy: 0.8105

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.95      0.89      1607
           1       0.54      0.23      0.32       393

    accuracy                           0.81      2000
   macro avg       0.69      0.59      0.60      2000
weighted avg       0.78      0.81      0.78      2000

AUC: 0.7755604852181376
--------------------------------------------------
Random Forest Model Evaluation:
Accura

## Compare model performance

### Subtask:
Compare the evaluation results of different models to identify the best performing model for this dataset.


## Hyperparameter tuning

### Subtask:
Tune the hyperparameters of the best performing model to further improve its performance.


**Reasoning**:
Import necessary libraries, define a parameter grid for Gradient Boosting, and initialize GridSearchCV.



In [24]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200], # Reduced from [100, 200, 300]
    'learning_rate': [0.05, 0.1], # Reduced from [0.01, 0.1, 0.2]
    'max_depth': [3, 4], # Reduced from [3, 4, 5]
    'min_samples_split': [2, 5], # Reduced from [2, 5, 10]
    'min_samples_leaf': [1, 2] # Reduced from [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='roc_auc', cv=3, n_jobs=-1, verbose=0) # verbose set to 0

In [26]:
# Perform GridSearchCV to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best AUC Score:", best_score)

# Train the model with the best hyperparameters
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_tuned = best_model.predict(X_test)
y_prob_tuned = best_model.predict_proba(X_test)[:, 1]

print("\nTuned Gradient Boosting Model Evaluation:")
print('Accuracy:', accuracy_score(y_test, y_pred_tuned))
print('\nClassification Report:\n', classification_report(y_test, y_pred_tuned))
print('AUC:', roc_auc_score(y_test, y_prob_tuned))

Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best AUC Score: 0.8646947691214937

Tuned Gradient Boosting Model Evaluation:
Accuracy: 0.872

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.92      1607
           1       0.77      0.50      0.60       393

    accuracy                           0.87      2000
   macro avg       0.83      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000

AUC: 0.8695750620298281


In [27]:
import joblib
import os

# Define a directory to save models
model_dir = "trained_models"
os.makedirs(model_dir, exist_ok=True)

# Save the trained models
joblib.dump(model, os.path.join(model_dir, "gradient_boosting_model.pkl"))
joblib.dump(lr_model, os.path.join(model_dir, "logistic_regression_model.pkl"))
joblib.dump(rf_model, os.path.join(model_dir, "random_forest_model.pkl"))
joblib.dump(best_model, os.path.join(model_dir, "tuned_gradient_boosting_model.pkl"))

print("Models saved successfully in the 'trained_models' directory.")

Models saved successfully in the 'trained_models' directory.


You can then load the models in your project using `joblib.load()`:

In [28]:
import joblib
import os

# Define the directory where models are saved
model_dir = "trained_models"

# Load the trained models
loaded_gb_model = joblib.load(os.path.join(model_dir, "gradient_boosting_model.pkl"))
loaded_lr_model = joblib.load(os.path.join(model_dir, "logistic_regression_model.pkl"))
loaded_rf_model = joblib.load(os.path.join(model_dir, "random_forest_model.pkl"))
loaded_tuned_gb_model = joblib.load(os.path.join(model_dir, "tuned_gradient_boosting_model.pkl"))

print("Models loaded successfully.")

# You can now use the loaded models for predictions
# Example:
# new_data = ... # Prepare your new data in the same format as X_test
# predictions = loaded_tuned_gb_model.predict(new_data)

Models loaded successfully.


After running the saving code, you can download the `trained_models` directory from your Colab environment to your local machine.