This notebook aims to create the best model to predict credit card defaults

In [108]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, f1_score,classification_report
from scipy.stats import randint,uniform
import pickle as pkl

In [88]:
# load the cleaned data
df=pd.read_csv('Clean_df.csv')
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,0.02381,2,2,1,24,2,2,0,0,0,...,0.581358,0.412409,0.751811,0.0,0.066243,0.0,0.0,0.0,0.0,1
1,0.261905,2,2,2,26,0,2,0,0,0,...,0.592548,0.429927,0.75903,0.0,0.096144,0.1,0.104487,0.0,0.208333,1
2,0.190476,2,2,2,34,0,0,0,0,0,...,0.630367,0.488203,0.786233,0.147379,0.144216,0.1,0.104487,0.105263,0.520833,0
3,0.095238,2,2,1,37,0,0,0,0,0,...,0.678185,0.559247,0.817222,0.194175,0.194114,0.12,0.114935,0.112526,0.104167,0
4,0.095238,1,2,1,57,0,0,0,0,0,...,0.652968,0.50949,0.794163,0.194175,1.0,1.0,0.94038,0.072526,0.070729,0


In [89]:
X=df.drop(['default.payment.next.month'],axis=1)
y=df['default.payment.next.month']

In [90]:
# Considering only important features
X=X[['PAY_0', 'PAY_2', 'PAY_4', 'PAY_3', 'PAY_6']]

In [91]:
X

Unnamed: 0,PAY_0,PAY_2,PAY_4,PAY_3,PAY_6
0,2,2,0,0,0
1,0,2,0,0,2
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
...,...,...,...,...,...
29995,0,0,0,0,0
29996,0,0,0,0,0
29997,4,3,0,2,0
29998,1,0,0,0,0


In [92]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [93]:
# Define scoring
scoring = {
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

- We are using recall and f1 score because our dataset is imbalanced and we want to reduce False Negatives 
- Our model should not predict that a customer will not default but actually he defaults

# Hyper Parameter Tuning

## Logistic Regression

In [94]:
# Logistic Regression
log_reg = LogisticRegression(solver='liblinear', class_weight='balanced')
log_reg_params = {
    'C':uniform(0.01,10),
    'penalty': ['l1', 'l2']
}

In [95]:
log_reg_search = RandomizedSearchCV(log_reg, log_reg_params, n_iter=50, scoring=scoring, refit='recall', cv=StratifiedKFold(5), random_state=42,verbose=5,n_jobs=-1)
log_reg_search.fit(X_train, y_train)
print("Best parameters for Logistic Regression:", log_reg_search.best_params_)
print("Best recall score for Logistic Regression:", log_reg_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END C=3.7554011884736247, penalty=l1; f1: (test=0.528) recall: (test=0.571) total time=   0.0s
[CV 2/5] END C=3.7554011884736247, penalty=l1; f1: (test=0.536) recall: (test=0.564) total time=   0.0s
[CV 4/5] END C=3.7554011884736247, penalty=l1; f1: (test=0.506) recall: (test=0.533) total time=   0.0s
[CV 3/5] END C=3.7554011884736247, penalty=l1; f1: (test=0.532) recall: (test=0.565) total time=   0.0s
[CV 5/5] END C=3.7554011884736247, penalty=l1; f1: (test=0.516) recall: (test=0.567) total time=   0.0s
[CV 1/5] END C=1.844347898661638, penalty=l2; f1: (test=0.528) recall: (test=0.571) total time=   0.0s
[CV 2/5] END C=1.844347898661638, penalty=l2; f1: (test=0.536) recall: (test=0.564) total time=   0.0s
[CV 3/5] END C=1.844347898661638, penalty=l2; f1: (test=0.532) recall: (test=0.565) total time=   0.0s
[CV 4/5] END C=1.844347898661638, penalty=l2; f1: (test=0.506) recall: (test=0.533) total time=   0.0s
[CV 5/

## Support Vector Machines

In [96]:
# Support Vector Classifier
svc = SVC(class_weight='balanced')
svc_params = {
    'C': uniform(0.01, 10),
    'gamma': uniform(0.001, 1),
    'kernel': ['linear', 'rbf']
}

In [97]:
svc_search = RandomizedSearchCV(svc, svc_params, n_iter=50, scoring=scoring, refit='recall', cv=StratifiedKFold(5), random_state=42, n_jobs=-1,verbose=5)
svc_search.fit(X_train, y_train)
print("Best parameters for SVC:", svc_search.best_params_)
print("Best recall score for SVC:", svc_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 4/5] END C=3.7554011884736247, gamma=0.9517143064099162, kernel=linear; f1: (test=0.507) recall: (test=0.452) total time=  12.3s
[CV 1/5] END C=3.7554011884736247, gamma=0.9517143064099162, kernel=linear; f1: (test=0.519) recall: (test=0.532) total time=  12.8s
[CV 2/5] END C=3.7554011884736247, gamma=0.9517143064099162, kernel=linear; f1: (test=0.530) recall: (test=0.548) total time=  12.8s
[CV 3/5] END C=3.7554011884736247, gamma=0.9517143064099162, kernel=linear; f1: (test=0.519) recall: (test=0.530) total time=  13.0s
[CV 5/5] END C=3.7554011884736247, gamma=0.9517143064099162, kernel=linear; f1: (test=0.520) recall: (test=0.481) total time=  12.4s
[CV 1/5] END C=7.806910002727692, gamma=0.597850157946487, kernel=rbf; f1: (test=0.519) recall: (test=0.617) total time=  16.0s
[CV 3/5] END C=7.806910002727692, gamma=0.597850157946487, kernel=rbf; f1: (test=0.526) recall: (test=0.582) total time=  15.9s
[CV 2/5] END C=7.

## Random Forest

In [98]:
# Random Forest Classifier
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_params = {
    'n_estimators': randint(10, 200),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2']
}

In [99]:
rf_search = RandomizedSearchCV(rf, rf_params, n_iter=50, scoring=scoring, refit='recall', cv=StratifiedKFold(5), n_jobs=-1,verbose=5)
rf_search.fit(X_train, y_train)
print("Best parameters for Random Forest:", rf_search.best_params_)
print("Best recall score for Random Forest:", rf_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 2/5] END max_depth=3, max_features=sqrt, min_samples_leaf=18, min_samples_split=16, n_estimators=164; f1: (test=0.540) recall: (test=0.584) total time=   0.8s
[CV 3/5] END max_depth=3, max_features=sqrt, min_samples_leaf=18, min_samples_split=16, n_estimators=164; f1: (test=0.535) recall: (test=0.581) total time=   0.8s
[CV 4/5] END max_depth=3, max_features=sqrt, min_samples_leaf=18, min_samples_split=16, n_estimators=164; f1: (test=0.513) recall: (test=0.555) total time=   0.8s
[CV 1/5] END max_depth=12, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=82; f1: (test=nan) recall: (test=nan) total time=   0.0s
[CV 2/5] END max_depth=12, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=82; f1: (test=nan) recall: (test=nan) total time=   0.0s
[CV 3/5] END max_depth=12, max_features=auto, min_samples_leaf=3, min_samples_split=4, n_estimators=82; f1: (test=nan) recall: (test=na

85 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
61 fits failed with the following error:
Traceback (most recent call last):
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packag

Best parameters for Random Forest: {'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 18, 'min_samples_split': 18, 'n_estimators': 107}
Best recall score for Random Forest: 0.6118775812771365


## Gradient Boosting Classifier

In [100]:
# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=42)
gbc_params = {
    'learning_rate': uniform(0.01, 1),
    'n_estimators': randint(10, 200),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'subsample': uniform(0.5, 0.5)
}

In [101]:
gbc_search = RandomizedSearchCV(gbc, gbc_params, n_iter=50, scoring=scoring, refit='recall', cv=StratifiedKFold(5), random_state=42, n_jobs=-1,verbose=1)
gbc_search.fit(X_train, y_train)
print("Best parameters for Gradient Boosting:", gbc_search.best_params_)
print("Best recall score for Gradient Boosting:", gbc_search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Gradient Boosting: {'learning_rate': 0.7541705230565623, 'max_depth': 3, 'min_samples_leaf': 17, 'min_samples_split': 2, 'n_estimators': 185, 'subsample': 0.5468373839140462}
Best recall score for Gradient Boosting: 0.37178562603692333


# Evaluating Models

In [102]:
# Evaluating the best models
best_models = {
    "Logistic Regression": log_reg_search.best_estimator_,
    "SVC": svc_search.best_estimator_,
    "Random Forest": rf_search.best_estimator_,
    "Gradient Boosting": gbc_search.best_estimator_
}

In [105]:
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n{model_name} classification report:")
    print(classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1']))
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"F1-Score: {f1_score(y_test, y_pred)}")


Logistic Regression classification report:
              precision    recall  f1-score   support

     Class 0       0.87      0.83      0.85      4687
     Class 1       0.48      0.57      0.52      1313

    accuracy                           0.77      6000
   macro avg       0.68      0.70      0.68      6000
weighted avg       0.79      0.77      0.78      6000

Recall: 0.5689261233815689
F1-Score: 0.5192909280500522

SVC classification report:
              precision    recall  f1-score   support

     Class 0       0.88      0.78      0.83      4687
     Class 1       0.44      0.62      0.52      1313

    accuracy                           0.74      6000
   macro avg       0.66      0.70      0.67      6000
weighted avg       0.78      0.74      0.76      6000

Recall: 0.623000761614623
F1-Score: 0.5164141414141414

Random Forest classification report:
              precision    recall  f1-score   support

     Class 0       0.88      0.80      0.84      4687
     Class 1    

#### Observation
- SVC gave the best recall. Hence let's choose the svc classifier

In [106]:
svc=best_models['SVC']
svc

In [None]:
with open('model.pkl','wb') as f:
    pkl.dump(svc,f)