In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_recall_curve, auc, roc_curve

In [93]:
train_data = pd.read_csv('../Dataset/train.csv')
test_data = pd.read_csv('../Dataset/test.csv')
train_data.drop(labels='Id', axis=1, inplace=True)

In [94]:
train_data

Unnamed: 0,age,number_dependent_family_members,monthly_income,number_of_credit_lines,real_estate_loans,ratio_debt_payment_to_income,credit_line_utilization,number_of_previous_late_payments_up_to_59_days,number_of_previous_late_payments_up_to_89_days,number_of_previous_late_payments_90_days_or_more,defaulted_on_loan
0,66.0,,4000.0,,1.0,0.569108,0.05488766900000001,0.0,0.0,0.0,0
1,61.0,2.0,4000.0,6.0,1.0,0.297176,0.10194991099999999,0.0,0.0,0.0,0
2,31.0,2.0,3040.0,8.0,0.0,0.160145,1.22713507,4.0,0.0,0.0,1
3,54.0,4.0,10218.0,5.0,0.0,0.067913,0.08327777900000001,0.0,0.0,0.0,0
4,29.0,0.0,4468.0,6.0,0.0,0.328261,0.317445504,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
72156,70.0,0.0,9584.0,13.0,3.0,0.429943,0.145213501,1.0,0.0,0.0,0
72157,51.0,,6374.0,15.0,2.0,0.667608,0.445703733,0.0,0.0,0.0,0
72158,58.0,0.0,4333.0,,1.0,0.413475,0.048434473,0.0,0.0,0.0,0
72159,58.0,2.0,6466.0,14.0,2.0,0.420288,0.447265776,0.0,0.0,0.0,0


In [95]:
#As in EDA file, credit_line_utilization column needs to be converted from object to float
test_data['credit_line_utilization'].replace(',','.', regex = True, inplace = True)
test_data["credit_line_utilization"] = pd.to_numeric(test_data["credit_line_utilization"])
train_data['credit_line_utilization'].replace(',','.',regex = True, inplace = True)
train_data["credit_line_utilization"] = pd.to_numeric(train_data["credit_line_utilization"])

In [96]:
X = train_data[["age", "number_dependent_family_members","monthly_income",
                            "number_of_credit_lines","real_estate_loans","ratio_debt_payment_to_income",
                            "credit_line_utilization","number_of_previous_late_payments_up_to_59_days",
                   "number_of_previous_late_payments_up_to_89_days",
                   "number_of_previous_late_payments_90_days_or_more"]]
y = train_data[["defaulted_on_loan"]]

In [98]:
numerical_feats = X.dtypes[X.dtypes != 'object'].index
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train_data.dtypes[train_data.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  10
Number of Categorical features:  0


In [99]:
#Function for printing the best parameters for the GridSearchCV
def get_best_score(grid):
    print("The best score is {:.2f}".format(grid.best_score_))    
    print("Best parameters", grid.best_params_)
    return grid.best_score_

Train-test split of our train data

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

After using K-neighbors Classifier, RandomForest Classifier, and Logistic Regression which were pretty slow, I decided on moving on with Catboost Classifier

Catboost Classifier

In [101]:
from catboost import CatBoostClassifier

In [115]:
cat_params = {
  'model__depth': [4, 5, 6],
  'model__learning_rate' : [0.01, 0.02],
  'model__iterations': [10, 20]
}

cat_model = Pipeline(steps = [
    ("preprocessing", ColumnTransformer(transformers = [
        ("num_column", Pipeline(steps = [
            ("imputer", SimpleImputer()),
            ("scaler", StandardScaler()),
        ]), numerical_feats),
    ])),
    ("model", CatBoostClassifier())
])

In [116]:
grid_cat = GridSearchCV(cat_model, cat_params, cv = 5)

In [117]:
grid_cat.fit(X_train, y_train.values.ravel())

0:	learn: 0.6854270	total: 5.02ms	remaining: 45.1ms
1:	learn: 0.6778001	total: 9.44ms	remaining: 37.8ms
2:	learn: 0.6703484	total: 12.3ms	remaining: 28.7ms
3:	learn: 0.6631238	total: 15.3ms	remaining: 22.9ms
4:	learn: 0.6560207	total: 18.1ms	remaining: 18.1ms
5:	learn: 0.6489898	total: 20.4ms	remaining: 13.6ms
6:	learn: 0.6420973	total: 22.7ms	remaining: 9.74ms
7:	learn: 0.6353781	total: 25ms	remaining: 6.25ms
8:	learn: 0.6287396	total: 27.2ms	remaining: 3.02ms
9:	learn: 0.6221916	total: 29.6ms	remaining: 0us
0:	learn: 0.6854341	total: 2.31ms	remaining: 20.8ms
1:	learn: 0.6778174	total: 4.79ms	remaining: 19.2ms
2:	learn: 0.6703890	total: 6.71ms	remaining: 15.7ms
3:	learn: 0.6631743	total: 8.77ms	remaining: 13.2ms
4:	learn: 0.6560815	total: 10.8ms	remaining: 10.8ms
5:	learn: 0.6490711	total: 13ms	remaining: 8.7ms
6:	learn: 0.6421942	total: 15.2ms	remaining: 6.49ms
7:	learn: 0.6354801	total: 17.4ms	remaining: 4.34ms
8:	learn: 0.6288551	total: 19.6ms	remaining: 2.18ms
9:	learn: 0.6223470	

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('num_column',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['age', 'number_dependent_family_members', 'monthly_income',
       'number_of_credit_lines', 'real_estate_loans',
       'ratio_debt_payment_to_income', 'credit_line_utilization',
       'number_of_previous_late_payments_up_to_59_days',
       'number_of_previous_late_payments_up_to_89_days',
       'number_of_previous_late_payments_90

In [118]:
sc_ridge = get_best_score(grid_cat)

The best score is 0.93
Best parameters {'model__depth': 6, 'model__iterations': 20, 'model__learning_rate': 0.02}


In [120]:
cat_params = {
  'model__depth': [7, 8, 9],
  'model__learning_rate' : [0.03, 0.04],
  'model__iterations': [50, 60]
}

cat_model = Pipeline(steps = [
    ("preprocessing", ColumnTransformer(transformers = [
        ("num_column", Pipeline(steps = [
            ("imputer", SimpleImputer()),
            ("scaler", StandardScaler()),
        ]), numerical_feats),
    ])),
    ("model", CatBoostClassifier())
])

In [121]:
grid_cat = GridSearchCV(cat_model, cat_params, cv = 5)

In [122]:
grid_cat.fit(X_train, y_train.values.ravel())

0:	learn: 0.6700395	total: 6.77ms	remaining: 332ms
1:	learn: 0.6482273	total: 10.8ms	remaining: 259ms
2:	learn: 0.6278910	total: 14.9ms	remaining: 233ms
3:	learn: 0.6087878	total: 19ms	remaining: 218ms
4:	learn: 0.5903316	total: 22.8ms	remaining: 205ms
5:	learn: 0.5729683	total: 26.5ms	remaining: 194ms
6:	learn: 0.5566312	total: 30.5ms	remaining: 187ms
7:	learn: 0.5412812	total: 34.2ms	remaining: 179ms
8:	learn: 0.5265468	total: 37.8ms	remaining: 172ms
9:	learn: 0.5125951	total: 41.6ms	remaining: 167ms
10:	learn: 0.4990415	total: 45.1ms	remaining: 160ms
11:	learn: 0.4863670	total: 50.4ms	remaining: 160ms
12:	learn: 0.4743012	total: 54.4ms	remaining: 155ms
13:	learn: 0.4627871	total: 57.2ms	remaining: 147ms
14:	learn: 0.4516721	total: 60.5ms	remaining: 141ms
15:	learn: 0.4409600	total: 64.1ms	remaining: 136ms
16:	learn: 0.4309576	total: 67.6ms	remaining: 131ms
17:	learn: 0.4213692	total: 71.4ms	remaining: 127ms
18:	learn: 0.4120763	total: 74.9ms	remaining: 122ms
19:	learn: 0.4034245	tot

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('num_column',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['age', 'number_dependent_family_members', 'monthly_income',
       'number_of_credit_lines', 'real_estate_loans',
       'ratio_debt_payment_to_income', 'credit_line_utilization',
       'number_of_previous_late_payments_up_to_59_days',
       'number_of_previous_late_payments_up_to_89_days',
       'number_of_previous_late_payments_90

In [123]:
sc_ridge = get_best_score(grid_cat)

The best score is 0.93
Best parameters {'model__depth': 8, 'model__iterations': 50, 'model__learning_rate': 0.03}


After using two different parameters and getting the same score of 0.93, I decided to use one of the best parameters out of two dictionaries. Namely, {'model__depth': 8, 'model__iterations': 50, 'model__learning_rate': 0.03}

In [128]:
cat_model = Pipeline(steps = [
    ("preprocessing", ColumnTransformer(transformers = [
        ("num_column", Pipeline(steps = [
            ("imputer", SimpleImputer()),
            ("scaler", StandardScaler()),
        ]), numerical_feats),
    ])),
    ("model", CatBoostClassifier(
      depth = 8, 
      iterations = 50,
      learning_rate = 0.03
    ))
])

In [129]:
cat_model.fit(X_train, y_train.values.ravel())

0:	learn: 0.6699633	total: 11.9ms	remaining: 583ms
1:	learn: 0.6481951	total: 19.4ms	remaining: 466ms
2:	learn: 0.6279330	total: 26.3ms	remaining: 413ms
3:	learn: 0.6086052	total: 31.9ms	remaining: 367ms
4:	learn: 0.5903854	total: 35.6ms	remaining: 321ms
5:	learn: 0.5731110	total: 40.4ms	remaining: 296ms
6:	learn: 0.5566259	total: 45.4ms	remaining: 279ms
7:	learn: 0.5410032	total: 50.8ms	remaining: 266ms
8:	learn: 0.5262133	total: 56.2ms	remaining: 256ms
9:	learn: 0.5121461	total: 61.4ms	remaining: 245ms
10:	learn: 0.4987215	total: 67.2ms	remaining: 238ms
11:	learn: 0.4859466	total: 73.3ms	remaining: 232ms
12:	learn: 0.4740134	total: 78.3ms	remaining: 223ms
13:	learn: 0.4623116	total: 83.6ms	remaining: 215ms
14:	learn: 0.4513690	total: 88.8ms	remaining: 207ms
15:	learn: 0.4407114	total: 93.7ms	remaining: 199ms
16:	learn: 0.4305956	total: 98.6ms	remaining: 191ms
17:	learn: 0.4208921	total: 104ms	remaining: 184ms
18:	learn: 0.4117463	total: 107ms	remaining: 174ms
19:	learn: 0.4028994	tot

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num_column',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'number_dependent_family_members', 'monthly_income',
       'number_of_credit_lines', 'real_estate_loans',
       'ratio_debt_payment_to_income', 'credit_line_utilization',
       'number_of_previous_late_payments_up_to_59_days',
       'number_of_previous_late_payments_up_to_89_days',
       'number_of_previous_late_payments_90_days_or_more'],
      dtype='object'))])),
                ('model',
                 <catboost.core.CatBoostClassifier object at 0x7f93ab3e9190>)])

In [131]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
print("Train data:")
print("Accuracy score: ", accuracy_score(cat_model.predict(X_train),y_train))
print("Recall score: ", recall_score(cat_model.predict(X_train),y_train))
print("F1 score: ", f1_score(cat_model.predict(X_train),y_train))
print("Roc_auc score: ", roc_auc_score(cat_model.predict(X_train),y_train))
print("__________________________________")
print("Test data")
print("Accuracy score: ", accuracy_score(cat_model.predict(X_test),y_test))
print("Recall score: ", recall_score(cat_model.predict(X_test),y_test))
print("F1 score: ", f1_score(cat_model.predict(X_test),y_test))
print("Roc_auc score: ", roc_auc_score(cat_model.predict(X_test),y_test))

Train data:
Accuracy score:  0.9345343680709535
Recall score:  0.6523736600306279
F1 score:  0.19385665529010238
Roc_auc score:  0.7951770482807861
__________________________________
Test data
Accuracy score:  0.9336511279862535
Recall score:  0.6528925619834711
F1 score:  0.20885657633840055
Roc_auc score:  0.7951804795422159


In [133]:
test_data.drop(["Id"], axis = 1, inplace = True)

cat_trial_df = pd.DataFrame({
  "Id": range(1, 48109),
  "Predicted": cat_model.predict_proba(test_data)[:,1]
})

In [136]:
cat_trial_df.to_csv("test_csv",index = False)