In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_recall_curve, auc, roc_curve

In [None]:
train_data = pd.read_csv('../Dataset/train.csv')
test_data = pd.read_csv('../Dataset/test.csv')
train_data.drop(labels='Id', axis=1, inplace=True)

In [None]:
train_data

In [None]:
#As in EDA file, credit_line_utilization column needs to be converted from object to float
test_data['credit_line_utilization'].replace(',','.', regex = True, inplace = True)
test_data["credit_line_utilization"] = pd.to_numeric(test_data["credit_line_utilization"])
train_data['credit_line_utilization'].replace(',','.',regex = True, inplace = True)
train_data["credit_line_utilization"] = pd.to_numeric(train_data["credit_line_utilization"])

In [None]:
X = train_data[["age", "number_dependent_family_members","monthly_income",
                            "number_of_credit_lines","real_estate_loans","ratio_debt_payment_to_income",
                            "credit_line_utilization","number_of_previous_late_payments_up_to_59_days",
                   "number_of_previous_late_payments_up_to_89_days",
                   "number_of_previous_late_payments_90_days_or_more"]]
y = train_data[["defaulted_on_loan"]]

In [None]:
numerical_feats = X.dtypes[X.dtypes != 'object'].index
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train_data.dtypes[train_data.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_feats))

In [None]:
#Function for printing the best parameters for the GridSearchCV
def get_best_score(grid):
    print("The best score is {:.2f}".format(grid.best_score_))    
    print("Best parameters", grid.best_params_)
    return grid.best_score_

Train-test split of our train data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

After using K-neighbors Classifier, RandomForest Classifier, and Logistic Regression which were pretty slow, I decided on moving on with Catboost Classifier

Catboost Classifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
cat_params = {
  'model__depth': [4, 5, 6],
  'model__learning_rate' : [0.01, 0.02],
  'model__iterations': [10, 20]
}

cat_model = Pipeline(steps = [
    ("preprocessing", ColumnTransformer(transformers = [
        ("num_column", Pipeline(steps = [
            ("imputer", SimpleImputer()),
            ("scaler", StandardScaler()),
        ]), numerical_feats),
    ])),
    ("model", CatBoostClassifier())
])

In [27]:
import optuna
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score

def make_model(l2_leaf_reg, learning_rate, imp_strategy):
    model = Pipeline(steps = [
      ("preprocessing", Pipeline(steps = [
            ("imputer",SimpleImputer(strategy = imp_strategy)),
            ("scaler", StandardScaler())
             ])),
      ("model", CatBoostClassifier(depth = 8, l2_leaf_reg = l2_leaf_reg,
                                   learning_rate=learning_rate, silent = True))])
    return model

In [28]:
def objective(trial):
    imp_strategy = trial.suggest_categorical('imp_strategy', [
        'median', 'mean', 'most_frequent'])
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1, 15)
    learning_rate = trial.suggest_float('learning_rate',0.1e-4, 0.3)
    
    model = make_model(l2_leaf_reg, learning_rate, 
                       imp_strategy)
    return cross_val_score(model, X_train, y_train, cv=3, scoring=make_scorer(roc_auc_score)).mean()

In [29]:
study_cat = optuna.create_study(direction = 'maximize')

[32m[I 2022-04-01 16:57:32,930][0m A new study created in memory with name: no-name-1902d718-9326-439a-a9ea-61496e17abc4[0m


In [31]:
study_cat.optimize(objective, n_trials=10)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.[32m[I 2022-04-01 16:59:57,407][0m Trial 2 finished with value: 0.5792648351065445 and parameters: {'imp_strategy': 'mean', 'l2_leaf_reg': 8.286896371647874, 'learning_rate': 0.028434311238331927}. Best is trial 0 with value: 0.5900985328749416.[0m
[32m[I 2022-04-01 17:00:21,561][0m Trial 3 finished with value: 0.5871336234225413 and parameters: {'imp_strategy': 'most_frequent', 'l2_leaf_reg': 2.298095736771187, 'learning_rate': 0.17623602723228154}. Best is trial 0 with value: 0.5900985328749416.[0m
[32m[I 2022-04-01 17:00:45,755][0m Trial 4 finished with value: 0.5834529277428261 and parameters: {'imp_strategy': 'median', 'l2_leaf_reg': 3.9743680102442935, 'learning_rate': 0.05033442129126743}. Best is trial 0 with value: 0.5900985328749416.[0m
[32m[I 2022-04-01 17:01:09,681][0m Trial 5 finished with value: 0.5765184527813071 and parameters: {'imp_strategy': 'median', 'l2_leaf_

In [32]:
study_cat.best_params

{'imp_strategy': 'most_frequent',
 'l2_leaf_reg': 8.069839986047254,
 'learning_rate': 0.29037681714266655}

In [None]:
grid_cat = GridSearchCV(cat_model, cat_params, cv = 5)

In [None]:
grid_cat.fit(X_train, y_train.values.ravel())

In [None]:
get_best_score(grid_cat)

In [None]:
cat_params = {
  'model__depth': [7, 8, 9],
  'model__learning_rate' : [0.03, 0.04],
  'model__iterations': [50, 60]
}

cat_model = Pipeline(steps = [
    ("preprocessing", ColumnTransformer(transformers = [
        ("num_column", Pipeline(steps = [
            ("imputer", SimpleImputer()),
            ("scaler", StandardScaler()),
        ]), numerical_feats),
    ])),
    ("model", CatBoostClassifier())
])

In [None]:
grid_cat = GridSearchCV(cat_model, cat_params, cv = 5)

In [None]:
grid_cat.fit(X_train, y_train.values.ravel())

In [None]:
get_best_score(grid_cat)

After using two different parameters and getting the same score of 0.93, I decided to use one of the best parameters out of two dictionaries. Namely, {'model__depth': 8, 'model__iterations': 50, 'model__learning_rate': 0.03}

In [33]:
cat_model = Pipeline(steps = [
    ("preprocessing", ColumnTransformer(transformers = [
        ("num_column", Pipeline(steps = [
            ("imputer", SimpleImputer(strategy='most_frequent')),
            ("scaler", StandardScaler()),
        ]), numerical_feats),
    ])),
    ("model", CatBoostClassifier(
      depth = 8, 
      iterations = 50,
      l2_leaf_reg = 8.069839986047254,
      learning_rate = 0.29037681714266655
    ))
])

In [34]:
cat_model.fit(X_train, y_train.values.ravel())

0:	learn: 0.5001075	total: 9.23ms	remaining: 452ms
1:	learn: 0.3916225	total: 13.8ms	remaining: 331ms
2:	learn: 0.3254346	total: 18.2ms	remaining: 285ms
3:	learn: 0.2818455	total: 22.9ms	remaining: 263ms
4:	learn: 0.2524425	total: 28.1ms	remaining: 253ms
5:	learn: 0.2328753	total: 33.7ms	remaining: 247ms
6:	learn: 0.2200882	total: 37.7ms	remaining: 232ms
7:	learn: 0.2109194	total: 43.2ms	remaining: 227ms
8:	learn: 0.2040907	total: 48ms	remaining: 219ms
9:	learn: 0.1996408	total: 52.9ms	remaining: 212ms
10:	learn: 0.1964761	total: 58.6ms	remaining: 208ms
11:	learn: 0.1944797	total: 63.7ms	remaining: 202ms
12:	learn: 0.1923044	total: 68.6ms	remaining: 195ms
13:	learn: 0.1911453	total: 74.2ms	remaining: 191ms
14:	learn: 0.1901130	total: 79.3ms	remaining: 185ms
15:	learn: 0.1893754	total: 84.3ms	remaining: 179ms
16:	learn: 0.1888883	total: 89.5ms	remaining: 174ms
17:	learn: 0.1883484	total: 94.5ms	remaining: 168ms
18:	learn: 0.1880147	total: 99.5ms	remaining: 162ms
19:	learn: 0.1876557	tot

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num_column',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'number_dependent_family_members', 'monthly_income',
       'number_of_credit_lines', 'real_estate_loans',
       'ratio_debt_payment_to_income', 'credit_line_utilization',
       'number_of_previous_late_payments_up_to_59_days',
       'number_of_previous_late_payments_up_to_89_days',
       'number_of_previous_late_payments_90_days_or_more'],
      dtype='object'))])),
                ('model',
                 <catboost.core.CatBoostClassifier object at 0x7fdd82362e80>)])

In [35]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
print("Train data:")
print("Accuracy score: ", accuracy_score(cat_model.predict(X_train),y_train))
print("Recall score: ", recall_score(cat_model.predict(X_train),y_train))
print("F1 score: ", f1_score(cat_model.predict(X_train),y_train))
print("Roc_auc score: ", roc_auc_score(cat_model.predict(X_train),y_train))
print("__________________________________")
print("Test data")
print("Accuracy score: ", accuracy_score(cat_model.predict(X_test),y_test))
print("Recall score: ", recall_score(cat_model.predict(X_test),y_test))
print("F1 score: ", f1_score(cat_model.predict(X_test),y_test))
print("Roc_auc score: ", roc_auc_score(cat_model.predict(X_test),y_test))

Train data:
Accuracy score:  0.9381744271988175
Recall score:  0.6952636282394995
F1 score:  0.31742146062831494
Roc_auc score:  0.8192832923937445
__________________________________
Test data
Accuracy score:  0.9352031483842359
Recall score:  0.5804749340369393
F1 score:  0.27346177750155376
Roc_auc score:  0.7616450086332358


In [37]:
# test_data.drop(["Id"], axis = 1, inplace = True)

cat_trial_df = pd.DataFrame({
  "Id": range(1, 48109),
  "Predicted": cat_model.predict_proba(test_data)[:,1]
})

In [38]:
cat_trial_df.to_csv("test_csv",index = False)