In [24]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
import warnings
warnings.filterwarnings('ignore')

In [26]:
df = pd.read_csv('data/train_preprocessed.csv')
df.head()

Unnamed: 0,SK_ID_CURR,W_DEFAULT,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,AMT_LESS_MAX,DAYS_LATE_AVG,DAYS_LATE_MAX,SK_DPD_DEF,LESS_GRANTED,REJECTED_APPLICATIONS,AMT_CREDIT_SUM_TOTAL,AMT_CREDIT_SUM_DEBT_TOTAL,AMT_CREDIT_SUM_LIMIT_TOTAL,AMT_CREDIT_SUM_OVERDUE_TOTAL
0,100002,1.0,0,1,0,202500.0,Working,Secondary / secondary special,Single / not married,House / apartment,...,0.0,20.421053,31.0,0.0,0.0,0.0,865055.565,245781.0,31988.565,0.0
1,100003,0.0,0,0,0,270000.0,State servant,Higher education,Married,House / apartment,...,0.0,7.16,14.0,0.0,1.0,0.0,1017400.5,0.0,810000.0,0.0
2,100004,0.0,1,1,0,67500.0,Working,Secondary / secondary special,Single / not married,House / apartment,...,0.0,7.666667,11.0,0.0,1.0,0.0,189037.8,0.0,0.0,0.0
3,100006,0.0,0,1,0,135000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,...,0.0,19.375,77.0,0.0,2.0,1.0,,,,
4,100007,0.0,0,1,0,121500.0,Working,Secondary / secondary special,Single / not married,House / apartment,...,22655.655,4.590909,31.0,0.0,2.0,0.0,146250.0,0.0,0.0,0.0


In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline


categoricals = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
cols_to_scale = [
    'Avg_Monthly_Balance','Months_Fully_Paid', 
    'Months_Minimum_Paid', 'Months_Minimum_Paid', 
    'AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'OWN_CAR_AGE',
    'Age_Years','Employed_Months', 'AMT_LESS_TOTAL',
    'AMT_LESS_MAX', 'DAYS_LATE_MAX', 'SK_DPD_DEF',
    'AMT_CREDIT_SUM_TOTAL', 'AMT_CREDIT_SUM_DEBT_TOTAL',
    'AMT_CREDIT_SUM_LIMIT_TOTAL', 'AMT_CREDIT_SUM_OVERDUE_TOTAL',
    'Tenure_Months', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
]
others = df.columns.difference(categoricals + cols_to_scale)

preproc = ColumnTransformer([
    ('cat', OneHotEncoder(), categoricals),
    ('scaled', MinMaxScaler(), cols_to_scale),
], remainder='passthrough')



In [28]:
X = df.drop(columns='W_DEFAULT')
y = df['W_DEFAULT']

X = preproc.fit_transform(X)
X[np.isnan(X)] = 0  #naively fill NA with 0

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

knn = KNeighborsClassifier()
lr = LogisticRegression(max_iter=100_000)
svc = SVC(probability=True,)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gbm = GradientBoostingClassifier()

models = [
    knn, 
    lr, 
    # svc, 
    dt, 
    rf, 
    gbm
]
model_names = [
    'KNN',
    'Logistic Regression',
    # 'SVC', 
    'Decision Tree',
    'Random Forest', 
    'Gradient Boosting'
]

In [30]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.metrics import precision_recall_curve, auc, make_scorer

def auc_prc(y_true, y_pred):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

auc_prc_score = make_scorer(auc_prc, needs_proba=True)

In [31]:
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, matthews_corrcoef

for model_name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    print(f'{model_name} results:')

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    print(f'F1 Score: {f1_score(y_test, y_pred)}')
    print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
    print(f'ROC AUC: {roc_auc_score(y_test, y_proba[:,1])}')
    # print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')


KNN results:
F1 Score: 0.011934954497985976
MCC: 0.006755014201058112
ROC AUC: 0.508936392197101
Logistic Regression results:
F1 Score: 0.0
MCC: 0.0
ROC AUC: 0.5043668020431477
Decision Tree results:
F1 Score: 0.1645187421754179
MCC: 0.0842046025042393
ROC AUC: 0.5447187178685566
Random Forest results:
F1 Score: 0.010949475989363364
MCC: 0.05326561676195918
ROC AUC: 0.731615769561407
Gradient Boosting results:
F1 Score: 0.033659730722154224
MCC: 0.08712335388459061
ROC AUC: 0.7600829531555864


In [32]:
results = {}

for model_name, model in zip(model_names, models):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    results[model_name] = {
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba[:,1]),
        'PRC AUC': auc_prc(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    print(f'Confusion Matrix for {model_name}')
    cm = confusion_matrix(y_test, y_pred)
    display(pd.DataFrame(cm, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive']))

Confusion Matrix for KNN


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,70215,326
Actual Positive,6297,40


Confusion Matrix for Logistic Regression


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,70541,0
Actual Positive,6337,0


Confusion Matrix for Decision Tree


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,64416,6125
Actual Positive,5220,1117


Confusion Matrix for Random Forest


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,70520,21
Actual Positive,6302,35


Confusion Matrix for Gradient Boosting


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,70452,89
Actual Positive,6227,110


In [33]:
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,Precision,Recall,F1,ROC AUC,PRC AUC,MCC
KNN,0.10929,0.006312,0.011935,0.508936,0.098755,0.006755
Logistic Regression,0.0,0.0,0.0,0.504367,0.541215,0.0
Decision Tree,0.154239,0.176266,0.164519,0.544719,0.199203,0.084205
Random Forest,0.625,0.005523,0.010949,0.731616,0.356249,0.053266
Gradient Boosting,0.552764,0.017358,0.03366,0.760083,0.32556,0.087123


In [35]:
test_df = pd.read_csv('data/test_preprocessed.csv')
X_submit = test_df.drop(columns='W_DEFAULT')
X_submit = preproc.transform(X_submit)
X_submit[np.isnan(X_submit)] = 0

y_submit_rf = rf.predict_proba(X_submit)
y_submit_gbm = gbm.predict_proba(X_submit)

pd.Series(y_submit_rf[:,1], index=test_df.SK_ID_CURR, name='TARGET').to_csv('submits/baseline_rf.csv')
pd.Series(y_submit_gbm[:,1], index=test_df.SK_ID_CURR, name='TARGET').to_csv('submits/baseline_gbm.csv')

In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

models = {
    'kNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=10_000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_jobs=-1),
    # 'Gradient Boosting': GradientBoostingClassifier(), # remove for performance
    'XGBoost': XGBClassifier() # XGBoost is much faster than sklearn GBM (order of magnitude)
}

In [41]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

seed = None
ros = RandomOverSampler(random_state=seed)
rus = RandomUnderSampler(random_state=seed)
smote = SMOTE(random_state=seed)
adasyn = ADASYN(random_state=seed)
tomek = TomekLinks()
smotetomek = SMOTETomek(random_state=seed)

methods = {
    'Original': 'passthrough',
    'Random Oversampling': ros,
    'Random Undersampling': rus,
    'SMOTE': smote,
    'ADASYN': adasyn,
    # 'Tomek': tomek, #remove for performance
    # 'SMOTETomek': smotetomek,
}

In [42]:
scoring = {
    'auc_prc': auc_prc_score, 
    'precision': make_scorer(precision_score), 
    'recall': make_scorer(recall_score),
    'mcc': make_scorer(matthews_corrcoef),
    'auc_roc': make_scorer(roc_auc_score)
}

In [43]:
from sklearn.model_selection import cross_validate
from imblearn.pipeline import Pipeline

all_results = []
for model_name, model in models.items():
    results_model = {}

    for method, resampler in methods.items():

        pipeline = Pipeline(
            [
                (method, resampler),
                [model_name, model]
            ]
        )

        scores = cross_validate(pipeline, X, y, scoring=scoring)
        scores = {key: np.mean(values) for key, values in scores.items()}

        results_model[method] = {
            'Precision': scores['test_precision'],
            'Recall': scores['test_recall'],
            'AUC PRC': scores['test_auc_prc'],
            'AUC ROC': scores['test_auc_roc'],
            'MCC': scores['test_mcc'],
            'Training Time': scores['fit_time']
        }
    
    results_model = pd.DataFrame.from_dict(results_model, orient='index')
    print(f'Results for {model_name}')
    display(results_model)

    results_model['Model'] = model_name
    all_results.append(results_model.reset_index(names='Sampler'))

all_results = pd.concat(all_results, axis=0).reset_index(drop=True)

Results for kNN


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.04612,0.333333,0.22193,0.491031,-0.011218,0.102917
Random Oversampling,0.079295,0.380101,0.249584,0.493745,-0.008283,0.501126
Random Undersampling,0.085146,0.419376,0.229086,0.498651,0.000485,0.088493
SMOTE,0.081381,0.371843,0.246213,0.493559,-0.0081,1.530547
ADASYN,0.076975,0.373011,0.245787,0.493763,-0.008221,9.325237


Results for Logistic Regression


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.0,0.0,0.089669,0.5,0.0,0.927746
Random Oversampling,0.092067,0.415347,0.086885,0.524368,0.03046,1.558128
Random Undersampling,0.091293,0.333897,0.08597,0.517781,0.023876,0.198673
SMOTE,0.09209,0.429043,0.087617,0.527416,0.032362,2.597414
ADASYN,0.091146,0.140987,0.085675,0.509663,0.015247,10.029398


Results for Decision Tree


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.141442,0.191944,0.19931,0.543433,0.076468,9.900624
Random Oversampling,0.134184,0.139859,0.171741,0.531286,0.06046,13.317199
Random Undersampling,0.105922,0.551742,0.346926,0.571759,0.079151,1.151425
SMOTE,0.136003,0.201047,0.200774,0.542986,0.07293,14.28986
ADASYN,0.135779,0.196375,0.198515,0.542169,0.072077,22.352846


Results for Random Forest


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.6149,0.004632,0.22123,0.502185,0.048306,15.569016
Random Oversampling,0.516423,0.017523,0.221406,0.507819,0.067803,25.771734
Random Undersampling,0.163959,0.673273,0.219962,0.685791,0.215083,1.819045
SMOTE,0.557977,0.004471,0.209029,0.502076,0.044433,31.843879
ADASYN,0.575402,0.004995,0.210468,0.502333,0.047883,38.076317


Results for XGBoost


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.341426,0.055911,0.227935,0.524362,0.112111,2.451453
Random Oversampling,0.191458,0.497644,0.215136,0.646825,0.196767,4.500367
Random Undersampling,0.165123,0.59569,0.209818,0.66157,0.19515,1.106047
SMOTE,0.437915,0.038872,0.224173,0.517204,0.107701,9.156399
ADASYN,0.445323,0.042578,0.225498,0.518758,0.112228,17.338161
