In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
train_df = pd.read_csv('data/train_51f_fill0.csv')
test_df = pd.read_csv('data/test_51f_fill0.csv')

train_df.head()

Unnamed: 0,SK_ID_CURR,W_DEFAULT,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,AMT_LESS_MAX,DAYS_LATE_AVG,DAYS_LATE_MAX,SK_DPD_DEF,LESS_GRANTED,REJECTED_APPLICATIONS,AMT_CREDIT_SUM_TOTAL,AMT_CREDIT_SUM_DEBT_TOTAL,AMT_CREDIT_SUM_LIMIT_TOTAL,AMT_CREDIT_SUM_OVERDUE_TOTAL
0,100002,1.0,0,1,0,202500.0,Working,Secondary / secondary special,Single / not married,House / apartment,...,0.0,20.421053,31.0,0.0,0.0,0.0,865055.565,245781.0,31988.565,0.0
1,100003,0.0,0,0,0,270000.0,State servant,Higher education,Married,House / apartment,...,0.0,7.16,14.0,0.0,1.0,0.0,1017400.5,0.0,810000.0,0.0
2,100004,0.0,1,1,0,67500.0,Working,Secondary / secondary special,Single / not married,House / apartment,...,0.0,7.666667,11.0,0.0,1.0,0.0,189037.8,0.0,0.0,0.0
3,100006,0.0,0,1,0,135000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,...,0.0,19.375,77.0,0.0,2.0,1.0,,,,
4,100007,0.0,0,1,0,121500.0,Working,Secondary / secondary special,Single / not married,House / apartment,...,22655.655,4.590909,31.0,0.0,2.0,0.0,146250.0,0.0,0.0,0.0


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline


categoricals = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
cols_to_scale = [
    'Avg_Monthly_Balance','Months_Fully_Paid', 
    'Months_Minimum_Paid', 'Months_Minimum_Paid', 
    'AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'OWN_CAR_AGE',
    'Age_Years','Employed_Months', 'AMT_LESS_TOTAL',
    'AMT_LESS_MAX', 'DAYS_LATE_MAX', 'SK_DPD_DEF',
    'AMT_CREDIT_SUM_TOTAL', 'AMT_CREDIT_SUM_DEBT_TOTAL',
    'AMT_CREDIT_SUM_LIMIT_TOTAL', 'AMT_CREDIT_SUM_OVERDUE_TOTAL',
    'Tenure_Months', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
]
others = train_df.columns.difference(categoricals + cols_to_scale)

preproc = ColumnTransformer([
    ('cat', OneHotEncoder(), categoricals),
    ('scaled', MinMaxScaler(), cols_to_scale),
], remainder='passthrough')



In [11]:
X = train_df.drop(columns='W_DEFAULT')
y = train_df['W_DEFAULT']

X = preproc.fit_transform(X)
X[np.isnan(X)] = 0  #naively fill NA with 0

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

knn = KNeighborsClassifier()
lr = LogisticRegression(max_iter=100_000)
svc = SVC(probability=True,)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gbm = GradientBoostingClassifier()

models = [
    knn, 
    lr, 
    # svc, 
    dt, 
    rf, 
    gbm
]
model_names = [
    'KNN',
    'Logistic Regression',
    # 'SVC', 
    'Decision Tree',
    'Random Forest', 
    'Gradient Boosting'
]

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.metrics import precision_recall_curve, auc, make_scorer

def auc_prc(y_true, y_pred):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

auc_prc_score = make_scorer(auc_prc, needs_proba=True)

In [14]:
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, matthews_corrcoef

for model_name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    print(f'{model_name} results:')

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    print(f'F1 Score: {f1_score(y_test, y_pred)}')
    print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
    print(f'ROC AUC: {roc_auc_score(y_test, y_proba[:,1])}')
    # print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')


KNN results:
F1 Score: 0.010287443267776097
MCC: 0.003945618439413269
ROC AUC: 0.5073853291356737
Logistic Regression results:
F1 Score: 0.0
MCC: 0.0
ROC AUC: 0.4987592931219931
Decision Tree results:
F1 Score: 0.16058832357796787
MCC: 0.08142807331454578
ROC AUC: 0.5430023226244457
Random Forest results:
F1 Score: 0.008553777918580708
MCC: 0.04164271488598126
ROC AUC: 0.7366237483854887
Gradient Boosting results:
F1 Score: 0.03261375990060568
MCC: 0.08955369912432833
ROC AUC: 0.7646702524064041


In [18]:
results = {}

for model_name, model in zip(model_names, models):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    results[model_name] = {
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba[:,1]),
        'PRC AUC': auc_prc(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    print(f'Confusion Matrix for {model_name}')
    cm = confusion_matrix(y_test, y_pred)
    display(pd.DataFrame(cm, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive']))

Confusion Matrix for KNN


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,70302,315
Actual Positive,6227,34


Confusion Matrix for Logistic Regression


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,70617,0
Actual Positive,6261,0


Confusion Matrix for Decision Tree


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,64622,5995
Actual Positive,5191,1070


Confusion Matrix for Random Forest


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,70592,25
Actual Positive,6234,27


Confusion Matrix for Gradient Boosting


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,70544,73
Actual Positive,6156,105


In [19]:
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,Precision,Recall,F1,ROC AUC,PRC AUC,MCC
KNN,0.097421,0.00543,0.010287,0.507385,0.091925,0.003946
Logistic Regression,0.0,0.0,0.0,0.498759,0.54072,0.0
Decision Tree,0.151451,0.170899,0.160588,0.543002,0.194936,0.081428
Random Forest,0.519231,0.004312,0.008554,0.736624,0.302316,0.041643
Gradient Boosting,0.589888,0.01677,0.032614,0.76467,0.343367,0.089554


In [None]:
X_submit = test_df.drop(columns='W_DEFAULT')
X_submit = preproc.transform(X_submit)
X_submit[np.isnan(X_submit)] = 0

y_submit_rf = rf.predict_proba(X_submit)
y_submit_gbm = gbm.predict_proba(X_submit)

pd.Series(y_submit_rf[:,1], index=test_df.SK_ID_CURR, name='TARGET').to_csv('submits/baseline_rf.csv')
pd.Series(y_submit_gbm[:,1], index=test_df.SK_ID_CURR, name='TARGET').to_csv('submits/baseline_gbm.csv')

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

models = {
    'kNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=10_000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_jobs=-1),
    # 'Gradient Boosting': GradientBoostingClassifier(), # remove for performance
    'XGBoost': XGBClassifier() # XGBoost is much faster than sklearn GBM (order of magnitude)
}

In [24]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

seed = None
ros = RandomOverSampler(random_state=seed)
rus = RandomUnderSampler(random_state=seed)
smote = SMOTE(random_state=seed)
adasyn = ADASYN(random_state=seed)
tomek = TomekLinks()
smotetomek = SMOTETomek(random_state=seed)

methods = {
    'Original': 'passthrough',
    'Random Oversampling': ros,
    'Random Undersampling': rus,
    'SMOTE': smote,
    'ADASYN': adasyn,
    # 'Tomek': tomek, #remove for performance
    # 'SMOTETomek': smotetomek,
}

In [22]:
scoring = {
    'auc_prc': auc_prc_score, 
    'precision': make_scorer(precision_score), 
    'recall': make_scorer(recall_score),
    'mcc': make_scorer(matthews_corrcoef),
    'auc_roc': make_scorer(roc_auc_score, needs_proba=True)
}

In [25]:
from sklearn.model_selection import cross_validate
from imblearn.pipeline import Pipeline

all_results = []
for model_name, model in models.items():
    results_model = {}

    for method, resampler in methods.items():

        pipeline = Pipeline(
            [
                (method, resampler),
                [model_name, model]
            ]
        )

        scores = cross_validate(pipeline, X, y, scoring=scoring)
        scores = {key: np.mean(values) for key, values in scores.items()}

        results_model[method] = {
            'Precision': scores['test_precision'],
            'Recall': scores['test_recall'],
            'AUC PRC': scores['test_auc_prc'],
            'AUC ROC': scores['test_auc_roc'],
            'MCC': scores['test_mcc'],
            'Training Time': scores['fit_time']
        }
    
    results_model = pd.DataFrame.from_dict(results_model, orient='index')
    print(f'Results for {model_name}')
    display(results_model)

    results_model['Model'] = model_name
    all_results.append(results_model.reset_index(names='Sampler'))

all_results = pd.concat(all_results, axis=0).reset_index(drop=True)

Results for kNN


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.04612,0.333333,0.22193,0.490019,-0.011218,0.053993
Random Oversampling,0.079295,0.380101,0.249585,0.493698,-0.008283,0.471803
Random Undersampling,0.084973,0.419335,0.235926,0.498282,-0.000198,0.083617
SMOTE,0.083843,0.372971,0.246762,0.4931,-0.008723,1.34134
ADASYN,0.078788,0.373253,0.245716,0.492939,-0.00788,7.678382


Results for Logistic Regression


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.0,0.0,0.089669,0.512434,0.0,0.707048
Random Oversampling,0.089691,0.47283,0.087455,0.534175,0.029252,1.463308
Random Undersampling,0.092932,0.248781,0.086116,0.526285,0.023421,0.129849
SMOTE,0.091064,0.563907,0.089036,0.544281,0.036962,2.528805
ADASYN,0.091113,0.140947,0.085675,0.522368,0.01521,7.813154


Results for Decision Tree


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.139976,0.19142,0.198336,0.542536,0.074752,9.140665
Random Oversampling,0.134511,0.142115,0.172941,0.531446,0.060793,12.744049
Random Undersampling,0.109055,0.577563,0.36036,0.581026,0.089408,1.09283
SMOTE,0.139529,0.195327,0.199908,0.543791,0.075803,13.734627
ADASYN,0.139937,0.198751,0.201686,0.54478,0.077046,19.607492


Results for Random Forest


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.619632,0.004512,0.222534,0.733944,0.047892,12.100175
Random Oversampling,0.526252,0.017402,0.219145,0.739105,0.063742,20.798779
Random Undersampling,0.164316,0.667553,0.216818,0.746912,0.214242,1.442647
SMOTE,0.480872,0.003464,0.208329,0.727922,0.035432,24.195499
ADASYN,0.531311,0.00427,0.20894,0.728403,0.041978,30.024426


Results for XGBoost


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.341426,0.055911,0.227935,0.747005,0.112111,1.635812
Random Oversampling,0.191187,0.529184,0.217998,0.738877,0.195324,2.713307
Random Undersampling,0.169427,0.553998,0.2038,0.729484,0.186509,0.471406
SMOTE,0.40973,0.055992,0.219337,0.74029,0.117921,5.351224
ADASYN,0.431327,0.050876,0.224434,0.747781,0.114264,11.944927
