In [None]:
# Solution attempt 11 - AdaBoostClassifier
# Final submission: 
# Submission score: 0.731

# This is a quick attempt to employ the AdaBoost Classifier as part of the
# multi-model solution.

In [1]:
import v11_common as com
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
import feather
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [2]:
# Currently using the train and test data supplied in the LGBM notebook
train = pd.read_feather("v11 train data.feather")
test = pd.read_feather("v11 test data.feather")
target_df = pd.read_feather("target.feather")

In [3]:
all_data = pd.concat([train,test], ignore_index = True)

In [12]:
bureau_fill = ["MAX_DAYS_OVERDUE","AVG_DAYS_OVERDUE","TOTAL_CURRENT_CREDIT_AMT","TOTAL_CURRENT_CREDIT_DEBT",
               "TOTAL_CURRENT_CREDIT_LIMIT", "TOTAL_BUREAU_BALANCE_ENTIRES","AVG_NUM_BUREAU_BALANCE_ENTRIES"]
prev_app_fill = ["TOTAL_AMT_APPLICATION","TOTAL_CREDIT_RECEIVED","NUMBER_APPLICATIONS",
                 "NUM_CONTRACT_TYPE_Cash loans","NUM_CONTRACT_TYPE_Consumer loans","NUM_CONTRACT_TYPE_Revolving loans"]
cc_fill = ["NUM_PREV_CC_LOANS","TOTAL_NUM_ALL_DRAWINGS","TOTAL_AMT_ALL_DRAWINGS"]
inst_pay_fill = ["NUM_INSTALLMENT_ENTRIES"]
pos_fill = ["MAX_POS_DPD","AVG_POS_DPD","NUM_LATE_POS_PAYMENTS","NUM_POS_ENTRIES","NUM_CONTRACTS_COMPLETED",
            "NUM_INSTALMENTS_PENDING","NUM_ACCOUNTS_ACTIVE"]

In [13]:
fill_cols = bureau_fill + prev_app_fill + cc_fill + inst_pay_fill + pos_fill

In [14]:
all_data[fill_cols] = all_data[fill_cols].fillna(0).copy()

In [15]:
all_data.dropna(axis = 1, inplace = True)

In [16]:
all_data.shape

(356255, 147)

In [25]:
train = all_data.iloc[:len(target_df),:].copy()
test = all_data.iloc[len(target_df):,:].copy()

In [26]:
train_IDs = train["SK_ID_CURR"]
test_IDs = test["SK_ID_CURR"]

train.drop(["SK_ID_CURR"], axis = 1, inplace = True)
test.drop(["SK_ID_CURR"], axis = 1, inplace = True)

In [19]:
adaboost_params = {"n_estimators":100,
                   "learning_rate":0.6}

In [20]:
# Prev AUC: 0.7408
# Best AUC: 0.7408

folds = KFold(n_splits = 5, shuffle = True)
auc_scores = []

for n_fold, (train_ids, val_ids) in enumerate(folds.split(train)):
    print("#### FOLD NUMBER " + str(n_fold + 1) + " ####")
    train_train = train.iloc[train_ids]
    train_val = train.iloc[val_ids]
    target_train = target_df["TARGET"][train_ids]
    target_val = target_df["TARGET"][val_ids]
    
    clf = AdaBoostClassifier(**adaboost_params)
    clf.fit(train_train, target_train)
    
    train_predictions = clf.predict_proba(train_train)
    val_predictions = clf.predict_proba(train_val)
    auc_train = roc_auc_score(target_train, train_predictions[:,1])
    auc_val = roc_auc_score(target_val, val_predictions[:,1])
    print(f"Train ROC: {auc_train}\nTest ROC: {auc_val}")
    
    auc_scores.append(auc_val)

print(f"\n\nAverage AUC: {np.mean(auc_scores)}")

#### FOLD NUMBER 1 ####
Train ROC: 0.7460362214675162
Test ROC: 0.7395681728400474
#### FOLD NUMBER 2 ####
Train ROC: 0.7453317794997776
Test ROC: 0.7416183736018496
#### FOLD NUMBER 3 ####
Train ROC: 0.7467141489894047
Test ROC: 0.7365973278002195
#### FOLD NUMBER 4 ####
Train ROC: 0.7448985245820576
Test ROC: 0.7430695589495027
#### FOLD NUMBER 5 ####
Train ROC: 0.7451075642948224
Test ROC: 0.743306964394417


Average AUC: 0.7408320795172072


In [21]:
clf2 = AdaBoostClassifier(**adaboost_params)
clf2.fit(train, target_df["TARGET"])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.6, n_estimators=100, random_state=None)

In [22]:
train_predictions = clf2.predict_proba(train)
data_predictions = clf2.predict_proba(test)

In [27]:
submission = pd.DataFrame({"SK_ID_CURR":test_IDs, "TARGET":np.reshape(data_predictions[:,1], newshape = (-1))})
train_preds = pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":np.reshape(train_predictions[:,1], newshape = (-1))})

In [28]:
submission.to_csv("v11_predictions_AdaBoost.csv", index = False)
train_preds.to_csv("v11_predictions_AdaBoost_train.csv", index = False)