In [1]:
import v11_common as com
import numpy as np
import pandas as pd
import xgboost as xgb
import feather
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [2]:
# Load the pre-built v11 feature tables and the label table from feather files
# located in the notebook's working directory.
train, test, target_df = map(
    pd.read_feather,
    ("v11 train data.feather", "v11 test data.feather", "target.feather"),
)

In [3]:
# Detach the SK_ID_CURR identifier column from both feature tables: the IDs are
# kept aside for the output files written at the end of the notebook, and the
# column is removed so it is not fed to the model as a feature.
# NOTE: this mutates `train` / `test` in place, so the cell can only be run once
# per kernel session (a second run raises KeyError because the column is gone).
train_IDs = train.pop("SK_ID_CURR")
test_IDs = test.pop("SK_ID_CURR")

In [4]:
# Hyperparameters handed to xgb.XGBClassifier, both in the cross-validation loop
# and for the final fit on the full training set.
# The trailing notes record values tried earlier; "(opt)" presumably marks values
# suggested by a hyperparameter search -- TODO confirm.
xgboost_params = dict(
    n_estimators=800,
    learning_rate=0.01,       # previous: 0.01
    colsample_bytree=0.932,   # previous: 0.9315 (opt)
    subsample=0.6124,         # previous: 0.771, 0.6124 (opt)
    max_depth=4,              # previous: 7, 4 (opt)
    reg_alpha=0.0586,         # previous: 0.15, 0.0586 (opt)
    reg_lambda=0.2184,        # previous: 0.091, 0.2184 (opt)
    min_child_weight=2,       # previous: 2, 49 (opt)
)

In [None]:
# Estimated AUC from an earlier (unseeded) run: 0.775676
#
# 4-fold cross-validation of the XGBoost configuration in `xgboost_params`.
# For every fold a fresh classifier is fitted on the training part and scored
# with ROC AUC on both the training part and the held-out part; the mean of the
# held-out scores is printed at the end.

# A fixed random_state makes the shuffled fold assignment reproducible, so the
# reported AUC can be compared between runs and between parameter settings.
folds = KFold(n_splits = 4, shuffle = True, random_state = 42)
auc_scores = []

for n_fold, (train_ids, val_ids) in enumerate(folds.split(train)):
    print("#### FOLD NUMBER " + str(n_fold + 1) + " ####")

    # KFold.split yields *positional* row indices, so both the features and the
    # target are selected with .iloc. (Plain [] on a Series is label-based and
    # only matches positions when the index is a default RangeIndex.)
    train_train = train.iloc[train_ids]
    train_val = train.iloc[val_ids]
    target_train = target_df["TARGET"].iloc[train_ids]
    target_val = target_df["TARGET"].iloc[val_ids]

    clf = xgb.XGBClassifier(**xgboost_params)
    clf.fit(train_train, target_train)

    # The second column of predict_proba is used as the score for ROC AUC
    # (assumes TARGET is binary 0/1, making column 1 the positive class).
    train_predictions = clf.predict_proba(train_train)
    val_predictions = clf.predict_proba(train_val)
    auc_train = roc_auc_score(target_train, train_predictions[:,1])
    auc_val = roc_auc_score(target_val, val_predictions[:,1])
    # "Test ROC" here is the score on this fold's held-out validation rows.
    print(f"Train ROC: {auc_train}\nTest ROC: {auc_val}")

    auc_scores.append(auc_val)

print(f"\n\nAverage AUC: {np.mean(auc_scores)}")

In [6]:
# Build the final classifier with the same hyperparameters as the CV loop.
# NOTE(review): despite the "Training" label this timer only wraps construction
# of the unfitted estimator, which is why the recorded duration below is in the
# microsecond range. com.timer presumably prints "<label> -- done in <s> sec"
# when the block exits, judging by the recorded output -- confirm in v11_common.
with com.timer("Training"):
    clf2 = xgb.XGBClassifier(**xgboost_params)

# Fit on the full training table (no hold-out). This is the expensive step:
# the recorded run took roughly 4337 seconds.
with com.timer("Fitting"):
    clf2.fit(train, target_df["TARGET"])

Training -- done in 1.5020370483398438e-05 sec
Fitting -- done in 4337.252877950668 sec


In [7]:
# Score both tables with the final model. Each result is the full predict_proba
# output (one column per class); the column of interest is picked out later.
# Note that `train_predictions` replaces the per-fold array of the same name
# left behind by the cross-validation loop.
train_predictions, data_predictions = (
    clf2.predict_proba(features) for features in (train, test)
)

In [8]:
# Pair each SK_ID_CURR with the second predict_proba column (presumably the
# probability of TARGET == 1 -- assumes a binary 0/1 target) for both tables.
train_preds = pd.DataFrame({"SK_ID_CURR": train_IDs}).assign(
    TARGET=train_predictions[:, 1]
)
submission = pd.DataFrame({"SK_ID_CURR": test_IDs}).assign(
    TARGET=data_predictions[:, 1]
)

# Write both tables as CSV without the index column: in-sample predictions for
# the training rows, and the predictions for the test rows.
for frame, csv_name in (
    (train_preds, "v11_predictions_XGBoost_train.csv"),
    (submission, "v11_predictions_XGBoost.csv"),
):
    frame.to_csv(csv_name, index = False)