In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [2]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

plt.style.use('seaborn')
sns.set(font_scale=1)

In [3]:
random_state = 101
np.random.seed(random_state)
df_train = pd.read_csv('Data/train.csv')
df_test = pd.read_csv('Data/test.csv')

In [4]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [5]:
lgb_params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    "bagging_seed" : random_state,
    "verbosity" : 1,
    "seed": random_state
}

In [6]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
oof = df_train[['ID_code', 'target']]
oof['predict'] = 0
predictions = df_test[['ID_code']]
val_aucs = []
feature_importance_df = pd.DataFrame()

In [7]:
features = [col for col in df_train.columns if col not in ['target', 'ID_code']]
X_test = df_test[features].values

In [8]:
%%time
for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train, df_train['target'])):
    X_train, y_train = df_train.iloc[trn_idx][features], df_train.iloc[trn_idx]['target']
    X_valid, y_valid = df_train.iloc[val_idx][features], df_train.iloc[val_idx]['target']
    
    N = 5
    p_valid,yp = 0,0
    for i in range(N):
        X_t, y_t = augment(X_train.values, y_train.values)
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        100000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=3000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = lgb_clf.feature_importance()
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    oof['predict'][val_idx] = p_valid/N
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    predictions['fold{}'.format(fold+1)] = yp/N

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.891017	valid_1's auc: 0.883438
[2000]	training's auc: 0.901013	valid_1's auc: 0.891943
[3000]	training's auc: 0.907665	valid_1's auc: 0.896348
[4000]	training's auc: 0.912381	valid_1's auc: 0.899258
[5000]	training's auc: 0.916078	valid_1's auc: 0.90117
[6000]	training's auc: 0.919082	valid_1's auc: 0.90222
[7000]	training's auc: 0.921726	valid_1's auc: 0.902892
[8000]	training's auc: 0.924129	valid_1's auc: 0.903278
[9000]	training's auc: 0.926324	valid_1's auc: 0.903397
[10000]	training's auc: 0.928472	valid_1's auc: 0.90349
[11000]	training's auc: 0.930566	valid_1's auc: 0.903569
[12000]	training's auc: 0.932631	valid_1's auc: 0.903473
[13000]	training's auc: 0.934661	valid_1's auc: 0.903463
Early stopping, best iteration is:
[10913]	training's auc: 0.930393	valid_1's auc: 0.903591
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.89122	valid_1's auc: 0.883

[9000]	training's auc: 0.927263	valid_1's auc: 0.90023
[10000]	training's auc: 0.929437	valid_1's auc: 0.900475
[11000]	training's auc: 0.931506	valid_1's auc: 0.900541
[12000]	training's auc: 0.93356	valid_1's auc: 0.900527
[13000]	training's auc: 0.935538	valid_1's auc: 0.900542
[14000]	training's auc: 0.937519	valid_1's auc: 0.900595
[15000]	training's auc: 0.939448	valid_1's auc: 0.900636
[16000]	training's auc: 0.941361	valid_1's auc: 0.900492
[17000]	training's auc: 0.943193	valid_1's auc: 0.900401
[18000]	training's auc: 0.944999	valid_1's auc: 0.900281
Early stopping, best iteration is:
[15024]	training's auc: 0.939492	valid_1's auc: 0.900648
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.892318	valid_1's auc: 0.878885
[2000]	training's auc: 0.902146	valid_1's auc: 0.887245
[3000]	training's auc: 0.908448	valid_1's auc: 0.891886
[4000]	training's auc: 0.913091	valid_1's auc: 0.895002
[5000]	training's auc: 0.916758	valid_1's auc: 0.8969

[7000]	training's auc: 0.921516	valid_1's auc: 0.903624
[8000]	training's auc: 0.923898	valid_1's auc: 0.904062
[9000]	training's auc: 0.926136	valid_1's auc: 0.904303
[10000]	training's auc: 0.928315	valid_1's auc: 0.904504
[11000]	training's auc: 0.930435	valid_1's auc: 0.904603
[12000]	training's auc: 0.932514	valid_1's auc: 0.904461
[13000]	training's auc: 0.934552	valid_1's auc: 0.904351
Early stopping, best iteration is:
[10871]	training's auc: 0.930156	valid_1's auc: 0.904613
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.891379	valid_1's auc: 0.883739
[2000]	training's auc: 0.901307	valid_1's auc: 0.891989
[3000]	training's auc: 0.907742	valid_1's auc: 0.896705
[4000]	training's auc: 0.912455	valid_1's auc: 0.89979
[5000]	training's auc: 0.916085	valid_1's auc: 0.901814
[6000]	training's auc: 0.919071	valid_1's auc: 0.902869
[7000]	training's auc: 0.921632	valid_1's auc: 0.903408
[8000]	training's auc: 0.923996	valid_1's auc: 0.903836
[

In [9]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
all_auc = roc_auc_score(oof['target'], oof['predict'])
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

Mean auc: 0.901621900, std: 0.002838427. All auc: 0.901584160.


In [None]:
# submission
predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('lgb_all_predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']
sub_df.to_csv("lgb_submission.csv", index=False)
oof.to_csv('lgb_oof.csv', index=False)