# Imports and setup

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
from tqdm import tqdm_notebook as tqdm
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.metrics import roc_auc_score
# Silence every warning category for the rest of the session and make sure
# the cyclic garbage collector is switched on.
warnings.simplefilter("ignore")
gc.enable()

In [2]:
# Raise pandas' display limits so wide/long frames are shown without truncation.
# Fully-qualified option names are used on purpose: the bare patterns
# 'max_rows' / 'max_columns' also match the 'styler.render.*' options in
# recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

# Load Data

In [3]:
# Read the raw train and test tables from disk; the trailing expression
# displays both shapes as the cell output.
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)
(train_raw.shape, test_raw.shape)

((200000, 202), (200000, 201))

In [5]:
# Work on copies so the frames loaded from disk stay untouched.
train, test = train_raw.copy(), test_raw.copy()

In [6]:
# Raw feature columns: everything except the identifier and the label.
# Selected by name rather than by position so the result does not depend on
# 'ID_code' and 'target' being the first two columns of the CSV.
col_list = train.columns.drop(['ID_code', 'target'])

In [7]:
# Split the training rows by label value (0 and 1).
train_0, train_1 = (train[train.target == label] for label in (0, 1))

In [8]:
# Load the two saved index arrays; they are used below as positional
# (iloc) row selectors into the test frame.
# NOTE(review): presumably the public / private leaderboard rows - confirm.
pb_idx, pv_idx = map(
    np.load,
    ('./data_temp/public_LB.npy', './data_temp/private_LB.npy'),
)

In [9]:
# Pick the two test subsets by position and restore their original row order.
test_pb = test.iloc[pb_idx].sort_index().copy()
test_pv = test.iloc[pv_idx].sort_index().copy()

# Stack both subsets (first test_pb, then test_pv). pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# row labels are kept exactly as append kept them.
test_real = pd.concat([test_pb, test_pv])

In [10]:
# Combined frame: training rows followed by the selected test rows. Test rows
# end up with NaN in 'target', which is what the later train/test split keys on.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); the original row
# labels are preserved, so the index contains repeated labels just as before.
data = pd.concat([train, test_real])

In [11]:
# Table with one prediction per ID_code; the cells below read its
# 'ID_code', 'target' and 'pred' columns.
# NOTE(review): presumably out-of-fold predictions from an earlier model - confirm.
oof = pd.read_csv(filepath_or_buffer='./data_temp/new_bbiggu.csv')

In [12]:
# Partition the prediction table by true label and by which side of 0.1 the
# prediction falls on. Both comparisons are strict, so a row whose prediction
# is exactly 0.1 belongs to none of the four groups.
is_label_0 = oof.target == 0
is_label_1 = oof.target == 1
pred_above = oof.pred > 0.1
pred_below = oof.pred < 0.1

bbiggu_0 = oof[is_label_0 & pred_above]   # label 0, predicted above 0.1
bbiggu_1 = oof[is_label_1 & pred_below]   # label 1, predicted below 0.1
normal_0 = oof[is_label_0 & pred_below]   # label 0, predicted below 0.1
normal_1 = oof[is_label_1 & pred_above]   # label 1, predicted above 0.1

In [13]:
# Swap each group of prediction rows for the matching rows of `train`
# (matched on ID_code). The group names are rebound to the train subsets.
bbiggu_0, bbiggu_1, normal_0, normal_1 = (
    train[train.ID_code.isin(group.ID_code)]
    for group in (bbiggu_0, bbiggu_1, normal_0, normal_1)
)

In [14]:
# Keep just the ID_code series of every group.
bbiggu_0_id_code, bbiggu_1_id_code = bbiggu_0['ID_code'], bbiggu_1['ID_code']
normal_0_id_code, normal_1_id_code = normal_0['ID_code'], normal_1['ID_code']

In [16]:
# Two helper frames, initially holding only ID_code, that receive one
# indicator column per raw feature in the next cell.
# .copy() makes them independent of `data`, so the column assignments that
# follow are unambiguous writes and do not trigger SettingWithCopyWarning.
unique_df = data[['ID_code']].copy()
con_df = data[['ID_code']].copy()

In [17]:
# For every raw feature, flag each row of `data` (train + selected test rows):
#   unique_df[col] == 1  -> the row's value occurs exactly once in the column
#   con_df[col]    == 1  -> the row's value occurs more than once
for col in tqdm(col_list):
    # Count the distinct values once per column and reuse the result for
    # both flags (the counts were previously computed twice).
    occurs_once = data[col].value_counts() == 1
    unique_df[col] = data[col].map((occurs_once * 1).to_dict())
    con_df[col] = data[col].map((~occurs_once * 1).to_dict())

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [18]:
# Build two derived features per raw column by multiplying the value with its
# 0/1 flag: '<col>_unique' keeps values that occur once, '<col>_con' keeps
# values that repeat; the other rows become 0. Results are rounded to 4 decimals.
for col in tqdm(col_list):
    raw_values = data[col]
    data[col + '_unique'] = (raw_values * unique_df[col]).round(4)
    data[col + '_con'] = (raw_values * con_df[col]).round(4)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [19]:
# '<col>_con_multi_counts': the repeated-value feature scaled by how many
# times the raw value occurs in the column.
for col in tqdm(col_list):
    occurrences = data[col].map(data[col].value_counts().to_dict())
    data[col + '_con_multi_counts'] = data[col + '_con'] * occurrences

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [21]:
# Replace the 0 placeholder in '<col>_unique' with NaN on rows whose value is
# not unique. The mask is taken from the uniqueness flag instead of comparing
# the feature to 0, so a genuinely unique value that is (or rounds to) 0 is
# kept rather than being blanked out together with the placeholders.
for col in tqdm(col_list):
    # .values -> plain boolean array, so no index alignment is attempted on
    # the repeated row labels of `data`.
    not_unique = (unique_df[col] == 0).values
    data.loc[not_unique, col + '_unique'] = np.nan

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [22]:
# Separate the combined frame again: rows with a label are training rows,
# rows whose target is NaN are test rows.
has_label = data.target.notna()
train = data[has_label]
test = data[~has_label]

In [23]:
# Label series for the training rows.
target = train.target

In [24]:
# LightGBM training parameters passed to lgb.train below.
# 'num_threads' used to appear twice in this literal (same value both times);
# the redundant entry is removed so each key is defined exactly once.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average': False,
    'boost': 'gbdt',
    'feature_fraction_seed': 47,
    'feature_fraction': 0.041,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': -1,
}

In [25]:
# 5-fold stratified cross-validation with LightGBM.
#   oof_lgb            - out-of-fold prediction for every training row
#   predictions_lgb    - test prediction averaged over the folds
#   feature_importance - per-fold feature importances, stacked into one frame
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

train_columns = [c for c in train.columns if c not in ['ID_code', 'target']]

# Loop invariants, hoisted out of the fold loop.
num_round = 50000
test_features = test[train_columns]

# Per-fold importance frames are collected here and concatenated once after
# the loop instead of re-copying a growing DataFrame on every fold.
fold_importances = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target.values)):
    print("fold n°{}".format(fold_))
    val_features = train.iloc[val_idx][train_columns]
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])

    # NOTE: verbose_eval / early_stopping_rounds are lgb.train keyword arguments
    # in LightGBM < 4.0; newer releases expect the lgb.log_evaluation and
    # lgb.early_stopping callbacks instead.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=400, early_stopping_rounds = 200)

    # Score the held-out fold and accumulate this fold's share of the test prediction.
    oof_lgb[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(test_features, num_iteration=clf.best_iteration) / folds.n_splits

    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    fold_importances.append(fold_importance)

    # AUC on this fold's validation rows only.
    print("CV score: {:<8.5f}".format(roc_auc_score(target.values[val_idx], oof_lgb[val_idx])))

feature_importance = pd.concat(fold_importances, axis=0)

# AUC over all out-of-fold predictions.
print("CV score: {:<8.5f}".format(roc_auc_score(target.values, oof_lgb)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[400]	training's auc: 0.88415	valid_1's auc: 0.868142
[800]	training's auc: 0.897615	valid_1's auc: 0.880292
[1200]	training's auc: 0.906373	valid_1's auc: 0.888298
[1600]	training's auc: 0.913601	valid_1's auc: 0.893954
[2000]	training's auc: 0.919348	valid_1's auc: 0.89799
[2400]	training's auc: 0.924386	valid_1's auc: 0.901487
[2800]	training's auc: 0.928494	valid_1's auc: 0.904025
[3200]	training's auc: 0.932245	valid_1's auc: 0.905864
[3600]	training's auc: 0.935545	valid_1's auc: 0.907672
[4000]	training's auc: 0.938574	valid_1's auc: 0.908918
[4400]	training's auc: 0.941256	valid_1's auc: 0.910272
[4800]	training's auc: 0.943797	valid_1's auc: 0.911183
[5200]	training's auc: 0.946096	valid_1's auc: 0.911927
[5600]	training's auc: 0.948267	valid_1's auc: 0.912675
[6000]	training's auc: 0.950386	valid_1's auc: 0.913173
[6400]	training's auc: 0.952388	valid_1's auc: 0.913627
[6800]	training's auc: 0.954195	vali

KeyboardInterrupt: 