# 사전 작업

## 모듈 로드

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc

In [8]:
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

In [3]:
pd.set_option('display.max_columns', 400)

# 데이터 로드

In [5]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
train.shape, test.shape

((200000, 202), (200000, 201))

In [9]:
train, test_cv = train_test_split(train, test_size=0.1)
train.shape, test.shape

((180000, 202), (200000, 201))

In [10]:
features = [c for c in train.columns if c not in ['ID_code', 'target']]
target = train['target']

# LGB

In [11]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average': False,
    'boost': 'gbdt',
    'feature_fraction_seed': 47,
    'feature_fraction': 0.041,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    'num_threads': 8
}

In [12]:
%%time
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

train_columns = [c for c in train.columns if c not in ['ID_code', 'target']]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target.values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])

    num_round = 60000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=3000, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
    print("CV score: {:<8.5f}".format(roc_auc_score(target.values[val_idx], oof_lgb[val_idx])))
    
print("CV score: {:<8.5f}".format(roc_auc_score(target.values, oof_lgb)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[3000]	training's auc: 0.924493	valid_1's auc: 0.894003
[6000]	training's auc: 0.93943	valid_1's auc: 0.896886
Early stopping, best iteration is:
[6093]	training's auc: 0.93984	valid_1's auc: 0.896934
CV score: 0.89693 
fold n°1
Training until validation scores don't improve for 200 rounds.
[3000]	training's auc: 0.923886	valid_1's auc: 0.893228
[6000]	training's auc: 0.93905	valid_1's auc: 0.897046
Early stopping, best iteration is:
[6044]	training's auc: 0.939254	valid_1's auc: 0.897081
CV score: 0.89708 
fold n°2
Training until validation scores don't improve for 200 rounds.
[3000]	training's auc: 0.92265	valid_1's auc: 0.901253
[6000]	training's auc: 0.938138	valid_1's auc: 0.904668
Early stopping, best iteration is:
[6182]	training's auc: 0.938918	valid_1's auc: 0.904716
CV score: 0.90472 
fold n°3
Training until validation scores don't improve for 200 rounds.
[3000]	training's auc: 0.922817	valid_1's auc: 0.9

In [13]:
temp = train.target.to_frame()
temp['predict'] = oof_lgb
temp = temp.loc[(temp.predict <= 0.1)].reset_index()
temp['new_target']  = 0
temp.loc[temp.target == 1, 'new_target'] = 1

In [14]:
train = temp[['index', 'new_target']].merge(train.reset_index(), on='index', how='left').drop(['index', 'ID_code', 'target'], axis=1)

In [15]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average': False,
    'boost': 'gbdt',
    'feature_fraction_seed': 47,
    'feature_fraction': 0.041,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric':'binary_logloss',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    'num_threads': 8
}

In [16]:
target = train['new_target']

In [17]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(train))
# predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

train_columns = [c for c in train.columns if c not in ['ID_code', 'target', 'new_target']]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target.values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])

    num_round = 60000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=3000, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    
#     predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
    print("CV score: {:<8.5f}".format(roc_auc_score(target.values[val_idx], oof_lgb[val_idx])))
    
print("CV score: {:<8.5f}".format(roc_auc_score(target.values, oof_lgb)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[3000]	training's binary_logloss: 0.0815627	valid_1's binary_logloss: 0.104835
Early stopping, best iteration is:
[4049]	training's binary_logloss: 0.0737085	valid_1's binary_logloss: 0.104428
CV score: 0.67189 
fold n°1
Training until validation scores don't improve for 200 rounds.
[3000]	training's binary_logloss: 0.0816445	valid_1's binary_logloss: 0.105261
Early stopping, best iteration is:
[4190]	training's binary_logloss: 0.0727463	valid_1's binary_logloss: 0.10483
CV score: 0.66393 
fold n°2
Training until validation scores don't improve for 200 rounds.
[3000]	training's binary_logloss: 0.0816934	valid_1's binary_logloss: 0.104573
Early stopping, best iteration is:
[5786]	training's binary_logloss: 0.0628464	valid_1's binary_logloss: 0.103502
CV score: 0.69158 
fold n°3
Training until validation scores don't improve for 200 rounds.
[3000]	training's binary_logloss: 0.0816808	valid_1's binary_logloss: 0.10475

In [21]:
temp = train.new_target.to_frame()

In [22]:
temp['pred'] = oof_lgb

In [25]:
temp[temp.new_target == 1]

Unnamed: 0,new_target,pred
11,1,0.083312
14,1,0.056873
43,1,0.035674
97,1,0.034539
107,1,0.018266
145,1,0.027268
154,1,0.031506
166,1,0.030995
190,1,0.008879
229,1,0.012091


In [18]:
oof_lgb

array([0.01285009, 0.00725737, 0.04558558, ..., 0.02580201, 0.01274669,
       0.01937167])