# ライブラリの読み込み

In [1]:
# 警告を消す
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
import optuna
import lightgbm as lgb

# データセット

In [2]:
submit = pd.read_csv('submit_sample.csv',header=None)
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# データの確認

In [3]:
train.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,31,services,married,secondary,no,12294,yes,no,cellular,21,nov,101,3,498,0,other,0
1,1,29,entrepreneur,single,tertiary,no,43027,no,no,cellular,22,aug,158,2,702,0,unknown,1
2,2,35,management,married,tertiary,no,12252,yes,no,cellular,11,nov,351,1,826,0,failure,0
3,3,31,technician,married,secondary,no,99121,yes,yes,unknown,16,may,658,2,120,0,failure,0
4,4,48,unemployed,married,primary,no,42005,yes,no,telephone,3,apr,177,1,273,0,unknown,0


In [4]:
# 欠損値の確認
train.isnull().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [5]:
# カテゴリー数の確認
train.nunique()

id           27100
age             42
job             11
marital          3
education        4
default          2
balance      23967
housing          2
loan             2
contact          3
day             30
month           11
duration       140
campaign         5
pdays          872
previous         4
poutcome         4
y                2
dtype: int64

In [6]:
# 特定のカテゴリーにどのくらい値があるのかを確認する。
train['poutcome'].value_counts()

unknown    23099
failure     2717
other        826
success      458
Name: poutcome, dtype: int64

# データの結合　処理

In [7]:
# データの結合（trainとtestデータそれぞれに前処理をかけるため結合しておく。）
data = pd.concat([train, test], sort = False)

In [8]:
# それぞれlightgbmで扱えるようにカテゴリを数値に変換しておく
# 数の少ない変換は以下のように直接変換を行うこともできる。
data['month'].replace(['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'], [1,2,3,4,5,6,7,8,9,10,11,12], inplace=True)
data['housing'].replace(['yes','no'], [1, 0], inplace= True)
data['loan'].replace(['yes','no'], [1, 0], inplace= True)
data['default'].replace(['yes','no'], [1, 0], inplace= True)

In [9]:
# LabelEncoderによるLable化
# 機械学習モデルで扱うために文字列を数値型に変換していく。
cat_features = ['job','marital', 'education', 'contact', 'poutcome']
for col in cat_features:
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col].values)
    
data.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,31,7,1,1,0,12294,1,0,0,21,11,101,3,498,0,1,0.0
1,1,29,2,2,2,0,43027,0,0,0,22,8,158,2,702,0,3,1.0
2,2,35,4,1,2,0,12252,1,0,0,11,11,351,1,826,0,0,0.0
3,3,31,9,1,1,0,99121,1,1,2,16,5,658,2,120,0,0,0.0
4,4,48,10,1,0,0,42005,1,0,1,3,4,177,1,273,0,3,0.0


# 機械学習モデルに渡すためのデータを用意する。

In [10]:
train = data[:len(train)]
test = data[len(train):]

In [11]:
y_train = train['y']
X_train = train.drop(['y','id','pdays'], axis=1)
X_test = test.drop(['y','id','pdays'], axis=1)

In [12]:
# カテゴリー変数の宣言
categorical_features = ['job', 'marital', 'education','housing','loan','contact','month','poutcome']

# パラメーターチューニング

In [13]:
#  optunaを使用した自動パラメーターチューニング

# パラメーターチューニング用のデータをセット
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features, free_raw_data=False)
lgb_test = lgb.Dataset(X_test)

# metric auc、objective binary 固定
params = {
    'metric' :'auc',
    "objective": "binary",
}

# 複数のパラメータの条件で検証を行い最適の（もっとも評価が高くなる）パラメータをbest_paramsとし代入する。
clf = optuna.integration.lightgbm.train(
    params, lgb_train, 
    valid_sets=lgb_test, 
    verbose_eval=10, 
    early_stopping_rounds=5,
 )

best_params = clf.params

feature_fraction, val_score: -inf:   0%|          | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction, val_score: 1.000000:  14%|#4        | 1/7 [00:00<00:01,  5.59it/s][I 2020-08-15 19:40:31,458] Trial 0 finished with value: 1.0 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 1.0.
feature_fraction, val_score: 1.000000:  14%|#4        | 1/7 [00:00<00:01,  5.59it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction, val_score: 1.000000:  14%|#4        | 1/7 [00:00<00:01,  5.59it/s][I 2020-08-15 19:40:31,526] Trial 1 finished with value: 1.0 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 1.0.
feature_fraction, val_score: 1.000000:  29%|##8       | 2/7 [00:00<00:00,  5.59it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction, val_score: 1.000000:  43%|####2     | 3/7 [00:00<00:00,  6.87it/s][I 2020-08-15 19:40:31,593] Trial 2 finished with value: 1.0 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 1.0.
feature_fraction, val_score: 1.000000:  43%|####2     | 3/7 [00:00<00:00,  6.87it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction, val_score: 1.000000:  43%|####2     | 3/7 [00:00<00:00,  6.87it/s][I 2020-08-15 19:40:31,658] Trial 3 finished with value: 1.0 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 1.0.
feature_fraction, val_score: 1.000000:  57%|#####7    | 4/7 [00:00<00:00,  6.87it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction, val_score: 1.000000:  71%|#######1  | 5/7 [00:00<00:00,  8.33it/s][I 2020-08-15 19:40:31,716] Trial 4 finished with value: 1.0 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 1.0.
feature_fraction, val_score: 1.000000:  71%|#######1  | 5/7 [00:00<00:00,  8.33it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction, val_score: 1.000000:  71%|#######1  | 5/7 [00:00<00:00,  8.33it/s][I 2020-08-15 19:40:31,782] Trial 5 finished with value: 1.0 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 1.0.
feature_fraction, val_score: 1.000000:  86%|########5 | 6/7 [00:00<00:00,  8.33it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction, val_score: 1.000000: 100%|##########| 7/7 [00:00<00:00,  9.92it/s][I 2020-08-15 19:40:31,832] Trial 6 finished with value: 1.0 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 1.0.
feature_fraction, val_score: 1.000000: 100%|##########| 7/7 [00:00<00:00, 12.53it/s]
num_leaves, val_score: 1.000000:   0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:   0%|          | 0/20 [00:00<?, ?it/s][I 2020-08-15 19:40:31,918] Trial 7 finished with value: 1.0 and parameters: {'num_leaves': 128}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:   5%|5         | 1/20 [00:00<00:01, 12.38it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  10%|#         | 2/20 [00:00<00:01, 14.37it/s][I 2020-08-15 19:40:31,982] Trial 8 finished with value: 1.0 and parameters: {'num_leaves': 145}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  10%|#         | 2/20 [00:00<00:01, 14.37it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  10%|#         | 2/20 [00:00<00:01, 14.37it/s][I 2020-08-15 19:40:32,069] Trial 9 finished with value: 1.0 and parameters: {'num_leaves': 205}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  15%|#5        | 3/20 [00:00<00:01, 14.37it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  20%|##        | 4/20 [00:00<00:01, 14.02it/s][I 2020-08-15 19:40:32,143] Trial 10 finished with value: 1.0 and parameters: {'num_leaves': 8}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  20%|##        | 4/20 [00:00<00:01, 14.02it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  20%|##        | 4/20 [00:00<00:01, 14.02it/s][I 2020-08-15 19:40:32,208] Trial 11 finished with value: 1.0 and parameters: {'num_leaves': 19}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  25%|##5       | 5/20 [00:00<00:01, 14.02it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  30%|###       | 6/20 [00:00<00:01, 11.20it/s][I 2020-08-15 19:40:32,396] Trial 12 finished with value: 1.0 and parameters: {'num_leaves': 245}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  30%|###       | 6/20 [00:00<00:01, 11.20it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  30%|###       | 6/20 [00:00<00:01, 11.20it/s][I 2020-08-15 19:40:32,463] Trial 13 finished with value: 1.0 and parameters: {'num_leaves': 82}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  35%|###5      | 7/20 [00:00<00:01, 11.20it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  40%|####      | 8/20 [00:00<00:01, 11.67it/s][I 2020-08-15 19:40:32,556] Trial 14 finished with value: 1.0 and parameters: {'num_leaves': 58}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  40%|####      | 8/20 [00:00<00:01, 11.67it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  45%|####5     | 9/20 [00:00<00:01,  9.87it/s][I 2020-08-15 19:40:32,688] Trial 15 finished with value: 1.0 and parameters: {'num_leaves': 252}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  45%|####5     | 9/20 [00:00<00:01,  9.87it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  45%|####5     | 9/20 [00:00<00:01,  9.87it/s][I 2020-08-15 19:40:32,772] Trial 16 finished with value: 1.0 and parameters: {'num_leaves': 178}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  50%|#####     | 10/20 [00:00<00:01,  9.87it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  55%|#####5    | 11/20 [00:01<00:00, 10.52it/s][I 2020-08-15 19:40:32,852] Trial 17 finished with value: 1.0 and parameters: {'num_leaves': 59}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  55%|#####5    | 11/20 [00:01<00:00, 10.52it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  60%|######    | 12/20 [00:01<00:01,  7.64it/s][I 2020-08-15 19:40:33,063] Trial 18 finished with value: 1.0 and parameters: {'num_leaves': 252}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  60%|######    | 12/20 [00:01<00:01,  7.64it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  65%|######5   | 13/20 [00:01<00:01,  5.04it/s][I 2020-08-15 19:40:33,421] Trial 19 finished with value: 1.0 and parameters: {'num_leaves': 185}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  65%|######5   | 13/20 [00:01<00:01,  5.04it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  65%|######5   | 13/20 [00:01<00:01,  5.04it/s][I 2020-08-15 19:40:33,495] Trial 20 finished with value: 1.0 and parameters: {'num_leaves': 75}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  70%|#######   | 14/20 [00:01<00:01,  5.04it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  75%|#######5  | 15/20 [00:01<00:00,  6.03it/s][I 2020-08-15 19:40:33,599] Trial 21 finished with value: 1.0 and parameters: {'num_leaves': 116}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  75%|#######5  | 15/20 [00:01<00:00,  6.03it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  80%|########  | 16/20 [00:01<00:00,  5.75it/s][I 2020-08-15 19:40:33,791] Trial 22 finished with value: 1.0 and parameters: {'num_leaves': 209}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  80%|########  | 16/20 [00:01<00:00,  5.75it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  80%|########  | 16/20 [00:02<00:00,  5.75it/s][I 2020-08-15 19:40:33,857] Trial 23 finished with value: 1.0 and parameters: {'num_leaves': 162}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  85%|########5 | 17/20 [00:02<00:00,  5.75it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  90%|######### | 18/20 [00:02<00:00,  7.10it/s][I 2020-08-15 19:40:33,919] Trial 24 finished with value: 1.0 and parameters: {'num_leaves': 103}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  90%|######### | 18/20 [00:02<00:00,  7.10it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000:  90%|######### | 18/20 [00:02<00:00,  7.10it/s][I 2020-08-15 19:40:34,005] Trial 25 finished with value: 1.0 and parameters: {'num_leaves': 220}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000:  95%|#########5| 19/20 [00:02<00:00,  7.10it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


num_leaves, val_score: 1.000000: 100%|##########| 20/20 [00:02<00:00,  8.14it/s][I 2020-08-15 19:40:34,080] Trial 26 finished with value: 1.0 and parameters: {'num_leaves': 156}. Best is trial 7 with value: 1.0.
num_leaves, val_score: 1.000000: 100%|##########| 20/20 [00:02<00:00,  8.91it/s]
bagging, val_score: 1.000000:   0%|          | 0/10 [00:00<?, ?it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:   0%|          | 0/10 [00:00<?, ?it/s][I 2020-08-15 19:40:34,148] Trial 27 finished with value: 1.0 and parameters: {'bagging_fraction': 0.9516299320888825, 'bagging_freq': 6}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  10%|#         | 1/10 [00:00<00:00, 16.73it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  20%|##        | 2/10 [00:00<00:01,  4.78it/s][I 2020-08-15 19:40:34,512] Trial 28 finished with value: 1.0 and parameters: {'bagging_fraction': 0.4016182708683118, 'bagging_freq': 1}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  20%|##        | 2/10 [00:00<00:01,  4.78it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  20%|##        | 2/10 [00:00<00:01,  4.78it/s][I 2020-08-15 19:40:34,584] Trial 29 finished with value: 1.0 and parameters: {'bagging_fraction': 0.4807605593665921, 'bagging_freq': 1}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  30%|###       | 3/10 [00:00<00:01,  4.78it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  40%|####      | 4/10 [00:00<00:00,  6.07it/s][I 2020-08-15 19:40:34,633] Trial 30 finished with value: 1.0 and parameters: {'bagging_fraction': 0.982463740708008, 'bagging_freq': 7}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  40%|####      | 4/10 [00:00<00:00,  6.07it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  40%|####      | 4/10 [00:00<00:00,  6.07it/s][I 2020-08-15 19:40:34,711] Trial 31 finished with value: 1.0 and parameters: {'bagging_fraction': 0.9343088101425634, 'bagging_freq': 6}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  50%|#####     | 5/10 [00:00<00:00,  6.07it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  60%|######    | 6/10 [00:00<00:00,  7.27it/s][I 2020-08-15 19:40:34,781] Trial 32 finished with value: 1.0 and parameters: {'bagging_fraction': 0.4135703500080807, 'bagging_freq': 1}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  60%|######    | 6/10 [00:00<00:00,  7.27it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  60%|######    | 6/10 [00:00<00:00,  7.27it/s][I 2020-08-15 19:40:34,826] Trial 33 finished with value: 1.0 and parameters: {'bagging_fraction': 0.40290203263445945, 'bagging_freq': 1}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  70%|#######   | 7/10 [00:00<00:00,  7.27it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  70%|#######   | 7/10 [00:00<00:00,  7.27it/s][I 2020-08-15 19:40:34,879] Trial 34 finished with value: 1.0 and parameters: {'bagging_fraction': 0.9496791850277148, 'bagging_freq': 7}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  80%|########  | 8/10 [00:00<00:00,  7.27it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  90%|######### | 9/10 [00:00<00:00,  9.03it/s][I 2020-08-15 19:40:34,926] Trial 35 finished with value: 1.0 and parameters: {'bagging_fraction': 0.7091950317649913, 'bagging_freq': 3}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000:  90%|######### | 9/10 [00:00<00:00,  9.03it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


bagging, val_score: 1.000000:  90%|######### | 9/10 [00:00<00:00,  9.03it/s][I 2020-08-15 19:40:34,983] Trial 36 finished with value: 1.0 and parameters: {'bagging_fraction': 0.7458395843257934, 'bagging_freq': 3}. Best is trial 27 with value: 1.0.
bagging, val_score: 1.000000: 100%|##########| 10/10 [00:00<00:00, 11.17it/s]
feature_fraction_stage2, val_score: 1.000000:   0%|          | 0/3 [00:00<?, ?it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction_stage2, val_score: 1.000000:   0%|          | 0/3 [00:00<?, ?it/s][I 2020-08-15 19:40:35,031] Trial 37 finished with value: 1.0 and parameters: {'feature_fraction': 0.92}. Best is trial 37 with value: 1.0.
feature_fraction_stage2, val_score: 1.000000:  33%|###3      | 1/3 [00:00<00:00, 20.37it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction_stage2, val_score: 1.000000:  33%|###3      | 1/3 [00:00<00:00, 12.61it/s][I 2020-08-15 19:40:35,071] Trial 38 finished with value: 1.0 and parameters: {'feature_fraction': 0.9840000000000001}. Best is trial 37 with value: 1.0.
feature_fraction_stage2, val_score: 1.000000:  67%|######6   | 2/3 [00:00<00:00, 22.25it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


feature_fraction_stage2, val_score: 1.000000: 100%|##########| 3/3 [00:00<00:00, 24.76it/s][I 2020-08-15 19:40:35,113] Trial 39 finished with value: 1.0 and parameters: {'feature_fraction': 0.9520000000000001}. Best is trial 37 with value: 1.0.
feature_fraction_stage2, val_score: 1.000000: 100%|##########| 3/3 [00:00<00:00, 23.39it/s]
regularization_factors, val_score: 1.000000:   0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:   0%|          | 0/20 [00:00<?, ?it/s][I 2020-08-15 19:40:35,167] Trial 40 finished with value: 1.0 and parameters: {'lambda_l1': 0.0033052343242539225, 'lambda_l2': 0.16992798732794764}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:   5%|5         | 1/20 [00:00<00:00, 20.75it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  10%|#         | 2/20 [00:00<00:00, 19.16it/s][I 2020-08-15 19:40:35,230] Trial 41 finished with value: 1.0 and parameters: {'lambda_l1': 2.0556908585728797e-07, 'lambda_l2': 8.664119547615005e-08}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  10%|#         | 2/20 [00:00<00:00, 19.16it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  10%|#         | 2/20 [00:00<00:00, 19.16it/s][I 2020-08-15 19:40:35,288] Trial 42 finished with value: 1.0 and parameters: {'lambda_l1': 3.7962196209741808, 'lambda_l2': 3.54759118972974e-08}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  15%|#5        | 3/20 [00:00<00:00, 19.16it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  20%|##        | 4/20 [00:00<00:00, 17.96it/s][I 2020-08-15 19:40:35,357] Trial 43 finished with value: 1.0 and parameters: {'lambda_l1': 5.15819991786514e-08, 'lambda_l2': 4.022630525162832}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  20%|##        | 4/20 [00:00<00:00, 17.96it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  20%|##        | 4/20 [00:00<00:00, 17.96it/s][I 2020-08-15 19:40:35,424] Trial 44 finished with value: 1.0 and parameters: {'lambda_l1': 4.479417747059165, 'lambda_l2': 0.00020317045562353157}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  25%|##5       | 5/20 [00:00<00:00, 17.96it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  30%|###       | 6/20 [00:00<00:00, 15.45it/s][I 2020-08-15 19:40:35,531] Trial 45 finished with value: 1.0 and parameters: {'lambda_l1': 0.00020708937237439654, 'lambda_l2': 0.0001464765268199814}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  30%|###       | 6/20 [00:00<00:00, 15.45it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  30%|###       | 6/20 [00:00<00:00, 15.45it/s][I 2020-08-15 19:40:35,571] Trial 46 finished with value: 1.0 and parameters: {'lambda_l1': 0.00023569369340554865, 'lambda_l2': 0.039898246044120726}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  35%|###5      | 7/20 [00:00<00:00, 15.45it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  35%|###5      | 7/20 [00:00<00:00, 15.45it/s][I 2020-08-15 19:40:35,612] Trial 47 finished with value: 1.0 and parameters: {'lambda_l1': 0.03883752116417805, 'lambda_l2': 1.0166241129425852e-06}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  40%|####      | 8/20 [00:00<00:00, 15.45it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  45%|####5     | 9/20 [00:00<00:00, 17.00it/s][I 2020-08-15 19:40:35,664] Trial 48 finished with value: 1.0 and parameters: {'lambda_l1': 2.352497596777851e-06, 'lambda_l2': 1.457907071278143e-05}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  45%|####5     | 9/20 [00:00<00:00, 17.00it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  45%|####5     | 9/20 [00:00<00:00, 17.00it/s][I 2020-08-15 19:40:35,703] Trial 49 finished with value: 1.0 and parameters: {'lambda_l1': 6.149289714040382e-06, 'lambda_l2': 0.0158418331617829}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  50%|#####     | 10/20 [00:00<00:00, 17.00it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  50%|#####     | 10/20 [00:00<00:00, 17.00it/s][I 2020-08-15 19:40:35,755] Trial 50 finished with value: 1.0 and parameters: {'lambda_l1': 0.05494550533652858, 'lambda_l2': 3.280373945232189}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  55%|#####5    | 11/20 [00:00<00:00, 17.00it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  60%|######    | 12/20 [00:00<00:00, 17.33it/s][I 2020-08-15 19:40:35,829] Trial 51 finished with value: 1.0 and parameters: {'lambda_l1': 0.20605731329891008, 'lambda_l2': 1.1292378977687583e-08}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  60%|######    | 12/20 [00:00<00:00, 17.33it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  60%|######    | 12/20 [00:00<00:00, 17.33it/s][I 2020-08-15 19:40:35,927] Trial 52 finished with value: 1.0 and parameters: {'lambda_l1': 1.2233339186174718e-05, 'lambda_l2': 0.00250629837473373}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  65%|######5   | 13/20 [00:00<00:00, 17.33it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  70%|#######   | 14/20 [00:00<00:00, 15.63it/s][I 2020-08-15 19:40:35,987] Trial 53 finished with value: 1.0 and parameters: {'lambda_l1': 1.1390953782060565e-08, 'lambda_l2': 1.926745146477861e-06}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  70%|#######   | 14/20 [00:00<00:00, 15.63it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  70%|#######   | 14/20 [00:00<00:00, 15.63it/s][I 2020-08-15 19:40:36,046] Trial 54 finished with value: 1.0 and parameters: {'lambda_l1': 0.0027577163384101423, 'lambda_l2': 0.5269863292396315}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  75%|#######5  | 15/20 [00:00<00:00, 15.63it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  80%|########  | 16/20 [00:01<00:00, 13.95it/s][I 2020-08-15 19:40:36,170] Trial 55 finished with value: 1.0 and parameters: {'lambda_l1': 9.691828899493723, 'lambda_l2': 0.0016157790458219599}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  80%|########  | 16/20 [00:01<00:00, 13.95it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  80%|########  | 16/20 [00:01<00:00, 13.95it/s][I 2020-08-15 19:40:36,289] Trial 56 finished with value: 1.0 and parameters: {'lambda_l1': 3.841921137265148e-07, 'lambda_l2': 7.651157813390038e-06}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  85%|########5 | 17/20 [00:01<00:00, 13.95it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  90%|######### | 18/20 [00:01<00:00, 11.78it/s][I 2020-08-15 19:40:36,401] Trial 57 finished with value: 1.0 and parameters: {'lambda_l1': 4.050737398311968e-05, 'lambda_l2': 2.184976715118033e-07}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  90%|######### | 18/20 [00:01<00:00, 11.78it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000:  90%|######### | 18/20 [00:01<00:00, 11.78it/s][I 2020-08-15 19:40:36,519] Trial 58 finished with value: 1.0 and parameters: {'lambda_l1': 0.3779535426187674, 'lambda_l2': 1.4491606959945699e-05}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000:  95%|#########5| 19/20 [00:01<00:00, 11.78it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


regularization_factors, val_score: 1.000000: 100%|##########| 20/20 [00:01<00:00, 10.74it/s][I 2020-08-15 19:40:36,623] Trial 59 finished with value: 1.0 and parameters: {'lambda_l1': 0.003051806456530015, 'lambda_l2': 0.0021029490515716673}. Best is trial 40 with value: 1.0.
regularization_factors, val_score: 1.000000: 100%|##########| 20/20 [00:01<00:00, 13.28it/s]
min_data_in_leaf, val_score: 1.000000:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


min_data_in_leaf, val_score: 1.000000:  20%|##        | 1/5 [00:00<00:00,  5.39it/s][I 2020-08-15 19:40:36,828] Trial 60 finished with value: 1.0 and parameters: {'min_child_samples': 100}. Best is trial 60 with value: 1.0.
min_data_in_leaf, val_score: 1.000000:  20%|##        | 1/5 [00:00<00:00,  5.39it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


min_data_in_leaf, val_score: 1.000000:  40%|####      | 2/5 [00:00<00:00,  3.89it/s][I 2020-08-15 19:40:37,249] Trial 61 finished with value: 1.0 and parameters: {'min_child_samples': 50}. Best is trial 60 with value: 1.0.
min_data_in_leaf, val_score: 1.000000:  40%|####      | 2/5 [00:00<00:00,  3.89it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


min_data_in_leaf, val_score: 1.000000:  60%|######    | 3/5 [00:00<00:00,  4.27it/s][I 2020-08-15 19:40:37,437] Trial 62 finished with value: 1.0 and parameters: {'min_child_samples': 5}. Best is trial 60 with value: 1.0.
min_data_in_leaf, val_score: 1.000000:  60%|######    | 3/5 [00:00<00:00,  4.27it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


min_data_in_leaf, val_score: 1.000000:  80%|########  | 4/5 [00:00<00:00,  4.42it/s][I 2020-08-15 19:40:37,638] Trial 63 finished with value: 1.0 and parameters: {'min_child_samples': 25}. Best is trial 60 with value: 1.0.
min_data_in_leaf, val_score: 1.000000:  80%|########  | 4/5 [00:01<00:00,  4.42it/s]

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1


min_data_in_leaf, val_score: 1.000000:  80%|########  | 4/5 [00:01<00:00,  4.42it/s][I 2020-08-15 19:40:37,732] Trial 64 finished with value: 1.0 and parameters: {'min_child_samples': 10}. Best is trial 60 with value: 1.0.
min_data_in_leaf, val_score: 1.000000: 100%|##########| 5/5 [00:01<00:00,  4.55it/s]


In [14]:
best_params

{'metric': 'auc',
 'objective': 'binary',
 'lambda_l1': 0.0,
 'lambda_l2': 0.0,
 'num_leaves': 31,
 'feature_fraction': 1.0,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20}

# バリデーション

In [15]:
# 関数を定義
# 評価指標はROC
def get_evaluate(y_test, predict):
    fpr, tpr, thr_arr = metrics.roc_curve(y_test, predict)
    accuracy = metrics.auc(fpr, tpr)
    roc = metrics.roc_auc_score(y_test, predict)      
    return roc

In [16]:
# KFoldクラスを用いて分割 
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
roc_list = []

for train_index, test_index in cv.split(X_train, y_train):
    X_tr = X_train.iloc[train_index]
    y_tr = y_train.iloc[train_index]
    X_va = X_train.iloc[test_index]
    y_va = y_train.iloc[test_index]
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_test = lgb.Dataset(X_test)
    
#     パラメーターチューニングで作成したbest_paramsを利用する。
    clf = lgb.train(best_params, lgb_train,100)
    y_predict = clf.predict(X_va, num_iteration=clf.best_iteration)
    roc= get_evaluate(y_va, y_predict)
    print('roc:{}'.format(roc))
    roc_list.append(roc)
print('Kfold平均 roc:{}'.format(np.mean(roc_list)))
y_predict

roc:0.8515933102434623
roc:0.8514929590445838
roc:0.8380754580380401
roc:0.8451371809983944
Kfold平均 roc:0.8465747270811201


array([0.00697243, 0.00103702, 0.06979217, ..., 0.03946388, 0.14313303,
       0.01468361])

# lightgbmによる予測モデル作成

In [17]:
# train_test_splitによるデータ分割
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

params = {
    'metric' :'auc',
    'objective': 'binary'
}

model = lgb.train(
    best_params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

y_pred = model.predict(X_test, num_iteration=model.best_iteration)


Training until validation scores don't improve for 10 rounds
[10]	training's auc: 0.855182	valid_1's auc: 0.820093
[20]	training's auc: 0.882354	valid_1's auc: 0.837667
[30]	training's auc: 0.898999	valid_1's auc: 0.844043
[40]	training's auc: 0.912523	valid_1's auc: 0.846599
[50]	training's auc: 0.923456	valid_1's auc: 0.845881
Early stopping, best iteration is:
[42]	training's auc: 0.914538	valid_1's auc: 0.847076


# 予測値を提出する

In [18]:
submit = submit.drop(1, axis=1)
submit['1'] = y_pred

In [19]:
submit

Unnamed: 0,0,1
0,0,0.742985
1,1,0.067566
2,2,0.028508
3,3,0.003566
4,4,0.058677
...,...,...
18045,18045,0.020304
18046,18046,0.006221
18047,18047,0.067945
18048,18048,0.007265
