# Data Load & Simple Preprocessing

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
path = '/content/gdrive/MyDrive/dacon/jobcare/'
import os
os.listdir(path)

['prediction_1224.csv',
 '참가자_제공_레이아웃.pdf',
 '속성_D_코드.csv',
 'sample_submission.csv',
 '속성_H_코드.csv',
 '속성_L_코드.csv',
 'test.csv',
 'train.csv',
 'prediction_0103.csv',
 'prediction_0104.csv',
 'prediction_0112.csv',
 'prediction_0112_1.csv',
 'prediction.csv',
 'p_rediction.csv']

## Library & Data Load

In [5]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.3 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [6]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=88a210c4ddb41d95b28ca6ab6eff6690ce6407091cdecc55aac310e51778aa9e
  Stored in directory: /root/.cache/pip/wheels/fd/9b/71/f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [None]:
!pip install optuna

In [43]:
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bayes_opt import BayesianOptimization
from catboost import Pool, CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.cluster import KMeans
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

warnings.filterwarnings(action='ignore')

In [8]:
# Competition tables, read from the Drive folder held in `path`.
train = pd.read_csv(path + "train.csv")
X_test = pd.read_csv(path + "test.csv")
submission = pd.read_csv(path + "sample_submission.csv")

# Code lookup tables for attributes D, H and L; the first CSV column becomes the index.
d_code = pd.read_csv(path + '속성_D_코드.csv', index_col=0)
h_code = pd.read_csv(path + '속성_H_코드.csv', index_col=0)
l_code = pd.read_csv(path + '속성_L_코드.csv', index_col=0)

In [9]:
encoder = LabelEncoder()

In [10]:
# Label-encode every column of the D, H and L code tables in place.
# Each column is stringified first and the shared `encoder` is re-fitted per column.
for code_table in (d_code, h_code, l_code):
    for col_pos in range(code_table.shape[1]):
        as_text = code_table.astype('str').iloc[:, col_pos]
        code_table.iloc[:, col_pos] = encoder.fit_transform(as_text)

In [11]:
# Rebind each code table to a nested dict (transpose, then to_dict):
# {table index value: {column name: encoded value}}, which add_code() indexes as table[code][column].
d_code = d_code.T.to_dict()
h_code = h_code.T.to_dict()
l_code = l_code.T.to_dict()

분류코드에 따라 아래와 같이 구분자를 붙여 구분합니다.
- n: 세분류코드 
- s: 소분류코드   
- m: 중분류코드 
- l: 대분류코드 
- u: 상위코드

In [12]:
def add_code(df, d_code, h_code, l_code):
    """Return a copy of ``df`` with hierarchy-code columns looked up from the code tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``person_prefer_d_1..3``, ``contents_attribute_d``,
        ``person_prefer_h_1..3``, ``contents_attribute_h`` and
        ``contents_attribute_l`` columns.
    d_code, h_code, l_code : dict
        Nested lookups of the form ``{code: {table column name: value}}``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the derived columns
        appended. Suffixes are ``_n``, ``_s``, ``_m`` and ``_l``; the H
        attribute only gets ``_m`` and ``_l``.

    Raises
    ------
    KeyError
        If a code in ``df`` is missing from the corresponding lookup.
    """
    out = df.copy()

    def _lookup(source_col, table, field):
        # Row-wise dictionary lookup; a missing code raises KeyError.
        return out[source_col].apply(lambda code: table[code][field])

    # D code: all four levels for each source column, one source column at a time.
    d_levels = (
        ('n', '속성 D 세분류코드'),
        ('s', '속성 D 소분류코드'),
        ('m', '속성 D 중분류코드'),
        ('l', '속성 D 대분류코드'),
    )
    d_sources = ('person_prefer_d_1', 'person_prefer_d_2',
                 'person_prefer_d_3', 'contents_attribute_d')
    for source_col in d_sources:
        for suffix, field in d_levels:
            out[f'{source_col}_{suffix}'] = _lookup(source_col, d_code, field)

    # H code: the level loop is the outer one, so all `_m` columns are appended
    # before any `_l` column.
    h_levels = (
        ('m', '속성 H 중분류코드'),
        ('l', '속성 H 대분류코드'),
    )
    h_sources = ('person_prefer_h_1', 'person_prefer_h_2',
                 'person_prefer_h_3', 'contents_attribute_h')
    for suffix, field in h_levels:
        for source_col in h_sources:
            out[f'{source_col}_{suffix}'] = _lookup(source_col, h_code, field)

    # L code: a single source column with four levels.
    l_levels = (
        ('n', '속성 L 세분류코드'),
        ('s', '속성 L 소분류코드'),
        ('m', '속성 L 중분류코드'),
        ('l', '속성 L 대분류코드'),
    )
    for suffix, field in l_levels:
        out[f'contents_attribute_l_{suffix}'] = _lookup('contents_attribute_l', l_code, field)

    return out

train = add_code(train, d_code, h_code, l_code)
X_test = add_code(X_test, d_code, h_code, l_code)
print("train_data.shape: ", train.shape)
print("test_data.shape: ", X_test.shape)

train_data.shape:  (501951, 63)
test_data.shape:  (46404, 62)


In [13]:
data = [train, X_test]

# Expand the content-open timestamp into calendar features on both frames,
# then drop the raw timestamp and the row id. The frames are modified in place.
for frame in data:
    frame.contents_open_dt = pd.to_datetime(frame.contents_open_dt)
    opened = frame.contents_open_dt.dt

    frame['month'] = opened.month
    frame['day'] = opened.day
    frame['week'] = opened.isocalendar().week
    frame['dayofweek'] = opened.dayofweek
    frame['hour'] = opened.hour
    # The minute component was commented out in the original cell and is not extracted.

    frame.drop(['id', 'contents_open_dt'], axis=1, inplace=True)

In [14]:
# Cast every column of both frames to int, one column position at a time.
for frame in (train, X_test):
    for col_pos in range(frame.shape[1]):
        frame.iloc[:, col_pos] = frame.iloc[:, col_pos].astype(int)

In [15]:
X_train = train.drop('target',axis=1)
y_train = train['target']

In [16]:
X_train = X_train.drop(['person_rn', 'contents_rn', 'person_prefer_f', 'person_prefer_g'],axis=1)
X_test = X_test.drop(['person_rn', 'contents_rn', 'person_prefer_f', 'person_prefer_g'],axis=1)

# 시각화

In [None]:
X_train.columns

Index(['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_h_1', 'person_prefer_h_2',
       'person_prefer_h_3', 'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_prefer_d_1_n', 'person_prefer_d_1_s',
       'person_prefer_d_1_m', 'person_prefer_d_1_l', 'person_prefer_d_2_n',
       'person_prefer_d_2_s', 'person_prefer_d_2_m', 'person_prefer_d_2_l',
       'person_prefer_d_3_n', 'person_prefer_d_3_s', 'person_prefer_d_3_m',
       'person_prefer_d_3_l', 'contents_attribute_d_n',
      

n: 세분류코드
s: 소분류코드
m: 중분류코드
l: 대분류코드
u: 상위코드

## 속성 D

In [None]:
pd.crosstab([train.d_l_match_yn,train.d_m_match_yn],train.d_s_match_yn,
            margins=True).style.background_gradient(cmap='summer_r')

Unnamed: 0_level_0,d_s_match_yn,0,1,All
d_l_match_yn,d_m_match_yn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,186683,0,186683
1,0.0,181641,0,181641
1,1.0,58370,75257,133627
All,,426694,75257,501951


In [None]:
pd.crosstab([train.d_l_match_yn,train.d_m_match_yn,train.d_s_match_yn],train.h_s_match_yn,
            margins=True).style.background_gradient(cmap='summer_r')

Unnamed: 0_level_0,Unnamed: 1_level_0,h_s_match_yn,0,1,All
d_l_match_yn,d_m_match_yn,d_s_match_yn,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,133592,53091,186683
1,0.0,0.0,128791,52850,181641
1,1.0,0.0,43860,14510,58370
1,1.0,1.0,59079,16178,75257
All,,,365322,136629,501951


In [None]:
pd.crosstab([train.d_l_match_yn,train.d_m_match_yn,train.d_s_match_yn],train.target,
            margins=True).style.background_gradient(cmap='summer_r')

Unnamed: 0_level_0,Unnamed: 1_level_0,target,0,1,All
d_l_match_yn,d_m_match_yn,d_s_match_yn,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,110696,75987,186683
1,0.0,0.0,86696,94945,181641
1,1.0,0.0,23914,34456,58370
1,1.0,1.0,29800,45457,75257
All,,,251106,250845,501951


In [None]:
import matplotlib.pyplot as plt
import numpy as np

### person_prefer_d_1

In [None]:
train['person_prefer_d_1_n'].value_counts().nlargest(10)

114     61374
1227    30041
101     28238
175     24567
110     19855
145     15539
452     14526
854     13180
1086    12797
968     12652
Name: person_prefer_d_1_n, dtype: int64

In [None]:
train[train.d_s_match_yn==1]['person_prefer_d_1_n'].value_counts().nlargest(10)

114     9793
854     5017
1227    4941
968     4224
101     3369
452     3203
514     2982
110     2510
847     2384
145     1870
Name: person_prefer_d_1_n, dtype: int64

In [None]:
train[train.target==1]['person_prefer_d_1'].value_counts().nlargest(10)

114     20714
102     12345
1227    10891
857      6435
181      6409
452      4154
110      4119
851      4083
175      3922
122      3886
Name: person_prefer_d_1, dtype: int64

In [None]:
train[train.target==0]['person_prefer_d_1'].value_counts().nlargest(10)

114     21006
1227    13446
102     13125
181      6804
857      6624
452      5169
175      4462
851      4306
122      4177
110      3825
Name: person_prefer_d_1, dtype: int64

In [None]:
train['person_prefer_d_1_l'].value_counts()

1       207446
926     127675
744      50314
618      34409
377      26351
216      26244
482      11144
864      10069
522       7398
1235       897
1258         4
Name: person_prefer_d_1_l, dtype: int64

### person_prefer_d_2

In [None]:
train['person_prefer_d_2_n'].value_counts().nlargest(10)

175     71391
1086    24410
114     22957
110     21589
101     19643
1227    19163
968     14179
847     14130
452     13781
136     13701
Name: person_prefer_d_2_n, dtype: int64

In [None]:
train[train.d_s_match_yn==1]['person_prefer_d_2_n'].value_counts().nlargest(10)

175     9684
847     4772
968     4035
1086    3567
110     3514
452     2953
464     2494
101     2424
703     2269
114     2223
Name: person_prefer_d_2_n, dtype: int64

In [None]:
train[train.target==1]['person_prefer_d_2'].value_counts().nlargest(10)

181     24539
1086     8476
175      8078
113      7394
102      7197
851      6410
1227     5459
114      5156
857      5119
46       4018
Name: person_prefer_d_2, dtype: int64

In [None]:
train[train.target==0]['person_prefer_d_2'].value_counts().nlargest(10)

181     25320
1086    10784
175      8480
102      7496
851      6631
113      6375
1227     6179
114      5441
857      5310
46       3757
Name: person_prefer_d_2, dtype: int64

In [None]:
train['person_prefer_d_2_l'].value_counts()

1       209086
926     128544
744      53946
618      39885
377      26173
216      21259
864       8062
482       7211
522       6772
1235      1008
1258         5
Name: person_prefer_d_2_l, dtype: int64

### person_prefer_d_3

In [None]:
train['person_prefer_d_3_n'].value_counts().nlargest(10)

175     59632
114     43089
92      31027
1227    27717
854     27110
145     23538
101     15762
110     13473
1086    11498
968      9523
Name: person_prefer_d_3_n, dtype: int64

In [None]:
train[train.d_s_match_yn==1]['person_prefer_d_3_n'].value_counts().nlargest(10)

175     11080
114      7124
1227     6941
854      4402
92       3802
145      3113
968      2791
110      1769
452      1701
709      1669
Name: person_prefer_d_3_n, dtype: int64

In [None]:
train[train.target==1]['person_prefer_d_3'].value_counts().nlargest(10)

175     23967
114     12902
857     12347
95      11473
1227     7816
145      6828
122      4684
1228     4236
182      3999
1085     3952
Name: person_prefer_d_3, dtype: int64

In [None]:
train[train.target==0]['person_prefer_d_3'].value_counts().nlargest(10)

175     25082
857     14667
114     13745
95      11882
1227     8361
145      5860
1085     5011
122      4647
1228     4641
1086     4100
Name: person_prefer_d_3, dtype: int64

In [None]:
train['person_prefer_d_3_l'].value_counts()

1       230664
926     126441
744      59759
618      34787
216      15769
377      13052
864      10357
522       6567
482       3602
1235       946
1258         7
Name: person_prefer_d_3_l, dtype: int64

### contents_attribute_d

In [None]:
train['contents_attribute_d_n'].value_counts().nlargest(10)

114     57961
1227    37340
101     21266
854     19056
175     18690
145     17774
136     15568
968     14293
110     14001
452     13536
Name: contents_attribute_d_n, dtype: int64

In [None]:
train[train.d_s_match_yn==1]['contents_attribute_d_n'].value_counts().nlargest(10)

114     9793
854     5017
1227    4941
968     4224
101     3369
452     3203
514     2982
110     2510
847     2384
145     1870
Name: contents_attribute_d_n, dtype: int64

In [None]:
train[train.target==1]['contents_attribute_d'].value_counts().nlargest(10)

114     13599
1227    11543
102     10699
857      9299
181      7258
117      5599
118      5580
112      4490
95       4461
147      4352
Name: contents_attribute_d, dtype: int64

In [None]:
train[train.target==0]['contents_attribute_d'].value_counts().nlargest(10)

1227    13125
857      9509
114      9429
102      8022
181      5534
453      4370
118      4318
117      4022
95       3579
147      3499
Name: contents_attribute_d, dtype: int64

In [None]:
train['contents_attribute_d_l'].value_counts()

1       183291
926     138782
744      56937
618      38772
216      28832
377      25078
864      11651
482       9878
522       7001
1235      1725
1258         4
Name: contents_attribute_d_l, dtype: int64

## 속성 H

In [None]:
pd.crosstab([train.h_l_match_yn,train.h_m_match_yn],train.h_s_match_yn,
            margins=True).style.background_gradient(cmap='summer_r')

Unnamed: 0_level_0,h_s_match_yn,0,1,All
h_l_match_yn,h_m_match_yn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,102613,0,102613
1,0.0,216205,0,216205
1,1.0,46504,136629,183133
All,,365322,136629,501951


In [None]:
pd.crosstab([train.h_l_match_yn,train.h_m_match_yn,train.h_s_match_yn],train.d_s_match_yn,
            margins=True).style.background_gradient(cmap='summer_r')

Unnamed: 0_level_0,Unnamed: 1_level_0,d_s_match_yn,0,1,All
h_l_match_yn,h_m_match_yn,h_s_match_yn,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,82352,20261,102613
1,0.0,0.0,183093,33112,216205
1,1.0,0.0,40798,5706,46504
1,1.0,1.0,120451,16178,136629
All,,,426694,75257,501951


In [None]:
pd.crosstab([train.h_l_match_yn,train.h_m_match_yn,train.h_s_match_yn],train.target,
            margins=True).style.background_gradient(cmap='summer_r')

Unnamed: 0_level_0,Unnamed: 1_level_0,target,0,1,All
h_l_match_yn,h_m_match_yn,h_s_match_yn,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,53255,49358,102613
1,0.0,0.0,106752,109453,216205
1,1.0,0.0,23454,23050,46504
1,1.0,1.0,67645,68984,136629
All,,,251106,250845,501951


### person_prefer_h_1

In [None]:
train['person_prefer_h_1_m'].value_counts().nlargest(10)

316    35257
342    31393
315    22458
543    22381
368    20938
359    19155
379    13124
391    12664
407    10619
547    10256
Name: person_prefer_h_1_m, dtype: int64

In [None]:
train[train.h_s_match_yn==1]['person_prefer_h_1_m'].value_counts().nlargest(10)

547    7588
523    5479
422    4997
413    4628
403    4115
434    3925
405    3585
550    3455
543    3403
421    3318
Name: person_prefer_h_1_m, dtype: int64

In [None]:
train[train.target==1]['person_prefer_h_1'].value_counts().nlargest(10)

4      20199
31     15995
2      11559
59     10734
49      9011
279     6421
86      6301
72      6075
95      5083
288     4865
Name: person_prefer_h_1, dtype: int64

In [None]:
train[train.target==0]['person_prefer_h_1'].value_counts().nlargest(10)

31     15398
4      15058
2      10899
59     10204
49     10144
72      7049
279     6432
86      6363
288     5391
95      4816
Name: person_prefer_h_1, dtype: int64

In [None]:
train['person_prefer_h_1_l'].value_counts()

94     104841
3       57367
277     50147
30      49806
58      39580
48      29714
250     26399
1       22458
85      19092
208     18482
71      17564
188     16261
169     13645
226     11556
78      10159
149      9291
308      3564
92       1911
312       114
Name: person_prefer_h_1_l, dtype: int64

### person_prefer_h_2

In [None]:
train['person_prefer_h_2_m'].value_counts().nlargest(10)

398    42383
316    36033
542    32846
518    23068
543    22593
342    15713
368    13137
495    13033
407    11649
408    10351
Name: person_prefer_h_2_m, dtype: int64

In [None]:
train[train.h_s_match_yn==1]['person_prefer_h_2_m'].value_counts().nlargest(10)

543    10425
368     5733
408     5698
432     5499
481     5402
407     4796
518     4389
399     4196
464     3048
379     3015
Name: person_prefer_h_2_m, dtype: int64

In [None]:
train[train.target==1]['person_prefer_h_2'].value_counts().nlargest(10)

95     22968
4      18885
278    16736
251    10508
31      7912
59      6808
227     6038
279     5966
189     5255
96      4399
Name: person_prefer_h_2, dtype: int64

In [None]:
train[train.target==0]['person_prefer_h_2'].value_counts().nlargest(10)

95     19415
4      17148
278    16110
251    12560
31      7801
227     6995
279     6552
59      6329
189     5091
116     4521
Name: person_prefer_h_2, dtype: int64

In [None]:
train['person_prefer_h_2_l'].value_counts()

94     132520
277     71442
3       62029
250     38109
30      36536
58      27943
188     22271
226     20101
208     18141
48      15545
169     13430
85      10092
149      8774
71       8327
1        7254
78       5244
92       1954
312      1162
308      1077
Name: person_prefer_h_2_l, dtype: int64

### person_prefer_h_3

In [None]:
train['person_prefer_h_3_m'].value_counts().nlargest(10)

368    36745
398    33260
316    29087
391    27323
542    21123
523    18309
547    18249
315    17665
359    15965
543    14559
Name: person_prefer_h_3_m, dtype: int64

In [None]:
train[train.h_s_match_yn==1]['person_prefer_h_3_m'].value_counts().nlargest(10)

359    7887
403    7413
543    6887
398    6739
542    6729
407    5425
480    5251
399    4238
547    4237
402    4050
Name: person_prefer_h_3_m, dtype: int64

In [None]:
train[train.target==1]['person_prefer_h_3'].value_counts().nlargest(10)

59     20527
95     16896
4      14868
86     13888
278    10277
288     8887
258     8580
2       8519
49      7291
109     6421
Name: person_prefer_h_3, dtype: int64

In [None]:
train[train.target==0]['person_prefer_h_3'].value_counts().nlargest(10)

95     16364
59     16218
4      14219
86     13435
278    10846
258     9729
288     9362
2       9146
49      8674
109     6464
Name: person_prefer_h_3, dtype: int64

In [None]:
train['person_prefer_h_3_l'].value_counts()

94     126049
277     61468
3       56545
58      50513
250     32090
85      30162
30      24318
48      23601
208     18498
1       17665
188     14361
92      12188
226     11261
169      7988
71       5896
78       4807
149      3179
312       824
308       538
Name: person_prefer_h_3_l, dtype: int64

### contents_attribute_h

In [None]:
train['contents_attribute_h_l'].value_counts().nlargest(10)

94     120548
277     53943
30      49515
3       48204
58      38628
250     31035
48      28241
208     19843
188     19523
85      18687
Name: contents_attribute_h_l, dtype: int64

In [None]:
train[train.h_s_match_yn==1]['contents_attribute_h_l'].value_counts().nlargest(10)

94     39579
277    20196
250    13348
58      7828
208     7156
149     7007
226     6698
30      6672
188     5760
48      4847
Name: contents_attribute_h_l, dtype: int64

In [None]:
train[train.target==1]['contents_attribute_h'].value_counts().nlargest(10)

139    6960
288    6178
68     5884
65     5856
127    5606
43     5572
118    5043
109    4384
280    4267
27     3875
Name: contents_attribute_h, dtype: int64

In [None]:
train[train.target==0]['contents_attribute_h'].value_counts().nlargest(10)

139    8070
288    7252
68     5993
127    5637
43     5446
65     5080
118    4344
109    3940
138    3810
280    3740
Name: contents_attribute_h, dtype: int64

In [None]:
train['contents_attribute_h_l'].value_counts()

94     120548
277     53943
30      49515
3       48204
58      38628
250     31035
48      28241
208     19843
188     19523
85      18687
71      17622
169     15654
226     13845
149     10291
78       9616
308      4344
92       2412
Name: contents_attribute_h_l, dtype: int64

## 속성 L

In [None]:
train['contents_attribute_l_l'].value_counts()

2006    219214
2010     57894
2020     42533
2009     42458
2017     39593
2016     24713
2011     13881
2019     10472
2022      8926
2015      8478
2013      7835
2012      7232
2008      6310
2021      2868
2014      2656
2004      2353
2007      2101
2018      2048
2005       319
2023        62
2024         5
Name: contents_attribute_l_l, dtype: int64

In [None]:
train[train.target==0]['contents_attribute_l_l'].value_counts().nlargest(10)

2006    108497
2010     27519
2020     24847
2009     20084
2017     18726
2016     10960
2011      6719
2019      6258
2022      4891
2012      4223
Name: contents_attribute_l_l, dtype: int64

In [None]:
train[train.target==1]['contents_attribute_l_l'].value_counts().nlargest(10)

2006    110717
2010     30375
2009     22374
2017     20867
2020     17686
2016     13753
2011      7162
2015      4621
2013      4282
2019      4214
Name: contents_attribute_l_l, dtype: int64

### sweetviz 시각화

In [None]:
!pip install sweetviz

Collecting sweetviz
  Downloading sweetviz-2.1.3-py3-none-any.whl (15.1 MB)
[K     |████████████████████████████████| 15.1 MB 5.9 MB/s 
Installing collected packages: sweetviz
Successfully installed sweetviz-2.1.3


In [None]:
import sweetviz as sv

In [None]:
advert_report = sv.analyze(train)
#display the report
advert_report.show_html('./sweetviz_Advertising.html')

                                             |          | [  0%]   00:00 -> (? left)

Report ./sweetviz_Advertising.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [None]:
from sklearn.feature_extraction import FeatureHasher

# 변수처리

## 카테고리 10개 이하 변수처리

In [17]:
col_name = list(X_train.columns)

In [18]:
object_nunique = list(map(lambda col: X_train[col].nunique(), col_name))
d = dict(zip(col_name, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

[('d_l_match_yn', 2),
 ('d_m_match_yn', 2),
 ('d_s_match_yn', 2),
 ('h_l_match_yn', 2),
 ('h_m_match_yn', 2),
 ('h_s_match_yn', 2),
 ('person_attribute_a', 2),
 ('contents_attribute_j', 2),
 ('contents_attribute_k', 2),
 ('contents_attribute_i', 3),
 ('contents_attribute_a', 3),
 ('contents_attribute_c', 4),
 ('person_prefer_c', 5),
 ('contents_attribute_m', 5),
 ('person_attribute_b', 6),
 ('dayofweek', 7),
 ('person_attribute_a_1', 8),
 ('contents_attribute_j_1', 9),
 ('person_prefer_d_1_l', 11),
 ('person_prefer_d_2_l', 11),
 ('person_prefer_d_3_l', 11),
 ('contents_attribute_d_l', 11),
 ('month', 11),
 ('person_prefer_e', 12),
 ('contents_attribute_e', 12),
 ('contents_attribute_h_l', 17),
 ('person_prefer_h_1_l', 19),
 ('person_prefer_h_2_l', 19),
 ('person_prefer_h_3_l', 19),
 ('contents_attribute_l_l', 21),
 ('hour', 24),
 ('day', 31),
 ('person_prefer_d_1_m', 36),
 ('person_prefer_d_2_m', 36),
 ('person_prefer_d_3_m', 36),
 ('contents_attribute_d_m', 36),
 ('week', 49),
 ('cont

In [19]:
# Rebinds `d` (until now the column -> nunique dict) to a two-column frame:
# column 0 holds the feature name, column 1 its number of unique values.
d = pd.DataFrame([col_name, object_nunique]).T
# Features with more than 2 and at most 10 distinct values.
onehot_list = list(d[(d[1]>2)&(d[1]<=10)][0])

#### 범주 갯수가 10개 이하인 변수에 대해서 타겟값이 0.5이상이면 1 아니면 0인 변수 추가

#### 범주 개수가 10개 이하인 변수에 대해서 타겟 평균이 0.55 이상이면 1, 0.45~0.55 사이면 0, 0.45 이하이면 -1인 변수 추가

In [20]:
# Working copies: `df` is the training features plus the target, `df_test` the test features.
df = pd.concat([X_train, y_train], axis=1)
df_test = X_test.copy()

# For each low-cardinality feature, replace every level by the mean target of
# that level and store the result as `tar_enc_<col>` on both frames.
# NOTE(review): the means are computed on all training rows, so the training
# encodings include each row's own target value.
for col in onehot_list:
    level_means = df.groupby(col)["target"].mean().to_dict()
    df[f"tar_enc_{col}"] = df[col].map(level_means)
    # Levels that never occur in training map to NaN on the test frame.
    df_test[f"tar_enc_{col}"] = df_test[col].map(level_means)

# The target was only needed for the group means; the encoded columns stay,
# appended after the original feature columns.
df = df.drop(columns='target')

In [21]:
def div_value_2(temp):
    """Binarise a target mean: 1 when ``temp`` is at least 0.5, otherwise 0.

    NaN fails the comparison and therefore yields 0.
    """
    return 1 if temp >= 0.5 else 0


def div_value_3(temp):
    """Bucket a target mean into three classes.

    Returns 1 when ``temp`` >= 0.55, -1 when ``temp`` <= 0.45 and 0 for
    anything in between. NaN fails both comparisons and yields 0.
    """
    if temp >= 0.55:
        return 1
    if temp <= 0.45:
        return -1
    return 0

In [22]:
# Mean-target columns created for the low-cardinality features, addressed by
# name. The previous positional slice depended on X_train.shape[1], which this
# cell itself changes, so re-running the cell selected the wrong columns.
div_value = [f"tar_enc_{col}" for col in onehot_list]

# Two-way bucket (>= 0.5 -> 1, else 0). All tar_div2 columns are added first.
for col, enc_col in zip(onehot_list, div_value):
    X_train[f"tar_div2_{col}"] = df[enc_col].apply(div_value_2)
    X_test[f"tar_div2_{col}"] = df_test[enc_col].apply(div_value_2)

# Three-way bucket (>= 0.55 -> 1, <= 0.45 -> -1, else 0).
for col, enc_col in zip(onehot_list, div_value):
    X_train[f"tar_div3_{col}"] = df[enc_col].apply(div_value_3)
    X_test[f"tar_div3_{col}"] = df_test[enc_col].apply(div_value_3)

In [23]:
drop_feature = X_train.columns[X_train.nunique() < 2].tolist()

In [24]:
X_train.drop(drop_feature,axis=1, inplace=True)
X_test.drop(drop_feature,axis=1, inplace=True)

## 카테고리 10개 초과 변수 처리

In [25]:
up10_cat = list(d[d[1]>10][0])

In [26]:
# High-cardinality columns only. Explicit copies, because columns are added to
# these frames below and they must not be treated as slices of `df` / `X_test`.
train_up = df[up10_cat].copy()
test_up = X_test[up10_cat].copy()

# The target is appended so the rows can be split by class.
train_up['target'] = y_train

In [27]:
train_1 = train_up[train_up['target']==1]
train_0 = train_up[train_up['target']==0]

In [28]:
# For each high-cardinality column, collect the levels whose share (in %) among
# target==1 rows differs from their share among target==0 rows by at least 0.5.
# NOTE(review): `up10_cat[:-1]` skips the last column of `up10_cat`; confirm that is intended.
list_diff = []
for col in up10_cat[:-1]:
    share_pos = train_1[col].value_counts() / train_1[col].shape[0] * 100
    share_neg = train_0[col].value_counts() / train_0[col].shape[0] * 100
    # fill_value=0: a level seen in only one class has share 0 in the other.
    # A plain subtraction gives NaN there and the level would be dropped silently.
    share_gap = share_pos.sub(share_neg, fill_value=0).abs()
    list_diff.append(share_gap[share_gap >= 0.5].index.tolist())

# Keep only the columns that have more than one such level;
# columns with a single qualifying level are excluded.
dict1 = {col: levels for col, levels in zip(up10_cat[:-1], list_diff) if len(levels) > 1}

In [29]:
# Add one 0/1 column per selected feature: 1 where the row's value is one of
# the levels listed for that feature in `dict1`, 0 otherwise.
# Vectorised with isin instead of one .loc assignment per level. The loop
# variable is named `frame` so it does not overwrite the `data` list.
for frame in [train_up, test_up]:
    for key, values in dict1.items():
        frame[f"tar_enc_1_10_div_{key}"] = frame[key].isin(values).astype(int)

In [30]:
# Add one column per selected feature that keeps the original value for the
# levels listed in `dict1` and is 0 for every other level.
# Vectorised with isin/where instead of one .loc assignment per level.
for frame in [train_up, test_up]:
    for key, values in dict1.items():
        is_selected = frame[key].isin(values)
        # Cast to int64 to match the zero-initialised integer column of the loop version.
        frame[f"tar_enc_2_10_div_{key}"] = frame[key].where(is_selected, 0).astype('int64')

In [31]:
# Append the derived `tar_enc_*` columns to the feature frames. They are
# selected by name instead of by hard-coded offsets (44 / 43), which were only
# correct for one particular number of high-cardinality columns.
derived_cols = [c for c in train_up.columns if c.startswith('tar_enc_')]
X_train = pd.concat([X_train, train_up[derived_cols]], axis=1)
X_test = pd.concat([X_test, test_up[derived_cols]], axis=1)

## 비지도학습으로 데이터 그룹 변수 추가

In [32]:
# Add K-means cluster ids as features for several cluster counts.
# The fits are cumulative: each model is fitted on X_train *including* the
# cluster columns added by the previous iterations, exactly as the six
# separate cells did.
for n_clusters in (5, 10, 20, 30, 40, 50):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_train)
    X_train[f'cluster_{n_clusters}'] = kmeans.predict(X_train)
    X_test[f'cluster_{n_clusters}'] = kmeans.predict(X_test)

In [None]:
X_train.to_csv(path + "X_train_last.csv", index=False)
X_test.to_csv(path + "X_test_last.csv", index=False)

# EDA

In [None]:
# Hash-encode the features: cast every column to strings, then hash each row's
# string tokens into a sparse matrix.
# astype(str) converts all columns of each frame at once, instead of assigning
# column by column through iloc over the training frame's column count.
X_train_hash = X_train.astype(str)
X_test_hash = X_test.astype(str)

hashing = FeatureHasher(input_type='string')
# NOTE(review): only the bare values are hashed, so equal values coming from
# different columns presumably map to the same hashed feature — confirm this is intended.
X_train_hash = hashing.transform(X_train_hash.values)
X_test_hash = hashing.transform(X_test_hash.values)

In [None]:
# String-typed copies of the feature frames, used as input for one-hot encoding.
# astype(str) returns a new frame, so X_train / X_test themselves stay untouched.
X_train_onehot = X_train.astype(str)
X_test_onehot = X_test.astype(str)

In [None]:
# Fit the one-hot encoder on the stringified training features, then encode them.
oneH = OneHotEncoder()
oneH.fit(X_train_onehot)
X_train_onehot = oneH.transform(X_train_onehot)

In [None]:
# Original data: mean 5-fold F1 of a default LightGBM classifier.
model = LGBMClassifier(n_jobs=-1, random_state=42)

cross_val_score(estimator=model, X=X_train, y=y_train, scoring='f1', cv=5, n_jobs=-1).mean()

0.6172796508657897

In [None]:
# D, H, L attributes label-encoded: mean 5-fold F1 of a default LightGBM classifier.
model = LGBMClassifier(n_jobs=-1, random_state=42)

cross_val_score(estimator=model, X=X_train, y=y_train, scoring='f1', cv=5, n_jobs=-1).mean()

0.6174978294350812

In [None]:
# FeatureHasher
model = LGBMClassifier(
        random_state=42,
        n_jobs=-1)

cross_val_score(model, X_train_hash, y_train, cv=5, scoring = 'f1', n_jobs=-1).mean()

0.5913334041123397

In [None]:
# OneHotEncoding
model = LGBMClassifier(
        random_state=42,
        n_jobs=-1)

cross_val_score(model, X_train_onehot, y_train, cv=5, scoring = 'f1', n_jobs=-1).mean()

0.6089657380855458

FeatureHasher는 F1이 0.591로 네 가지 방식 중 가장 낮아 사용하지 않는다.

# Model (Catboost)

In [None]:
# CatBoost baseline on the GPU with otherwise default settings; F1 is the reported metric.
model = CatBoostClassifier(
    eval_metric="F1",
    task_type='GPU',
    random_state=123,
)

model.fit(X_train, y_train)

Learning rate set to 0.023946
0:	learn: 0.6208178	total: 31.8ms	remaining: 31.8s
1:	learn: 0.6208399	total: 60.2ms	remaining: 30s
2:	learn: 0.5804360	total: 87.7ms	remaining: 29.1s
3:	learn: 0.5992765	total: 119ms	remaining: 29.8s
4:	learn: 0.6198778	total: 147ms	remaining: 29.3s
5:	learn: 0.6170347	total: 176ms	remaining: 29.1s
6:	learn: 0.6053132	total: 204ms	remaining: 29s
7:	learn: 0.6143202	total: 239ms	remaining: 29.7s
8:	learn: 0.6171703	total: 268ms	remaining: 29.5s
9:	learn: 0.6185076	total: 306ms	remaining: 30.3s
10:	learn: 0.6144278	total: 334ms	remaining: 30.1s
11:	learn: 0.6166647	total: 362ms	remaining: 29.8s
12:	learn: 0.5963827	total: 389ms	remaining: 29.6s
13:	learn: 0.6150622	total: 417ms	remaining: 29.4s
14:	learn: 0.5933797	total: 455ms	remaining: 29.9s
15:	learn: 0.5963806	total: 482ms	remaining: 29.6s
16:	learn: 0.5987368	total: 510ms	remaining: 29.5s
17:	learn: 0.5975641	total: 538ms	remaining: 29.3s
18:	learn: 0.6157741	total: 566ms	remaining: 29.2s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x7f0444fd6710>

In [None]:
# Feature importances of the fitted model, largest first.
(
    pd.Series(data=model.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)

d_l_match_yn              9.951124
contents_attribute_j_1    6.390700
contents_attribute_d      5.925581
contents_attribute_e      4.541132
person_prefer_d_1_n       3.877647
                            ...   
8                         0.000000
6                         0.000000
2                         0.000000
1                         0.000000
20                        0.000000
Length: 120, dtype: float64

In [41]:
# Settings for the stratified CatBoost cross-validation.
seed = 42      # random_state of the fold splitter
n_fold = 7     # number of folds
n_class = 1    # width of the out-of-fold prediction array
n_est = 3000   # NOTE(review): not referenced by the training loop in this part of the notebook — confirm it is still needed

# X is an independent copy of the features; y refers to the same object as y_train.
y = y_train
X = X_train.copy()

In [44]:
# Stratified K-fold CatBoost training that collects out-of-fold (OOF)
# probabilities for the training rows and fold-averaged probabilities for X_test.
threshold = 0.4  # probability cut-off used to turn scores into class labels

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = list(skfold.split(X, y))

cat_pred = np.zeros((X.shape[0], n_class))  # OOF positive-class probabilities
cat_pred_test = 0                           # running mean of the test probabilities
# Every column with more than two distinct values is passed to CatBoost as categorical.
cat_cols = X.columns[X.nunique() > 2].tolist()
# The split indices are positional, so index the labels positionally as well.
y_values = np.asarray(y)

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'\n----------------- Fold {fold} -----------------\n')
    # Fold-local names keep the notebook-level X_train / y_train bound to the
    # full training set for the cells that run after this one.
    X_fold_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_fold_train, y_valid = y_values[train_idx], y_values[valid_idx]
    train_data = Pool(data=X_fold_train, label=y_fold_train, cat_features=cat_cols)
    valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

    model_cat = CatBoostClassifier(eval_metric="F1", task_type='GPU')
    model_cat.fit(train_data, eval_set=valid_data, use_best_model=True,
                  early_stopping_rounds=100, verbose=100)

    # reshape keeps the (n_rows, 1) layout the assignment into cat_pred expects.
    cat_pred[valid_idx] = model_cat.predict_proba(X_valid)[:, 1].reshape(-1, 1)
    cat_pred_test = cat_pred_test + (model_cat.predict_proba(X_test)[:, 1] / n_fold)

    fold_pred = np.where(cat_pred[valid_idx] >= threshold, 1, 0)
    print(f'CV F1 score: {f1_score(y_valid, fold_pred):.6f}')

# Threshold once all folds have contributed; the test average is only complete here.
pred = np.where(cat_pred >= threshold, 1, 0)
pred_test = np.where(cat_pred_test >= threshold, 1, 0)

print(f'\tF1 score: {f1_score(y, pred):.6f}')


----------------- Fold 0 -----------------

Learning rate set to 0.042526
0:	learn: 0.5989088	test: 0.6027335	best: 0.6027335 (0)	total: 732ms	remaining: 12m 11s
100:	learn: 0.6449051	test: 0.6580487	best: 0.6580487 (100)	total: 1m 7s	remaining: 9m 59s
200:	learn: 0.6538745	test: 0.6693628	best: 0.6693628 (200)	total: 2m 7s	remaining: 8m 27s
300:	learn: 0.6595767	test: 0.6734015	best: 0.6739130 (291)	total: 3m 7s	remaining: 7m 15s
400:	learn: 0.6636661	test: 0.6775199	best: 0.6775199 (400)	total: 4m 5s	remaining: 6m 6s
500:	learn: 0.6673565	test: 0.6764620	best: 0.6776740 (450)	total: 5m 4s	remaining: 5m 3s
bestTest = 0.6776739712
bestIteration = 450
Shrink model to first 451 iterations.
CV F1 score: 0.707395

----------------- Fold 1 -----------------

Learning rate set to 0.042526
0:	learn: 0.6128201	test: 0.6174224	best: 0.6174224 (0)	total: 731ms	remaining: 12m 10s
100:	learn: 0.6433431	test: 0.6553459	best: 0.6553459 (100)	total: 1m 6s	remaining: 9m 48s
200:	learn: 0.6531491	test

In [46]:
# Store the thresholded, fold-averaged test predictions and preview the frame.
submission['target'] = pred_test
submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,1
3,3,0
4,4,1
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [47]:
# Write the submission file (without the index) into the project folder.
submission.to_csv(path + "prediction_0117.csv", index=False)

In [None]:
# Settings for the plain K-fold CatBoost experiment.
n_splits = 5
iterations = 1000
patience = 20        # passed as early_stopping_rounds
is_holdout = False   # True: stop after training on the first split

cv = KFold(n_splits=n_splits, random_state=123, shuffle=True)

In [None]:
# Train one CatBoost model per K-fold split and record each model's best validation F1.
scores = []
models = []
# cv.split yields positions, so the labels are indexed positionally through a
# numpy view instead of relying on y_train's index labels.
y_values = np.asarray(y_train)

for tri, vai in cv.split(X_train):
    print("="*50)

    model = CatBoostClassifier(iterations=iterations, random_state=123, task_type='GPU',
                               eval_metric="F1", cat_features=cat_features,
                               one_hot_max_size=4)

    model.fit(X_train.iloc[tri], y_values[tri],
              eval_set=[(X_train.iloc[vai], y_values[vai])],
              early_stopping_rounds=patience,
              verbose=100)

    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    if is_holdout:
        # Hold-out mode: keep only the model trained on the first split.
        break

In [None]:
# Score every trained fold model on its validation split, average the test
# probabilities over the models and write the thresholded result as a submission.
threshold = 0.4
pred_list = []
scores = []
y_values = np.asarray(y_train)  # positional label access, matching cv.split's indices

# zip stops at the last trained model, so this also works when is_holdout
# left fewer models than there are splits.
for fold_model, (tri, vai) in zip(models, cv.split(X_train)):
    valid_proba = fold_model.predict_proba(X_train.iloc[vai])[:, 1]
    valid_label = np.where(valid_proba >= threshold, 1, 0)
    scores.append(f1_score(y_values[vai], valid_label))
    pred_list.append(fold_model.predict_proba(X_test)[:, 1])

pred = np.mean(pred_list, axis=0)
pred = np.where(pred >= threshold, 1, 0)
submission['target'] = pred
submission.to_csv(path + "prediction_0117_2.csv", index=False)

----------------

# 번외

## AutoML

In [None]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-2.3.5-py3-none-any.whl (288 kB)
[?25l[K     |█▏                              | 10 kB 27.7 MB/s eta 0:00:01[K     |██▎                             | 20 kB 13.8 MB/s eta 0:00:01[K     |███▍                            | 30 kB 10.7 MB/s eta 0:00:01[K     |████▌                           | 40 kB 9.3 MB/s eta 0:00:01[K     |█████▊                          | 51 kB 5.5 MB/s eta 0:00:01[K     |██████▉                         | 61 kB 5.7 MB/s eta 0:00:01[K     |████████                        | 71 kB 5.5 MB/s eta 0:00:01[K     |█████████                       | 81 kB 6.2 MB/s eta 0:00:01[K     |██████████▏                     | 92 kB 4.8 MB/s eta 0:00:01[K     |███████████▍                    | 102 kB 5.1 MB/s eta 0:00:01[K     |████████████▌                   | 112 kB 5.1 MB/s eta 0:00:01[K     |█████████████▋                  | 122 kB 5.1 MB/s eta 0:00:01[K     |██████████████▊                 | 133 kB 5.1 MB/s eta 0:00:01[K

In [None]:
from pycaret.classification import *

In [None]:
# Training frame for PyCaret: a copy of the features with the label added as 'target'.
train_automl = X_train.assign(target=y_train)

In [None]:
# Column dtypes and non-null counts of the AutoML training frame.
train_automl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501951 entries, 0 to 501950
Data columns (total 63 columns):
 #   Column                  Non-Null Count   Dtype
---  ------                  --------------   -----
 0   d_l_match_yn            501951 non-null  int64
 1   d_m_match_yn            501951 non-null  int64
 2   d_s_match_yn            501951 non-null  int64
 3   h_l_match_yn            501951 non-null  int64
 4   h_m_match_yn            501951 non-null  int64
 5   h_s_match_yn            501951 non-null  int64
 6   person_attribute_a      501951 non-null  int64
 7   person_attribute_a_1    501951 non-null  int64
 8   person_attribute_b      501951 non-null  int64
 9   person_prefer_c         501951 non-null  int64
 10  person_prefer_d_1       501951 non-null  int64
 11  person_prefer_d_2       501951 non-null  int64
 12  person_prefer_d_3       501951 non-null  int64
 13  person_prefer_e         501951 non-null  int64
 14  person_prefer_h_1       501951 non-null  int64
 15  

In [None]:
# Initialise the PyCaret classification experiment on train_automl with 'target' as the label.
# NOTE(review): silent=True presumably skips the interactive dtype confirmation and session_id fixes the seed — confirm against the PyCaret version in use.
clf = setup(data = train_automl, target = 'target', silent = True, session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(501951, 63)"
5,Missing Values,False
6,Numeric Features,33
7,Categorical Features,29
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# After changing the labels: compare models with 5 folds, sorted by F1, keeping the top three.
best_3 = compare_models(sort = 'F1', n_select = 3, fold= 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.6132,0.6603,0.6634,0.6028,0.6317,0.2264,0.2276,11.958
rf,Random Forest Classifier,0.6134,0.6606,0.6329,0.6091,0.6208,0.2268,0.227,140.336
et,Extra Trees Classifier,0.6133,0.6595,0.6259,0.6105,0.6181,0.2267,0.2268,194.106
gbc,Gradient Boosting Classifier,0.6018,0.6437,0.6438,0.5939,0.6178,0.2037,0.2044,269.602
ridge,Ridge Classifier,0.5947,0.0,0.6247,0.5893,0.6065,0.1894,0.1898,1.412
lda,Linear Discriminant Analysis,0.5947,0.634,0.6246,0.5893,0.6064,0.1894,0.1897,22.864
ada,Ada Boost Classifier,0.5943,0.6334,0.6243,0.5889,0.6061,0.1886,0.1889,55.416
nb,Naive Bayes,0.5507,0.5799,0.6903,0.5396,0.6057,0.1015,0.1057,1.404
lr,Logistic Regression,0.5848,0.6208,0.5854,0.5846,0.585,0.1696,0.1696,132.736
knn,K Neighbors Classifier,0.5546,0.5725,0.5765,0.5523,0.5641,0.1092,0.1093,215.042


In [None]:
# Before the categorical processing: compare models with 5 folds, sorted by F1, keeping the top three.
best_3 = compare_models(sort = 'F1', n_select = 3, fold= 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.5014,0.5015,0.8875,0.5008,0.6382,0.0029,0.0044,7.368
lightgbm,Light Gradient Boosting Machine,0.6128,0.6593,0.664,0.6023,0.6317,0.2256,0.2268,8.144
rf,Random Forest Classifier,0.6083,0.6542,0.6211,0.6055,0.6132,0.2166,0.2166,102.734
gbc,Gradient Boosting Classifier,0.5992,0.6417,0.6339,0.5927,0.6126,0.1983,0.1988,162.244
ada,Ada Boost Classifier,0.5931,0.6322,0.6221,0.5879,0.6045,0.1862,0.1865,33.644
et,Extra Trees Classifier,0.6011,0.6429,0.6055,0.6002,0.6028,0.2023,0.2023,112.454
ridge,Ridge Classifier,0.5884,0.0,0.6047,0.5856,0.595,0.1769,0.177,0.762
lda,Linear Discriminant Analysis,0.5884,0.6235,0.6047,0.5856,0.595,0.1769,0.177,11.082
dt,Decision Tree Classifier,0.544,0.544,0.544,0.544,0.544,0.0881,0.0881,11.58
knn,K Neighbors Classifier,0.5324,0.5422,0.5475,0.5314,0.5393,0.0648,0.0648,20.422


- 베이즈 통계와 생성모델에 기반한 나이브 베이즈
  - Naive Bayes	

- 독립변수와 종속변수의 선형 관계성에 기반한 로지스틱 회귀
  - Logistic Regression	
: 회귀를 사용하여 데이터가 어떤 범주에 속할 확률을 0에서 1 사이의 값으로 예측하고 그 확률에 따라 가능성이 더 높은 범주에 속하는 것으로 분류해주는 지도 학습 알고리즘
  - https://hleecaster.com/ml-logistic-regression-concept/

- 데이터 균일도에 따른 규칙 기반의 결정트리
  - Decision Tree Classifier	

- 개별 클래스 간의 최대 분류 마진을 효과적으로 찾아주는 서포트 벡터 머신
  - SVM - Linear Kernel	
  - https://ko.wikipedia.org/wiki/%EC%84%9C%ED%8F%AC%ED%8A%B8_%EB%B2%A1%ED%84%B0_%EB%A8%B8%EC%8B%A0

- 근접 거리를 기준으로 하는 최소 근접 알고리즘
  - K Neighbors Classifier	
  - https://ko.wikipedia.org/wiki/K-%EC%B5%9C%EA%B7%BC%EC%A0%91_%EC%9D%B4%EC%9B%83_%EC%95%8C%EA%B3%A0%EB%A6%AC%EC%A6%98

- 심층 연결 기반의 신경망

- 다변량 데이터에서의 차원축소를 사용한 클래스 분류 방법

  - Linear Discriminant Analysis (가정 - 다변량 정규성 만족)
  - Quadratic Discriminant Analysis	
  - https://ratsgo.github.io/machine%20learning/2017/03/21/LDA/

- 서로 다른 머신러닝 알고리즘을 결합한 앙상블

  - 보팅
    - 하드보팅
    - 소프트보팅

- 배깅
  - Random Forest Classifier (모든 데이터 사용)


- ? 어디에 분류 ??
  - Extra Trees Classifier(랜포 보다 더 랜덤한 모델, 랜덤한 피처사용)
  - https://wyatt37.tistory.com/6

- 부스팅
  - Ada Boost Classifier (오분류 가중치)
  - Gradient Boosting Classifier (경사하강법)
  - XG Boosting Machine (컴퓨팅 성능 강화)
  - Light Gradient Boosting Machine	(비대칭 리프노드 분할)
  - Cat Boosting Classifier	(범주형 변수의 잔치를 반영)
    - https://dailyheumsi.tistory.com/136
  - NG Boosting Classifier (신경망 기반 부스팅)	


- 흐음
  - Ridge Classifier	
  - Dummy Classifier (모델의 base line 명시)

In [None]:
# Train a LightGBM model through PyCaret; the per-fold metrics are shown as the cell output.
lightgbm = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6156,0.6587,0.6669,0.6049,0.6344,0.2313,0.2325
1,0.6085,0.6563,0.6587,0.5985,0.6272,0.217,0.2181
2,0.6129,0.6609,0.664,0.6024,0.6317,0.2258,0.227
3,0.6104,0.6574,0.6625,0.5999,0.6297,0.2208,0.222
4,0.6104,0.6576,0.6667,0.5992,0.6312,0.2208,0.2222
5,0.6115,0.6581,0.6626,0.6011,0.6304,0.223,0.2242
6,0.611,0.6586,0.6597,0.6011,0.629,0.2219,0.223
7,0.6119,0.6609,0.6604,0.602,0.6299,0.2239,0.2249
8,0.6173,0.6642,0.665,0.607,0.6347,0.2345,0.2356
9,0.6147,0.6627,0.6629,0.6046,0.6324,0.2294,0.2305


In [None]:
# Train the three candidate models (QDA, LightGBM, random forest) through PyCaret.
qda = create_model('qda')
lightgbm = create_model('lightgbm')
rf = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.611,0.6532,0.6249,0.6079,0.6163,0.2219,0.222
1,0.6038,0.648,0.6103,0.6025,0.6064,0.2077,0.2077
2,0.6109,0.6561,0.6196,0.609,0.6142,0.2218,0.2218
3,0.6109,0.6548,0.6231,0.6082,0.6156,0.2218,0.2219
4,0.6107,0.6555,0.6311,0.6063,0.6184,0.2213,0.2215
5,0.6094,0.6557,0.6243,0.6062,0.6151,0.2189,0.219
6,0.6126,0.6561,0.6236,0.6101,0.6168,0.2251,0.2252
7,0.6124,0.6612,0.6246,0.6096,0.617,0.2247,0.2248
8,0.6112,0.6577,0.6206,0.6091,0.6148,0.2224,0.2224
9,0.6117,0.6591,0.6234,0.6091,0.6162,0.2234,0.2235


In [None]:
# Tune the QDA model's hyper-parameters, optimising F1.
qda = tune_model(qda, optimize = 'F1') 

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.5855,0.62,0.663,0.574,0.6153,0.1711,0.1732
1,0.5848,0.6201,0.6627,0.5734,0.6148,0.1697,0.1718
2,0.589,0.6254,0.6693,0.5766,0.6195,0.178,0.1803
3,0.5902,0.626,0.6668,0.5782,0.6193,0.1804,0.1826
4,0.5824,0.621,0.6653,0.5706,0.6143,0.1648,0.1671
5,0.5899,0.6238,0.6706,0.5774,0.6205,0.1799,0.1823
6,0.5912,0.6266,0.6689,0.5788,0.6206,0.1823,0.1846
7,0.5917,0.6273,0.6678,0.5795,0.6205,0.1834,0.1855
8,0.5875,0.6253,0.6681,0.5753,0.6182,0.175,0.1773
9,0.5941,0.6311,0.6748,0.581,0.6244,0.1883,0.1908


In [None]:
# Tune the LightGBM model's hyper-parameters, optimising F1.
lightgbm = tune_model(lightgbm, optimize = 'F1') 

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6217,0.6699,0.6624,0.6124,0.6365,0.2433,0.2441
1,0.6195,0.666,0.658,0.6109,0.6336,0.2389,0.2396
2,0.622,0.6707,0.6657,0.6121,0.6378,0.244,0.2449
3,0.6213,0.6681,0.6607,0.6125,0.6356,0.2427,0.2434
4,0.6211,0.6704,0.674,0.6095,0.6402,0.2423,0.2437
5,0.6203,0.6674,0.6619,0.611,0.6354,0.2406,0.2415
6,0.6206,0.6709,0.66,0.6117,0.635,0.2412,0.2419
7,0.6222,0.674,0.6632,0.6129,0.637,0.2444,0.2452
8,0.6272,0.6751,0.6692,0.6173,0.6422,0.2544,0.2553
9,0.6263,0.6741,0.6688,0.6163,0.6415,0.2525,0.2534


In [None]:
# Tune the random-forest model's hyper-parameters, optimising F1.
rf = tune_model(rf, optimize = 'F1') 

IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 13.3min


In [None]:
# Soft-voting blend of the QDA, LightGBM and random-forest models.
blender_top3 = blend_models(estimator_list = [qda,lightgbm,rf], method = 'soft')

In [None]:
# Score the blended model.
# NOTE(review): with no data argument PyCaret presumably predicts on its internal hold-out split — confirm.
pred_holdout = predict_model(blender_top3)

In [None]:
# Finalise the LightGBM model (not the blend) and predict on the test features.
final_model = finalize_model(lightgbm)
prediction = predict_model(final_model, data=X_test)

In [None]:
# Save the full prediction frame, index included, as prediction.csv in the project folder.
prediction.to_csv(path + 'prediction.csv')

In [None]:
# Preview the prediction frame (features plus the Label and Score columns).
prediction

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,month,day,week,dayofweek,hour,minute,Label,Score
0,1,0,0,1,1,1,1,1,2,1,857,851,1227,4,1,1,263,56,49,1,3,10,2,1,2,1147,839,1,5,263,393790,236865,12,1,49,1,2,24,0,0.5952
1,0,0,0,1,0,0,2,0,2,2,683,1086,662,2,1,1,258,263,49,1,3,5,1,1,2,1611,278,1,4,263,394058,236572,12,17,51,3,5,42,0,0.7370
2,1,0,0,1,1,1,2,3,2,1,514,790,1233,0,1,1,177,170,171,3,3,3,1,1,2,1817,490,3,4,177,1002061,704612,12,10,50,3,23,33,0,0.7228
3,1,0,0,1,1,1,1,2,2,5,114,181,175,4,1,1,177,170,171,3,3,5,1,1,2,101,150,5,3,177,1000813,704652,12,3,49,3,19,44,1,0.7092
4,1,0,0,1,0,0,1,6,4,5,1082,1078,1056,5,1,1,178,177,4,3,1,5,1,1,1,985,1097,1,4,177,111146,704413,12,11,50,4,21,24,1,0.5974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46399,1,1,1,0,0,0,2,0,4,1,147,46,145,4,1,1,2,4,95,3,3,5,1,1,2,759,147,1,5,91,425040,726084,12,17,51,3,1,6,1,0.6952
46400,1,0,0,1,0,0,2,0,4,1,176,120,159,4,1,1,86,31,278,3,3,10,2,1,2,759,147,1,4,91,290061,156948,12,29,53,1,21,57,1,0.6190
46401,1,1,0,1,1,1,2,0,3,1,145,46,147,5,1,1,288,279,278,3,3,10,2,1,2,759,147,1,4,288,307951,175069,12,10,50,3,19,9,1,0.7283
46402,1,1,0,1,1,1,1,3,3,2,145,46,147,6,1,1,288,279,278,3,1,5,1,3,2,759,147,1,5,288,308354,174849,12,7,50,0,14,4,1,0.6984


In [None]:
# Use the predicted class in the 'Label' column as the submission target.
submission['target'] = prediction['Label']

In [None]:
# Write the submission without the index as prediction.csv in the project folder.
submission.to_csv(path + 'prediction.csv', index = False)

In [None]:
# Preview the submission frame.
submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,1
4,4,1
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


## Hyper Parameter tuning

### Grid Search

In [None]:
# Exhaustive grid search over CatBoost tree count, learning rate and depth,
# scored by 3-fold cross-validated F1.
# NOTE(review): confirm CatBoost accepts max_depth=20 with its default grow policy.
n_estimators = [100, 1000, 5000, 10000]
learn_rate = [0.1, 0.01, 0.001]
max_depth = [5, 10, 15, 20]

hyper = dict(n_estimators=n_estimators, learning_rate=learn_rate, max_depth=max_depth)
gd = GridSearchCV(
    estimator=CatBoostClassifier(n_jobs=-1),
    param_grid=hyper,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=True,
)
gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_estimator_)

### BayesianOptimization

In [None]:
def CB_opt(n_estimators, depth, learning_rate, max_bin,
             subsample, num_leaves, l2_leaf_reg, model_size_reg):
  """Objective for the Bayesian optimiser: mean 5-fold MCC of a CatBoost classifier.

  The arguments are the sampled hyper-parameter values; n_estimators, depth,
  num_leaves and max_bin are truncated to int before being passed to CatBoost.
  Each fold model is trained on the notebook-level X_train / y_train with the
  fold's validation part as eval_set (eval_metric F1, use_best_model=True).

  Returns:
    Mean Matthews correlation coefficient over the five validation folds.
  """
  model_params = dict(
      verbose=0,
      n_estimators=int(n_estimators),
      learning_rate=learning_rate,
      subsample=subsample,
      l2_leaf_reg=l2_leaf_reg,
      max_depth=int(depth),
      num_leaves=int(num_leaves),
      random_state=88,
      grow_policy="Lossguide",
      max_bin=int(max_bin),
      use_best_model=True,
      model_size_reg=model_size_reg,
      eval_metric="F1",
  )
  splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1944)

  fold_scores = []
  for fit_idx, val_idx in splitter.split(X_train, y_train):
    fit_x, val_x = X_train.iloc[fit_idx], X_train.iloc[val_idx]
    fit_y, val_y = y_train.iloc[fit_idx], y_train.iloc[val_idx]

    clf = CatBoostClassifier(**model_params)
    clf.fit(fit_x, fit_y, eval_set=(val_x, val_y))
    fold_scores.append(matthews_corrcoef(val_y, clf.predict(val_x)))
  return np.mean(fold_scores)

In [None]:
# Search space for CB_opt: one (lower, upper) bound per hyper-parameter.
pbounds = dict(
    n_estimators=(500, 3000),
    depth=(2, 25),
    learning_rate=(.01, 0.2),
    subsample=(0.6, 1.),
    num_leaves=(16, 40),
    max_bin=(150, 300),
    l2_leaf_reg=(0, 10),
    model_size_reg=(0, 10),
)

optimizer = BayesianOptimization(f=CB_opt, pbounds=pbounds, verbose=2, random_state=888)

# 5 random initial points followed by 20 guided iterations with the 'ei' acquisition.
optimizer.maximize(init_points=5, n_iter=20, acq='ei', xi=0.01)

|   iter    |  target   |   depth   | l2_lea... | learni... |  max_bin  | model_... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

| [0m 1       [0m | [0m 0.2571  [0m | [0m 21.77   [0m | [0m 1.646   [0m | [0m 0.1019  [0m | [0m 288.2   [0m | [0m 4.286   [0m | [0m 643.7   [0m | [0m 38.2    [0m | [0m 0.863   [0m |
| [95m 2       [0m | [95m 0.2591  [0m | [95m 5.058   [0m | [95m 5.334   [0m | [95m 0.1809  [0m | [95m 187.3   [0m | [95m 0.3017  [0m | [95m 681.1   [0m | [95m 36.98   [0m | [95m 0.8234  [0m |


KeyboardInterrupt: ignored

In [None]:
# Best result recorded by the optimiser so far.
print(optimizer.max)

## Optuna

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive/')

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from tqdm import tnrange, tqdm_notebook, notebook, tqdm
import time
import random

from sklearn import preprocessing
#import optuna # 하이퍼파라미터 튜닝 라이브러리
from sklearn.model_selection import cross_val_score, train_test_split, KFold

from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import f1_score

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
def run(trial):
    """Optuna objective: F1 of an LGBMClassifier on a fixed 20% hold-out split.

    Args:
        trial: Optuna trial used to sample n_estimators, learning_rate and max_depth.

    Returns:
        F1 score of the fitted model on the hold-out part of the notebook-level
        X_train / y_train.
    """
    # Search ranges of the tuned hyper-parameters.
    n_estimators = trial.suggest_int("n_estimators", 100, 2000)
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 10)

    # Fixed split seed: every trial is scored on the same hold-out rows, so trial
    # values are comparable and the study can be reproduced. An unseeded random
    # seed per trial would mix split variance into the objective.
    X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    model = LGBMClassifier(
        random_state=42,
        # LGBM parameters sampled by the trial
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
    )
    # Early stopping is monitored on the same hold-out set that is scored below.
    model.fit(X_tra, y_tra, early_stopping_rounds=300, eval_set=[(X_val, y_val)], verbose=1000)
    preds_valid = model.predict(X_val)
    f1 = f1_score(y_val, preds_valid)  # evaluation metric
    return f1

In [None]:
# Maximise the value returned by run() over 20 trials with Optuna's default sampler.
study = optuna.create_study(direction="maximize")
study.optimize(run, n_trials=20)

[32m[I 2021-12-18 15:28:24,963][0m A new study created in memory with name: no-name-04fd2086-a1c8-4c72-b639-d4c877ea9a68[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.656947
Did not meet early stopping. Best iteration is:
[1097]	valid_0's binary_logloss: 0.656463


[32m[I 2021-12-18 15:29:35,863][0m Trial 0 finished with value: 0.6179963429891252 and parameters: {'n_estimators': 1097, 'learning_rate': 0.04577302083100789, 'max_depth': 2}. Best is trial 0 with value: 0.6179963429891252.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.657263
Did not meet early stopping. Best iteration is:
[1879]	valid_0's binary_logloss: 0.653507


[32m[I 2021-12-18 15:32:47,764][0m Trial 1 finished with value: 0.6233386828626067 and parameters: {'n_estimators': 1879, 'learning_rate': 0.029569802979263416, 'max_depth': 2}. Best is trial 1 with value: 0.6233386828626067.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.63825
Did not meet early stopping. Best iteration is:
[1670]	valid_0's binary_logloss: 0.637185


[32m[I 2021-12-18 15:35:02,740][0m Trial 2 finished with value: 0.646217782522495 and parameters: {'n_estimators': 1684, 'learning_rate': 0.10451681258471457, 'max_depth': 7}. Best is trial 2 with value: 0.646217782522495.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.648354
Did not meet early stopping. Best iteration is:
[1428]	valid_0's binary_logloss: 0.646517


[32m[I 2021-12-18 15:37:44,487][0m Trial 3 finished with value: 0.6385812673481182 and parameters: {'n_estimators': 1428, 'learning_rate': 0.010742199141231973, 'max_depth': 8}. Best is trial 2 with value: 0.646217782522495.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[163]	valid_0's binary_logloss: 0.659743


[32m[I 2021-12-18 15:38:07,447][0m Trial 4 finished with value: 0.6230550695189178 and parameters: {'n_estimators': 163, 'learning_rate': 0.017199886017550063, 'max_depth': 7}. Best is trial 2 with value: 0.646217782522495.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.668523
Did not meet early stopping. Best iteration is:
[1876]	valid_0's binary_logloss: 0.665769


[32m[I 2021-12-18 15:39:34,158][0m Trial 5 finished with value: 0.6007058869410916 and parameters: {'n_estimators': 1876, 'learning_rate': 0.01952413585705188, 'max_depth': 1}. Best is trial 2 with value: 0.646217782522495.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[908]	valid_0's binary_logloss: 0.638168


[32m[I 2021-12-18 15:40:47,233][0m Trial 6 finished with value: 0.6449751623996943 and parameters: {'n_estimators': 912, 'learning_rate': 0.14524371535123784, 'max_depth': 6}. Best is trial 2 with value: 0.646217782522495.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[918]	valid_0's binary_logloss: 0.652173


[32m[I 2021-12-18 15:42:20,683][0m Trial 7 finished with value: 0.6273207403530476 and parameters: {'n_estimators': 918, 'learning_rate': 0.013550806843135123, 'max_depth': 5}. Best is trial 2 with value: 0.646217782522495.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[620]	valid_0's binary_logloss: 0.642887


[32m[I 2021-12-18 15:43:10,954][0m Trial 8 finished with value: 0.6406312615689941 and parameters: {'n_estimators': 621, 'learning_rate': 0.08305095786362104, 'max_depth': 4}. Best is trial 2 with value: 0.646217782522495.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.64067
Did not meet early stopping. Best iteration is:
[1931]	valid_0's binary_logloss: 0.638333


[32m[I 2021-12-18 15:45:35,909][0m Trial 9 finished with value: 0.6484093737235569 and parameters: {'n_estimators': 1937, 'learning_rate': 0.06425204697640233, 'max_depth': 10}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.640415
Did not meet early stopping. Best iteration is:
[1452]	valid_0's binary_logloss: 0.639063


[32m[I 2021-12-18 15:47:29,320][0m Trial 10 finished with value: 0.646978516847115 and parameters: {'n_estimators': 1452, 'learning_rate': 0.0610627871106448, 'max_depth': 10}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.639764
Did not meet early stopping. Best iteration is:
[1383]	valid_0's binary_logloss: 0.638604


[32m[I 2021-12-18 15:49:18,881][0m Trial 11 finished with value: 0.6474708171206227 and parameters: {'n_estimators': 1386, 'learning_rate': 0.05792021311984691, 'max_depth': 10}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[435]	valid_0's binary_logloss: 0.640474


[32m[I 2021-12-18 15:50:10,784][0m Trial 12 finished with value: 0.642096345862801 and parameters: {'n_estimators': 1454, 'learning_rate': 0.2294328860980347, 'max_depth': 10}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.643724
Did not meet early stopping. Best iteration is:
[1209]	valid_0's binary_logloss: 0.642796


[32m[I 2021-12-18 15:51:56,215][0m Trial 13 finished with value: 0.6410847200493055 and parameters: {'n_estimators': 1209, 'learning_rate': 0.036026593432219506, 'max_depth': 9}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.640306
Did not meet early stopping. Best iteration is:
[1972]	valid_0's binary_logloss: 0.638213


[32m[I 2021-12-18 15:54:21,794][0m Trial 14 finished with value: 0.6443515444883434 and parameters: {'n_estimators': 1991, 'learning_rate': 0.06984787004602953, 'max_depth': 9}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.640705
Did not meet early stopping. Best iteration is:
[1548]	valid_0's binary_logloss: 0.640078


[32m[I 2021-12-18 15:56:17,352][0m Trial 15 finished with value: 0.6431069672955855 and parameters: {'n_estimators': 1642, 'learning_rate': 0.13372937857396458, 'max_depth': 10}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[600]	valid_0's binary_logloss: 0.64565


[32m[I 2021-12-18 15:57:24,366][0m Trial 16 finished with value: 0.6387914858128471 and parameters: {'n_estimators': 600, 'learning_rate': 0.025423581340048806, 'max_depth': 8}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.646965
Did not meet early stopping. Best iteration is:
[1692]	valid_0's binary_logloss: 0.643918


[32m[I 2021-12-18 15:59:34,628][0m Trial 17 finished with value: 0.6377751497348644 and parameters: {'n_estimators': 1692, 'learning_rate': 0.03970286010577623, 'max_depth': 4}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.640788
Did not meet early stopping. Best iteration is:
[1277]	valid_0's binary_logloss: 0.639777


[32m[I 2021-12-18 16:01:18,295][0m Trial 18 finished with value: 0.6441205791597436 and parameters: {'n_estimators': 1277, 'learning_rate': 0.057088677032402356, 'max_depth': 9}. Best is trial 9 with value: 0.6484093737235569.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.6383
Did not meet early stopping. Best iteration is:
[1617]	valid_0's binary_logloss: 0.636864


[32m[I 2021-12-18 16:03:17,383][0m Trial 19 finished with value: 0.6446642353884635 and parameters: {'n_estimators': 1617, 'learning_rate': 0.0936052426824625, 'max_depth': 8}. Best is trial 9 with value: 0.6484093737235569.[0m


In [None]:
# Hyper-parameters of the best trial in the study.
study.best_params

{'learning_rate': 0.06425204697640233, 'max_depth': 10, 'n_estimators': 1937}

In [None]:
# Re-evaluate the best Optuna hyper-parameters with mean 5-fold cross-validated F1.
para = study.best_params

model = LGBMClassifier(n_jobs=-1, random_state=42, **para)

cross_val_score(estimator=model, X=X_train, y=y_train, scoring='f1', cv=5, n_jobs=-1).mean()

0.6172961468596128

In [None]:
from optuna import Trial
from optuna.samplers import TPESampler  # explicit name instead of a wildcard import

# Run a 20-trial Optuna search with the TPE sampler, maximizing the value
# returned by `run`. The sampler is seeded so its suggestions are repeatable
# between runs (full reproducibility also requires `run` itself to be
# deterministic).
study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(run, n_trials=20)

[32m[I 2021-12-18 16:16:43,785][0m A new study created in memory with name: no-name-e3a4958c-2913-4052-bc22-57789df00d15[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[778]	valid_0's binary_logloss: 0.64106


[32m[I 2021-12-18 16:17:44,571][0m Trial 0 finished with value: 0.6418082541618624 and parameters: {'n_estimators': 778, 'learning_rate': 0.11676717063658815, 'max_depth': 4}. Best is trial 0 with value: 0.6418082541618624.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.639327
Early stopping, best iteration is:
[1250]	valid_0's binary_logloss: 0.638877


[32m[I 2021-12-18 16:19:37,444][0m Trial 1 finished with value: 0.6438403172303352 and parameters: {'n_estimators': 1843, 'learning_rate': 0.10526241755657029, 'max_depth': 9}. Best is trial 1 with value: 0.6438403172303352.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.648142
Did not meet early stopping. Best iteration is:
[1294]	valid_0's binary_logloss: 0.646531


[32m[I 2021-12-18 16:21:56,119][0m Trial 2 finished with value: 0.6372263050231077 and parameters: {'n_estimators': 1294, 'learning_rate': 0.013173318105807855, 'max_depth': 6}. Best is trial 1 with value: 0.6438403172303352.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.641114
Did not meet early stopping. Best iteration is:
[1691]	valid_0's binary_logloss: 0.638807


[32m[I 2021-12-18 16:24:25,396][0m Trial 3 finished with value: 0.6455675701424285 and parameters: {'n_estimators': 1698, 'learning_rate': 0.06265203286433035, 'max_depth': 5}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[945]	valid_0's binary_logloss: 0.661809


[32m[I 2021-12-18 16:25:09,643][0m Trial 4 finished with value: 0.6047099792807603 and parameters: {'n_estimators': 945, 'learning_rate': 0.1893756145205277, 'max_depth': 1}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[175]	valid_0's binary_logloss: 0.649035


[32m[I 2021-12-18 16:25:34,020][0m Trial 5 finished with value: 0.6351702059141109 and parameters: {'n_estimators': 175, 'learning_rate': 0.045240841889378124, 'max_depth': 9}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[181]	valid_0's binary_logloss: 0.65434


[32m[I 2021-12-18 16:25:59,821][0m Trial 6 finished with value: 0.6289893366046994 and parameters: {'n_estimators': 181, 'learning_rate': 0.0326594668797057, 'max_depth': 10}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[716]	valid_0's binary_logloss: 0.645613


[32m[I 2021-12-18 16:27:21,572][0m Trial 7 finished with value: 0.638030303030303 and parameters: {'n_estimators': 716, 'learning_rate': 0.022247324671014014, 'max_depth': 10}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.66879
Did not meet early stopping. Best iteration is:
[1832]	valid_0's binary_logloss: 0.665984


[32m[I 2021-12-18 16:28:45,898][0m Trial 8 finished with value: 0.6004441241210043 and parameters: {'n_estimators': 1832, 'learning_rate': 0.016179549023100724, 'max_depth': 1}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.638908
Early stopping, best iteration is:
[714]	valid_0's binary_logloss: 0.638625


[32m[I 2021-12-18 16:30:02,734][0m Trial 9 finished with value: 0.6440648838563696 and parameters: {'n_estimators': 1051, 'learning_rate': 0.20595575622101112, 'max_depth': 6}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.646636
Did not meet early stopping. Best iteration is:
[1446]	valid_0's binary_logloss: 0.644638


[32m[I 2021-12-18 16:31:39,213][0m Trial 10 finished with value: 0.638008447167155 and parameters: {'n_estimators': 1446, 'learning_rate': 0.07463099410514616, 'max_depth': 3}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[414]	valid_0's binary_logloss: 0.641895


[32m[I 2021-12-18 16:32:34,304][0m Trial 11 finished with value: 0.6412551774159683 and parameters: {'n_estimators': 1419, 'learning_rate': 0.24561133551400355, 'max_depth': 6}. Best is trial 3 with value: 0.6455675701424285.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.638985
Did not meet early stopping. Best iteration is:
[1150]	valid_0's binary_logloss: 0.638456


[32m[I 2021-12-18 16:34:07,169][0m Trial 12 finished with value: 0.6467971868466071 and parameters: {'n_estimators': 1151, 'learning_rate': 0.06598057876494431, 'max_depth': 7}. Best is trial 12 with value: 0.6467971868466071.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.639759
Did not meet early stopping. Best iteration is:
[1610]	valid_0's binary_logloss: 0.637966


[32m[I 2021-12-18 16:36:10,551][0m Trial 13 finished with value: 0.647569262937794 and parameters: {'n_estimators': 1611, 'learning_rate': 0.06294795286579005, 'max_depth': 8}. Best is trial 13 with value: 0.647569262937794.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.640149
Did not meet early stopping. Best iteration is:
[1174]	valid_0's binary_logloss: 0.63946


[32m[I 2021-12-18 16:37:50,833][0m Trial 14 finished with value: 0.64535180873417 and parameters: {'n_estimators': 1174, 'learning_rate': 0.04156359369600465, 'max_depth': 8}. Best is trial 13 with value: 0.647569262937794.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.638814
Did not meet early stopping. Best iteration is:
[1512]	valid_0's binary_logloss: 0.638036


[32m[I 2021-12-18 16:39:42,880][0m Trial 15 finished with value: 0.6456329210566499 and parameters: {'n_estimators': 1560, 'learning_rate': 0.11633706918096447, 'max_depth': 8}. Best is trial 13 with value: 0.647569262937794.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.643476
Did not meet early stopping. Best iteration is:
[1976]	valid_0's binary_logloss: 0.639866


[32m[I 2021-12-18 16:42:28,441][0m Trial 16 finished with value: 0.6455657724751122 and parameters: {'n_estimators': 1976, 'learning_rate': 0.029064272370530952, 'max_depth': 7}. Best is trial 13 with value: 0.647569262937794.[0m


Training until validation scores don't improve for 300 rounds.
Did not meet early stopping. Best iteration is:
[527]	valid_0's binary_logloss: 0.641158


[32m[I 2021-12-18 16:43:15,752][0m Trial 17 finished with value: 0.6446882282726084 and parameters: {'n_estimators': 527, 'learning_rate': 0.07334916830804662, 'max_depth': 8}. Best is trial 13 with value: 0.647569262937794.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.639196
Did not meet early stopping. Best iteration is:
[1588]	valid_0's binary_logloss: 0.636983


[32m[I 2021-12-18 16:45:20,763][0m Trial 18 finished with value: 0.6473801412230448 and parameters: {'n_estimators': 1590, 'learning_rate': 0.05934973921071716, 'max_depth': 7}. Best is trial 13 with value: 0.647569262937794.[0m


Training until validation scores don't improve for 300 rounds.
[1000]	valid_0's binary_logloss: 0.64785
Did not meet early stopping. Best iteration is:
[1607]	valid_0's binary_logloss: 0.644642


[32m[I 2021-12-18 16:47:29,219][0m Trial 19 finished with value: 0.6363619047619048 and parameters: {'n_estimators': 1607, 'learning_rate': 0.02786887668911511, 'max_depth': 4}. Best is trial 13 with value: 0.647569262937794.[0m


In [None]:
# Display the hyperparameters of the best trial recorded by `study`.
study.best_params

{'learning_rate': 0.06294795286579005, 'max_depth': 8, 'n_estimators': 1611}

In [None]:
# Rebuild the classifier from the study's best hyperparameters and report
# its mean F1 score over 5 cross-validation folds.
para = study.best_params

model = LGBMClassifier(**para, random_state=42, n_jobs=-1)

cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=5,
    n_jobs=-1,
).mean()

0.6170931536618904

In [None]:
from optuna.integration import SkoptSampler

# Repeat the 20-trial search over `run`, this time with SkoptSampler as the
# study's sampler.
# NOTE(review): the recorded output of this cell is an ImportError, so the
# assignment to `study` did not complete. Presumably the scikit-optimize
# backend is not installed in this environment — confirm and install it
# before re-running.
study = optuna.create_study(direction="maximize", sampler=SkoptSampler())
study.optimize(run, n_trials=20)

ImportError: ignored

In [None]:
# Display the hyperparameters of the best trial recorded by `study`.
# NOTE(review): the sampler cell just above is recorded with an ImportError,
# so `study` here is presumably still the one from an earlier cell — confirm.
study.best_params

In [None]:
# Rebuild the classifier from the study's best hyperparameters and report
# its mean F1 score over 5 cross-validation folds.
# NOTE(review): the preceding sampler cell is recorded with an ImportError,
# so `study` is presumably left over from an earlier cell — confirm before
# attributing this score to that sampler.
para = study.best_params

model = LGBMClassifier(**para, random_state=42, n_jobs=-1)

cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=5,
    n_jobs=-1,
).mean()

In [None]:
from optuna.integration import SkoptSampler

# Build a SkoptSampler with explicit options passed through `skopt_kwargs`.
# The previous literal listed 'base_estimator' twice ('RF', then 'ET');
# Python keeps only the last value for a duplicated key, so 'ET' was always
# the effective setting. The dead 'RF' entry is removed to make that explicit.
# NOTE(review): the recorded output of this cell is an ImportError —
# presumably the scikit-optimize backend is not installed; confirm.
sampler = SkoptSampler(
    skopt_kwargs={
        'base_estimator': 'ET',
        'n_random_starts': 0,
        'acq_func': 'EI',
        'acq_func_kwargs': {'xi': 0.01},
    }
)

# Run a 20-trial search over `run`, maximizing its return value.
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(run, n_trials=20)

ImportError: ignored

In [None]:
# Display the hyperparameters of the best trial recorded by `study`.
# NOTE(review): the sampler cell just above is recorded with an ImportError,
# so `study` here is presumably still the one from an earlier cell — confirm.
study.best_params

In [None]:
# Rebuild the classifier from the study's best hyperparameters and report
# its mean F1 score over 5 cross-validation folds.
# NOTE(review): the preceding sampler cell is recorded with an ImportError,
# so `study` is presumably left over from an earlier cell — confirm before
# attributing this score to that sampler.
para = study.best_params

model = LGBMClassifier(**para, random_state=42, n_jobs=-1)

cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=5,
    n_jobs=-1,
).mean()