In [1]:
import pandas as pd
import os
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import xgboost as xgb

import gc
from numba import jit

# Seed shared by the train/validation split and the CatBoost parameters.
random_seed = 33

# Input CSV file name and the directory prefix under which the
# optimisation log is appended.
train_file = 'train_1225.csv'
write_path = './BayesOpt_result/'

# Model family to tune; must be one of the keys of _RECORD_FILES below.
model_type = 'catboost'
# model_type = 'lgb'
# model_type = 'xgb'

# Log file name for each supported model family.
_RECORD_FILES = {
    'catboost': 'catboost_result.txt',
    'lgb': 'lgb_result.txt',
    'xgb': 'xgb_result.txt',
}
if model_type not in _RECORD_FILES:
    # Fail here with a clear message instead of leaving record_file
    # undefined and hitting a NameError much later.
    raise ValueError(
        'Unsupported model_type {!r}; expected one of {}'.format(
            model_type, sorted(_RECORD_FILES)))
record_file = _RECORD_FILES[model_type]

# Optimisation budget: number of guided iterations and of initial random probes.
iteration = 150
init_it = 50
# Names of the columns to treat as categorical features (none by default).
category_list = []


In [2]:
@jit
def qwk(a1, a2):
    """
    Quadratic weighted kappa between two integer rating sequences.

    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    The highest rating is fixed to ``max_rat = 3`` inside the body (it is not
    a parameter), so every rating must be an integer in 0..3 because it is
    used directly as a histogram index.

    NOTE: there is no guard for ``e == 0`` (e.g. both sequences consist of one
    and the same rating, or the input is empty); the final division is then a
    division by zero.

    :param a1: first rating sequence; cast to an integer array
    :param a2: second rating sequence, read at the same positions as ``a1``;
        cast to an integer array
    :return: ``1 - o / e`` where ``o`` is the sum of squared rating
        differences and ``e`` is the squared-difference sum weighted by the
        two rating histograms, divided by the number of samples
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    # Per-rating counts for each of the two sequences.
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    # o: observed sum of squared differences between paired ratings.
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    # e: squared difference of every rating pair, weighted by the product of
    # the two histogram counts.
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    # Normalise by the sample count so e is on the same scale as o.
    e = e / a1.shape[0]

    return 1 - o / e

def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Fast cappa eval function for lgb.

    Buckets the continuous predictions into the rating classes 0..3 using
    fixed cut points and scores them against ``y_true`` with ``qwk``.

    The classes are written to a new array; ``y_pred`` itself is left
    untouched, so the caller's prediction array is not overwritten with
    class labels.

    :param y_true: true ratings, passed straight through to ``qwk``
    :param y_pred: array-like of raw regression outputs
    :return: ``('cappa', score, True)``; the trailing ``True`` flags the
        metric as higher-is-better
    """
    # Inclusive upper edges of classes 0, 1 and 2; anything above the last
    # edge falls into class 3.
    cut_points = np.array([1.12232214, 1.73925866, 2.22506454])
    # right=True makes every interval (lower, upper], i.e. a prediction equal
    # to a cut point belongs to the lower class.
    y_class = np.digitize(np.asarray(y_pred), cut_points, right=True)

    return 'cappa', qwk(y_true, y_class), True


In [3]:
# Load the preprocessed training table.
train = pd.read_csv('../data/preprocess/'+train_file)
cols_to_drop = ['game_session', 'installation_id', 'timestamp', 'accuracy_group', 'timestampDate']
# Take the target before its column is dropped from the feature table.
y = train['accuracy_group'].copy()
# errors='ignore' skips any listed column that is absent from this CSV.
train = train.drop(columns=cols_to_drop, errors='ignore')
X = train

## Validation set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_seed, stratify=y)

# Positional indices of the columns named in category_list.
categorical_features_indices = np.where(X_train.columns.isin(category_list))[0]
# The indices are positions, so use .iloc rather than label-based [] lookup.
print(X_train.dtypes.iloc[categorical_features_indices])

## CatBoost tuning
## https://catboost.ai/docs/concepts/loss-functions-regression.html
# Fixed CatBoost settings; the keys listed in param_range_cat are overwritten
# for every optimisation trial.
# Options tried previously and left disabled: rsm, fold_permutation_block,
# feature_border_type, boosting_type='Ordered',
# leaf_estimation_backtracking='Armijo', grow_policy='Lossguide'.
param_cat = dict(
    loss_function='RMSE',  # alternative: MAE
    eval_metric='RMSE',
    iterations=14000,
    random_seed=random_seed,
    thread_count=5,
    task_type="GPU",
    devices='0:1',
    learning_rate=0.03,
    l2_leaf_reg=20,
    depth=7,
    bagging_temperature=0.3,
    random_strength=10,
    one_hot_max_size=200,
)

# Search bounds handed to the optimiser. depth and max_leaves are truncated
# with int() by the objective, so (31, 31.5) always yields max_leaves == 31.
# Wider bounds tried previously: depth (5, 16.9), max_leaves (20, 45).
param_range_cat = dict(
    depth=(5, 12.9),
    max_leaves=(31, 31.5),
    l2_leaf_reg=(1, 100),
    bagging_temperature=(0.01, 5),
)

## LightGBM tuning
## https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html#lightgbm.LGBMRegressor
## https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

# Fixed LightGBM settings; the keys listed in param_range_lgb are overwritten
# for every optimisation trial. 'verbose' and 'early_stopping_rounds' are also
# read back from this dict when the model is fitted.
param_lgb = dict(
    n_estimators=10000,
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    subsample=0.75,
    subsample_freq=1,
    learning_rate=0.01,
    feature_fraction=0.9,
    max_depth=15,
    lambda_l1=1,
    lambda_l2=1,
    verbose=100,
    early_stopping_rounds=400,
    eval_metric='cappa',
)

# Search bounds handed to the optimiser; max_depth, num_leaves and
# subsample_freq are truncated with int() by the objective.
param_range_lgb = dict(
    max_depth=(5, 16),
    num_leaves=(15, 100),
    feature_fraction=(0.3, 1),
    subsample=(0.2, 1),
    subsample_freq=(1, 3),
    lambda_l1=(0.1, 50),
    lambda_l2=(0.1, 50),
)


# Fixed parameters and search bounds for each supported model family.
# NOTE(review): 'xgb' reuses the LightGBM settings; no XGBoost-specific
# parameters are defined here.
_PARAMS_BY_MODEL = {
    'catboost': (param_cat, param_range_cat),
    'lgb': (param_lgb, param_range_lgb),
    'xgb': (param_lgb, param_range_lgb),
}
if model_type not in _PARAMS_BY_MODEL:
    # Stop here instead of continuing with empty parameter dicts.
    raise ValueError(
        'Unsupported model_type {!r}; expected one of {}'.format(
            model_type, sorted(_PARAMS_BY_MODEL)))

# param_clf is the same dict object as the selected defaults (not a copy);
# the objective functions update it in place on every trial.
param_clf, param_range = _PARAMS_BY_MODEL[model_type]

def bys_train_catboost(depth,max_leaves,l2_leaf_reg,bagging_temperature):
    """
    Optimiser objective: fit one CatBoost regressor and return minus its RMSE.

    The sampled values are written into the module-level ``param_clf`` dict
    (``depth`` and ``max_leaves`` truncated to int), a model is trained on
    ``X_train``/``y_train`` with ``X_test``/``y_test`` as evaluation set, and
    the trial is printed and appended to ``write_path + record_file``.

    :param depth: sampled tree depth (float, truncated with ``int``)
    :param max_leaves: sampled leaf limit (float, truncated with ``int``)
    :param l2_leaf_reg: sampled L2 leaf regularisation
    :param bagging_temperature: sampled bagging temperature
    :return: negated best validation RMSE, so that maximising it minimises RMSE
    """
    depth_int = int(depth)
    leaves_int = int(max_leaves)
    param_clf.update(
        depth=depth_int,
        max_leaves=leaves_int,
        l2_leaf_reg=l2_leaf_reg,
        bagging_temperature=bagging_temperature,
    )

    regressor = CatBoostRegressor(**param_clf)
    regressor.fit(
        X_train, y_train,
        cat_features=categorical_features_indices,
        eval_set=(X_test, y_test),
        early_stopping_rounds=200,
        verbose=50,
    )

    score_max = regressor.get_best_score()['validation']['RMSE']

    print(depth_int, leaves_int, l2_leaf_reg, bagging_temperature)
    print(score_max)

    # One "name value" line per parameter, then the score and a blank line.
    trial_record = (
        ('depth', depth_int),
        ('max_leaves', leaves_int),
        ('l2_leaf_reg', l2_leaf_reg),
        ('bagging_temperature', bagging_temperature),
    )
    with open(write_path+record_file,'a') as f:
        for label, value in trial_record:
            print(label, value, file=f)
        print(score_max, file=f)
        print('', file=f)

    return -score_max

def bys_train_lgb(max_depth,num_leaves,feature_fraction,subsample,subsample_freq,lambda_l1,lambda_l2):
    """
    Optimiser objective: fit one LightGBM regressor and return minus its RMSE.

    The sampled values are written into the module-level ``param_clf`` dict
    (``max_depth``, ``num_leaves`` and ``subsample_freq`` truncated to int), a
    model is trained with both the training and the held-out split as
    evaluation sets, and the trial is printed and appended to
    ``write_path + record_file``.

    :return: negated best ``rmse`` on the second evaluation set (``valid_1``),
        so that maximising it minimises RMSE
    """
    depth_int = int(max_depth)
    leaves_int = int(num_leaves)
    freq_int = int(subsample_freq)
    param_clf.update(
        max_depth=depth_int,
        num_leaves=leaves_int,
        feature_fraction=feature_fraction,
        subsample=subsample,
        subsample_freq=freq_int,
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2,
    )

    regressor = lgb.LGBMRegressor(**param_clf)
    regressor.fit(
        X=X_train,
        y=y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric=eval_qwk_lgb_regr,
        verbose=param_clf['verbose'],
        early_stopping_rounds=param_clf['early_stopping_rounds'],
        categorical_feature=categorical_features_indices,
    )

    score_max = regressor.best_score_['valid_1']['rmse']

    print("parameter", depth_int, leaves_int, feature_fraction, subsample, freq_int, lambda_l1, lambda_l2)
    print("score", score_max)

    # One "name value" line per parameter, then the score and a blank line.
    trial_record = (
        ('max_depth', depth_int),
        ('num_leaves', leaves_int),
        ('feature_fraction', feature_fraction),
        ('subsample', subsample),
        ('subsample_freq', freq_int),
        ('lambda_l1', lambda_l1),
        ('lambda_l2', lambda_l2),
    )
    with open(write_path+record_file,'a') as f:
        for label, value in trial_record:
            print(label, value, file=f)
        print(score_max, file=f)
        print('', file=f)

    return -score_max

def main(write_path='./BayesOpt_result/',record_file='Bayes_result.txt',iteration=5,init_it=5):
    """
    Run the Bayesian hyper-parameter search for the configured ``model_type``.

    Appends the training file name to the log, maximises the objective that
    matches ``model_type`` over ``param_range``, then prints the best result
    and appends it to the log.

    NOTE: the objective functions append their per-trial records to the
    module-level ``write_path + record_file``, not to the arguments given here.

    :param write_path: directory prefix of the log file
    :param record_file: log file name, concatenated onto ``write_path``
    :param iteration: number of guided optimisation steps (``n_iter``)
    :param init_it: number of initial random probes (``init_points``)
    :raises ValueError: if ``model_type`` has no objective function
    """
    # NOTE(review): 'xgb' is routed to the LightGBM objective; there is no
    # XGBoost-specific objective in this notebook.
    objectives = {
        'catboost': bys_train_catboost,
        'lgb': bys_train_lgb,
        'xgb': bys_train_lgb,
    }
    if model_type not in objectives:
        # Previously this fell through and failed on an unbound optimiser.
        raise ValueError(
            'Unsupported model_type {!r}; expected one of {}'.format(
                model_type, sorted(objectives)))

    print(iteration)

    record_path = write_path+record_file
    # Create the log directory up front so opening in append mode cannot fail
    # on a missing folder.
    out_dir = os.path.dirname(record_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(record_path,'a') as f:
        print('\n{}'.format(train_file),file=f)

    # Seed the optimiser so the sequence of probed points is repeatable.
    # (Acquisition settings such as acq="ucb", kappa=1 were tried earlier.)
    Bys_opt = BayesianOptimization(objectives[model_type], param_range,
                                   random_state=random_seed)

    Bys_opt.maximize(n_iter=iteration, init_points=init_it)
    print(Bys_opt.max)

    with open(record_path,'a') as f:
        print('Max para',Bys_opt.max,file=f)

main(write_path=write_path, record_file=record_file, iteration=iteration,init_it=init_it)


Series([], dtype: object)
150
|   iter    |  target   | baggin... |   depth   | l2_lea... | max_le... |
-------------------------------------------------------------------------




0:	learn: 1.2441130	test: 1.2434949	best: 1.2434949 (0)	total: 67.5ms	remaining: 15m 45s
50:	learn: 0.8993433	test: 0.8915629	best: 0.8915629 (50)	total: 11.9s	remaining: 54m 9s
100:	learn: 0.7460017	test: 0.7407709	best: 0.7407709 (100)	total: 21.5s	remaining: 49m 17s
150:	learn: 0.6723872	test: 0.6698581	best: 0.6698581 (150)	total: 34.4s	remaining: 52m 31s
200:	learn: 0.6235581	test: 0.6233770	best: 0.6233770 (200)	total: 43.8s	remaining: 50m 5s
250:	learn: 0.5886462	test: 0.5911063	best: 0.5911063 (250)	total: 54.8s	remaining: 50m 3s
300:	learn: 0.5633263	test: 0.5682644	best: 0.5682644 (300)	total: 57.2s	remaining: 43m 24s
350:	learn: 0.5406572	test: 0.5479775	best: 0.5479775 (350)	total: 59.7s	remaining: 38m 40s
400:	learn: 0.5198062	test: 0.5296231	best: 0.5296231 (400)	total: 1m 2s	remaining: 35m 10s
450:	learn: 0.4976884	test: 0.5113368	best: 0.5113368 (450)	total: 1m 4s	remaining: 32m 31s
500:	learn: 0.4810102	test: 0.4974660	best: 0.4974660 (500)	total: 1m 7s	remaining: 30m 

4400:	learn: 0.2861195	test: 0.3965572	best: 0.3965572 (4400)	total: 4m 42s	remaining: 10m 16s
4450:	learn: 0.2853916	test: 0.3963908	best: 0.3963773 (4443)	total: 4m 45s	remaining: 10m 11s
4500:	learn: 0.2848254	test: 0.3963794	best: 0.3963631 (4495)	total: 4m 47s	remaining: 10m 7s
4550:	learn: 0.2842154	test: 0.3961664	best: 0.3961664 (4550)	total: 4m 50s	remaining: 10m 2s
4600:	learn: 0.2836768	test: 0.3960831	best: 0.3960685 (4597)	total: 4m 52s	remaining: 9m 58s
4650:	learn: 0.2830650	test: 0.3960007	best: 0.3960007 (4650)	total: 4m 55s	remaining: 9m 53s
4700:	learn: 0.2825128	test: 0.3958660	best: 0.3958660 (4700)	total: 4m 57s	remaining: 9m 49s
4750:	learn: 0.2818520	test: 0.3957631	best: 0.3957631 (4750)	total: 5m	remaining: 9m 44s
4800:	learn: 0.2813197	test: 0.3957705	best: 0.3956921 (4763)	total: 5m 2s	remaining: 9m 40s
4850:	learn: 0.2806911	test: 0.3956542	best: 0.3956400 (4848)	total: 5m 5s	remaining: 9m 36s
4900:	learn: 0.2799761	test: 0.3954342	best: 0.3954137 (4898)	to

KeyboardInterrupt: 

## Understanding how it works
## https://github.com/fmfn/BayesianOptimization/blob/master/examples/visualization.ipynb


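#### The more init steps, the less likely the search is to get stuck in a local minimum
#### Kappa determines the width of the confidence interval
#### I do not yet understand kappa or how the GP (Gaussian process) works, sigh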