In [1]:
import pandas as pd
import os
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import xgboost as xgb

import gc
from numba import jit

random_seed = 33

train_file = 'train895.csv'    
write_path = './BayesOpt_result/'

# model_type = 'catboost'
model_type = 'lgb'
# model_type = 'xgb'

if model_type == 'catboost':
    record_file = 'catboost_result.txt'
elif model_type == 'lgb':
    record_file = 'lgb_result.txt'
elif model_type == 'xgb':
    record_file = 'xgb_result.txt'
else:
    pass

iteration = 150
init_it = 50
category_list = []


In [2]:
@jit
def qwk(a1, a2):
    """
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :param max_rat:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e

def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Fast cappa eval function for lgb.
    """
    y_pred[y_pred <= 1.12232214] = 0
    y_pred[np.where(np.logical_and(y_pred > 1.12232214, y_pred <= 1.73925866))] = 1
    y_pred[np.where(np.logical_and(y_pred > 1.73925866, y_pred <= 2.22506454))] = 2
    y_pred[y_pred > 2.22506454] = 3

    # y_pred = y_pred.reshape(len(np.unique(y_true)), -1).argmax(axis=0)

    return 'cappa', qwk(y_true, y_pred), True


In [None]:
train = pd.read_csv('../data/preprocess/'+train_file)
cols_to_drop = ['game_session', 'installation_id', 'timestamp', 'accuracy_group', 'timestampDate']
y = train['accuracy_group'].copy()
for c in cols_to_drop:
    if c in train.columns:
        train = train.drop(columns = c)
X = train

## 驗證集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_seed, stratify=y)
     
categorical_features_indices = np.where(X_train.columns.isin(category_list))[0]
print(X_train.dtypes[categorical_features_indices])

## catboost 調參
## https://catboost.ai/docs/concepts/loss-functions-regression.html
param_cat={
    'loss_function':'RMSE', ##MAE
    'eval_metric':'RMSE',
    
    'iterations':14000,
    'random_seed':random_seed,
    'thread_count':1,
    'task_type':"GPU",
    'devices':'0:1',
#     'boosting_type':'Ordered',
    'learning_rate':0.03,
    'l2_leaf_reg':20,#20
    'depth':7,
    'bagging_temperature':0.3,
    'random_strength':10,
    # 'rsm':0.8,

    # 'fold_permutation_block':1,
    # 'feature_border_type':'MinEntropy',
    # 'boosting_type':'Ordered',
    # 'leaf_estimation_backtracking':'Armijo',
    
    'one_hot_max_size':200,
    'grow_policy':'Lossguide',
}
param_range_cat={
    'depth':(5,16.9),
    'max_leaves':(20,45),
    'l2_leaf_reg':(1,100),
    'bagging_temperature':(0.01,5)  
}

## LGB調參
## https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html#lightgbm.LGBMRegressor
## https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

param_lgb = {'n_estimators':10000,
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'subsample': 0.75,
            'subsample_freq': 1,
            'learning_rate': 0.04,
            'feature_fraction': 0.9,
            'max_depth': 15,
            'lambda_l1': 1,  
            'lambda_l2': 1,
            'verbose': 100,
            'early_stopping_rounds': 400, 'eval_metric': 'cappa'
            }

param_range_lgb={
    'max_depth': (5,16),
    'num_leaves': (15,100),
    'feature_fraction': (0.3,1),
    
    'subsample':(0.2,1),
    'subsample_freq':(1,3),
    'lambda_l1':(0.1,50),
    'lambda_l2':(0.1,50),
}


param_clf = {}
param_range = {}

if model_type == 'catboost':
    param_clf = param_cat
    param_range = param_range_cat
elif model_type == 'lgb':
    param_clf = param_lgb
    param_range = param_range_lgb
elif model_type == 'xgb':
    param_clf = param_lgb
    param_range = param_range_lgb
else:
    pass

def bys_train_catboost(depth,max_leaves,l2_leaf_reg,bagging_temperature):
    param_clf['depth']=int(depth)
    param_clf['max_leaves']=int(max_leaves)
    param_clf['l2_leaf_reg']=l2_leaf_reg
    param_clf['bagging_temperature']=bagging_temperature
    
    model = CatBoostRegressor(**param_clf)
    model.fit(X_train, y_train,
    cat_features=categorical_features_indices,    
    eval_set=(X_test, y_test),
    early_stopping_rounds=1000,
    verbose=50) 
     
    score_max = model.get_best_score()['validation']['RMSE']
    
    print(int(depth),int(max_leaves),l2_leaf_reg,bagging_temperature)
    print(score_max)
    
    with open(write_path+record_file,'a') as f:
        print('depth',int(depth),file=f)
        print('max_leaves',int(max_leaves),file=f)
        print('l2_leaf_reg',l2_leaf_reg,file=f)
        print('bagging_temperature',bagging_temperature,file=f)
        print(score_max,file=f)
        print('',file=f)
    
    return -score_max

def bys_train_lgb(max_depth,num_leaves,feature_fraction,subsample,subsample_freq,lambda_l1,lambda_l2):
    param_clf['max_depth']=int(max_depth)
    param_clf['num_leaves']=int(num_leaves)
    param_clf['feature_fraction']=feature_fraction
    param_clf['subsample']=subsample
    param_clf['subsample_freq']=int(subsample_freq)
    param_clf['lambda_l1']=lambda_l1
    param_clf['lambda_l2']=lambda_l2
        
    model = lgb.LGBMRegressor(**param_clf)
    model.fit(X=X_train, y=y_train,
              eval_set=[(X_train, y_train),(X_test, y_test)], eval_metric=eval_qwk_lgb_regr,
              verbose=param_clf['verbose'], early_stopping_rounds=param_clf['early_stopping_rounds'],
              categorical_feature=categorical_features_indices)

    score_max = model.best_score_['valid_1']['rmse']
    
    print("parameter",int(max_depth),int(num_leaves),feature_fraction,subsample,int(subsample_freq),lambda_l1,lambda_l2)
    print("score",score_max)
    
    with open(write_path+record_file,'a') as f:
        print('max_depth',int(max_depth),file=f)
        print('num_leaves',int(num_leaves),file=f)
        print('feature_fraction',feature_fraction,file=f)
        print('subsample',subsample,file=f)
        print('subsample_freq',int(subsample_freq),file=f)
        print('lambda_l1',lambda_l1,file=f)
        print('lambda_l2',lambda_l2,file=f)
        print(score_max,file=f)
        print('',file=f)
    
    return -score_max

def main(write_path='./BayesOpt_result/',record_file='Bayes_result.txt',iteration=5,init_it=5):
    print(iteration)
    with open(write_path+record_file,'a') as f:
        print('\n{}'.format(train_file),file=f)
        
    if model_type == 'catboost':
        Bys_opt = BayesianOptimization(bys_train_catboost,param_range)#, acq="ucb", kappa=1) 
    elif model_type == 'lgb':
        Bys_opt = BayesianOptimization(bys_train_lgb,param_range)#, acq="ucb", kappa=1) 
    elif model_type == 'xgb':
        Bys_opt = BayesianOptimization(bys_train_lgb,param_range)#, acq="ucb", kappaand=1) 
    else:
        pass
    
    Bys_opt.maximize(n_iter=iteration, init_points=init_it)
    print(Bys_opt.max)
    
    with open(write_path+record_file,'a') as f:
        print('Max para',Bys_opt.max,file=f)

main(write_path=write_path, record_file=record_file, iteration=iteration,init_it=init_it)


Series([], dtype: object)
150
|   iter    |  target   | featur... | lambda_l1 | lambda_l2 | max_depth | num_le... | subsample | subsam... |
-------------------------------------------------------------------------------------------------------------


  if self.categorical_feature == categorical_feature:
  elif categorical_feature == 'auto':
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
  if categorical_feature == 'auto':  # use cat cols from DataFrame
Compilation is falling back to object mode WITH looplifting enabled because Function "qwk" failed type inference due to: Invalid use of Function(<function asarray at 0x7fe7ec233048>) with argument(s) of type(s): (array(float32, 1d, C), dtype=Function(<class 'int'>))
 * parameterized
In definition 0:
    AttributeError: 'Function' object has no attribute 'dtype'
    raised from /home/jupyter-shliu/.local/lib/python3.6/site-packages/numba/targets/arraymath.py:3845
In definition 1:
    AttributeError: 'Function' object has no attribute 'dtype'
    raised from /home/jupyter-shliu/.local/lib/python3.6/site-packages/numba/targets/arraymath.py:3845
This error is usually caused by passing an argument of a type that is unsupported by

Training until validation scores don't improve for 400 rounds
[100]	training's rmse: 0.941444	training's cappa: 0.647882	valid_1's rmse: 0.991362	valid_1's cappa: 0.59634
[200]	training's rmse: 0.883198	training's cappa: 0.698275	valid_1's rmse: 0.990779	valid_1's cappa: 0.592806
[300]	training's rmse: 0.840723	training's cappa: 0.735971	valid_1's rmse: 0.989665	valid_1's cappa: 0.594965
[400]	training's rmse: 0.806154	training's cappa: 0.762729	valid_1's rmse: 0.994067	valid_1's cappa: 0.59345
[500]	training's rmse: 0.775602	training's cappa: 0.787232	valid_1's rmse: 0.998202	valid_1's cappa: 0.585582
Early stopping, best iteration is:
[142]	training's rmse: 0.913951	training's cappa: 0.672138	valid_1's rmse: 0.990058	valid_1's cappa: 0.597985
parameter 13 84 0.9898951257955306 0.23343323552047243 1 4.5618640500551795 43.0822087640565
score 0.9900579642517305
| [0m 1       [0m | [0m-0.9901  [0m | [0m 0.9899  [0m | [0m 4.562   [0m | [0m 43.08   [0m | [0m 13.33   [0m | [0m 

  if self.categorical_feature == categorical_feature:
  elif categorical_feature == 'auto':
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 400 rounds
[100]	training's rmse: 0.982377	training's cappa: 0.611155	valid_1's rmse: 1.00219	valid_1's cappa: 0.587625
[200]	training's rmse: 0.947126	training's cappa: 0.639105	valid_1's rmse: 0.994355	valid_1's cappa: 0.5915
[300]	training's rmse: 0.92373	training's cappa: 0.661363	valid_1's rmse: 0.991744	valid_1's cappa: 0.595895
[400]	training's rmse: 0.905008	training's cappa: 0.680249	valid_1's rmse: 0.992637	valid_1's cappa: 0.595451
[500]	training's rmse: 0.889225	training's cappa: 0.69461	valid_1's rmse: 0.993684	valid_1's cappa: 0.59384
[600]	training's rmse: 0.874789	training's cappa: 0.70762	valid_1's rmse: 0.994002	valid_1's cappa: 0.593467
[700]	training's rmse: 0.862264	training's cappa: 0.719162	valid_1's rmse: 0.994371	valid_1's cappa: 0.59547
Early stopping, best iteration is:
[312]	training's rmse: 0.92124	training's cappa: 0.663425	valid_1's rmse: 0.991461	valid_1's cappa: 0.598138
parameter 9 38 0.888423251400118

Training until validation scores don't improve for 400 rounds
[100]	training's rmse: 0.924922	training's cappa: 0.662516	valid_1's rmse: 0.986511	valid_1's cappa: 0.596288
[200]	training's rmse: 0.868385	training's cappa: 0.709971	valid_1's rmse: 0.982279	valid_1's cappa: 0.603096
[300]	training's rmse: 0.825147	training's cappa: 0.744628	valid_1's rmse: 0.982878	valid_1's cappa: 0.604607
[400]	training's rmse: 0.79004	training's cappa: 0.774731	valid_1's rmse: 0.986076	valid_1's cappa: 0.599349
[500]	training's rmse: 0.759738	training's cappa: 0.797118	valid_1's rmse: 0.988732	valid_1's cappa: 0.598577
[600]	training's rmse: 0.733546	training's cappa: 0.81365	valid_1's rmse: 0.991091	valid_1's cappa: 0.598914
Early stopping, best iteration is:
[251]	training's rmse: 0.844691	training's cappa: 0.729347	valid_1's rmse: 0.981909	valid_1's cappa: 0.607967
parameter 6 81 0.66587459341075 0.7464360071368126 1 6.609771477330029 14.246695795610785
score 0.9819086600003838
| [95m 9       [0m

[300]	training's rmse: 0.962803	training's cappa: 0.625809	valid_1's rmse: 0.996801	valid_1's cappa: 0.592882
[400]	training's rmse: 0.952379	training's cappa: 0.633675	valid_1's rmse: 0.994313	valid_1's cappa: 0.593846
[500]	training's rmse: 0.943909	training's cappa: 0.641665	valid_1's rmse: 0.993267	valid_1's cappa: 0.595245
[600]	training's rmse: 0.936682	training's cappa: 0.649039	valid_1's rmse: 0.993329	valid_1's cappa: 0.594714
[700]	training's rmse: 0.930047	training's cappa: 0.655768	valid_1's rmse: 0.992773	valid_1's cappa: 0.595933
[800]	training's rmse: 0.923954	training's cappa: 0.66301	valid_1's rmse: 0.992412	valid_1's cappa: 0.59383
[900]	training's rmse: 0.918755	training's cappa: 0.667199	valid_1's rmse: 0.99254	valid_1's cappa: 0.594674
[1000]	training's rmse: 0.913688	training's cappa: 0.672459	valid_1's rmse: 0.9928	valid_1's cappa: 0.595211
[1100]	training's rmse: 0.909611	training's cappa: 0.675872	valid_1's rmse: 0.992437	valid_1's cappa: 0.595408
[1200]	traini

[900]	training's rmse: 0.889549	training's cappa: 0.693966	valid_1's rmse: 0.989014	valid_1's cappa: 0.601016
Early stopping, best iteration is:
[573]	training's rmse: 0.91275	training's cappa: 0.673979	valid_1's rmse: 0.990609	valid_1's cappa: 0.6027
parameter 9 32 0.8628754636880334 0.3976674095008604 2 49.46395434908197 33.888706586612805
score 0.9906089732043881
| [0m 20      [0m | [0m-0.9906  [0m | [0m 0.8629  [0m | [0m 49.46   [0m | [0m 33.89   [0m | [0m 9.8     [0m | [0m 32.95   [0m | [0m 0.3977  [0m | [0m 2.18    [0m |
Training until validation scores don't improve for 400 rounds
[100]	training's rmse: 0.929978	training's cappa: 0.659518	valid_1's rmse: 0.986128	valid_1's cappa: 0.601181
[200]	training's rmse: 0.875252	training's cappa: 0.703783	valid_1's rmse: 0.983092	valid_1's cappa: 0.603944
[300]	training's rmse: 0.836977	training's cappa: 0.736217	valid_1's rmse: 0.984057	valid_1's cappa: 0.601298
[400]	training's rmse: 0.80601	training's cappa: 0.760127

## 原理理解
## https://github.com/fmfn/BayesianOptimization/blob/master/examples/visualization.ipynb


#### init step越多，越不容易卡在local mini
#### Kappa決定confidence interval範圍
#### 不了解kappa, GP的過程 唉