In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/'My Drive'/'kaggle'/'LinkingWritingProcess'

/content/drive/My Drive/kaggle/LinkingWritingProcess


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


## (1) Imports

In [None]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

import re
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import datetime as dt
from tqdm.auto import tqdm
tqdm.pandas()

from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor,GradientBoostingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import optuna
from sklearn.ensemble import AdaBoostRegressor

from scipy.optimize import minimize

./data/sample_submission.csv
./data/test_logs.csv
./data/train_logs.csv
./data/train_scores.csv
./data/Essay Constructor Result.csv


## (2) Load data

In [None]:
# 訓練データ読み込み
train_logs = pd.read_csv("./data/train_logs.csv")
train_scores = pd.read_csv("./data/train_scores.csv")
# 検証データ読み込み
test_logs = pd.read_csv("./data/test_logs.csv")

print("train_df： ", train_logs.shape)
print("train_scores: ", train_scores.shape)
print("test_df: ", test_logs.shape)

train_df：  (8405898, 11)
train_scores:  (2471, 2)
test_df:  (6, 11)


## (3) Helper functions

In [None]:
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',',
          'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']


def activity_counts(df):
    tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
    ret = list()
    for li in tqdm(tmp_df['activity'].values):
        items = list(Counter(li).items())
        di = dict()
        for k in activities:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'activity_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret


def event_counts(df, colname):
    tmp_df = df.groupby('id').agg({colname: list}).reset_index()
    ret = list()
    for li in tqdm(tmp_df[colname].values):
        items = list(Counter(li).items())
        di = dict()
        for k in events:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'{colname}_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret


def text_change_counts(df):
    tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
    ret = list()
    for li in tqdm(tmp_df['text_change'].values):
        items = list(Counter(li).items())
        di = dict()
        for k in text_changes:
            di[k] = 0
        for item in items:
            k, v = item[0], item[1]
            if k in di:
                di[k] = v
        ret.append(di)
    ret = pd.DataFrame(ret)
    cols = [f'text_change_{i}_count' for i in range(len(ret.columns))]
    ret.columns = cols
    return ret


punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']
def match_punctuations(df):
    tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
    ret = list()
    for li in tqdm(tmp_df['down_event'].values):
        cnt = 0
        items = list(Counter(li).items())
        for item in items:
            k, v = item[0], item[1]
            if k in punctuations:
                cnt += v
        ret.append(cnt)
    ret = pd.DataFrame({'punct_cnt': ret})
    return ret


def get_input_words(df):
    tmp_df = df[(~df['text_change'].str.contains('=>'))&(df['text_change'] != 'NoChange')].reset_index(drop=True)
    tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
    tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
    tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
    tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
    tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
    tmp_df.drop(['text_change'], axis=1, inplace=True)
    return tmp_df

## (4) Make features

In [None]:
def make_feats(df):

    # id
    feats = pd.DataFrame({'id': df['id'].unique().tolist()})

    # time shift
    df['up_time_shift1'] = df.groupby('id')['up_time'].shift(1)
    df['action_time_gap'] = df['down_time'] - df['up_time_shift1']
    df.drop('up_time_shift1', axis=1, inplace=True)

    # cursor position shift
    df['cursor_position_shift1'] = df.groupby('id')['cursor_position'].shift(1)
    df['cursor_position_change'] = np.abs(df['cursor_position'] - df['cursor_position_shift1'])
    df.drop('cursor_position_shift1', axis=1, inplace=True)

    # word count shift
    df['word_count_shift1'] = df.groupby('id')['word_count'].shift(1)
    df['word_count_change'] = np.abs(df['word_count'] - df['word_count_shift1'])
    df.drop('word_count_shift1', axis=1, inplace=True)

    # stats feats
    for item in tqdm([
        ('event_id', ['max']),
        ('up_time', ['max']),
        ('action_time', ['sum', 'max', 'mean', 'std']),
        ('activity', ['nunique']),
        ('down_event', ['nunique']),
        ('up_event', ['nunique']),
        ('text_change', ['nunique']),
        ('cursor_position', ['nunique', 'max', 'mean']),
        ('word_count', ['nunique', 'max', 'mean']),
        ('action_time_gap', ['max', 'min', 'mean', 'std', 'sum']),
        ('cursor_position_change', ['max', 'mean', 'std', 'sum']),
        ('word_count_change', ['max', 'mean', 'std', 'sum'])
    ]):
        colname, methods = item[0], item[1]
        for method in methods:
            tmp_df = df.groupby(['id']).agg({colname: method}).reset_index().rename(columns={colname: f'{colname}_{method}'})
            feats = feats.merge(tmp_df, on='id', how='left')

    # counts
    tmp_df = activity_counts(df)
    feats = pd.concat([feats, tmp_df], axis=1)
    tmp_df = event_counts(df, 'down_event')
    feats = pd.concat([feats, tmp_df], axis=1)
    tmp_df = event_counts(df, 'up_event')
    feats = pd.concat([feats, tmp_df], axis=1)
    tmp_df = text_change_counts(df)
    feats = pd.concat([feats, tmp_df], axis=1)
    tmp_df = match_punctuations(df)
    feats = pd.concat([feats, tmp_df], axis=1)

    # input words
    tmp_df = get_input_words(df)
    feats = pd.merge(feats, tmp_df, on='id', how='left')

    # compare feats
    feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
    feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
    feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
    feats['idle_time_ratio'] = feats['action_time_gap_sum'] / feats['up_time_max']

    return feats

In [None]:
train_feats = make_feats(train_logs)
test_feats = make_feats(test_logs)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/2471 [00:00<?, ?it/s]

  0%|          | 0/2471 [00:00<?, ?it/s]

  0%|          | 0/2471 [00:00<?, ?it/s]

  0%|          | 0/2471 [00:00<?, ?it/s]

  0%|          | 0/2471 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
train_feats = train_feats.merge(train_scores, on='id', how='left')
print('Train : ', train_feats.shape, ' Test : ', test_feats.shape)

Train :  (2471, 92)  Test :  (3, 91)


In [None]:
display(train_feats.head())
display(test_feats.head())

Unnamed: 0,id,event_id_max,up_time_max,action_time_sum,action_time_max,action_time_mean,action_time_std,activity_nunique,down_event_nunique,up_event_nunique,...,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio,score
0,001519c8,2557,1801969,297243,2259,116.246774,91.797374,7,12,12,...,37,366,5.325137,20,3.487804,0.000142,0.100117,0.001419,0.832534,3.5
1,0022f953,2454,1788969,275391,1758,112.221271,55.431189,5,17,17,...,53,385,4.41039,33,3.199496,0.000181,0.131622,0.001372,0.828944,3.5
2,0042269b,4136,1771669,421201,3005,101.837766,82.383766,4,13,18,...,47,627,5.446571,25,3.474895,0.000228,0.097679,0.002335,0.759751,6.0
3,0059420b,1556,1404469,189596,806,121.848329,113.768226,5,15,15,...,18,251,4.609562,19,2.949601,0.000147,0.132391,0.001108,0.835531,2.0
4,0075873a,2531,1662472,313702,701,123.943896,62.082013,3,11,11,...,66,412,4.76699,18,2.986064,0.000152,0.099565,0.001522,0.764103,4.0


Unnamed: 0,id,event_id_max,up_time_max,action_time_sum,action_time_max,action_time_mean,action_time_std,activity_nunique,down_event_nunique,up_event_nunique,...,text_change_14_count,punct_cnt,input_word_count,input_word_length_mean,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio
0,0000aaaa,2,760160,172,87,86.0,1.414214,1,1,1,...,0,0,0,0.0,0,0.0,0.0,0.0,3e-06,0.554561
1,2222bbbb,2,712023,113,67,56.5,14.849242,1,1,1,...,0,0,1,2.0,2,0.0,1e-06,0.5,3e-06,-0.592005
2,4444cccc,2,635641,150,94,75.0,26.870058,1,2,2,...,0,0,1,1.0,1,0.0,2e-06,0.5,3e-06,-0.708962


## (5) Baseline

In [None]:
# 説明変数と目的変数を設定
X = train_feats.drop(columns = ['id', 'score'], axis=1)
y = train_feats.score
X_total = test_feats.drop(columns=['id'], axis=1)

print('train = ', X.shape, 'test = ', X_total.shape, 'target = ', y.shape)

train =  (2471, 90) test =  (3, 90) target =  (2471,)


In [None]:
# モデルの定義
model_1 = CatBoostRegressor(verbose=0)
model_2 = xgb.XGBRegressor()
model_3 = lgb.LGBMRegressor()
model_4 = RandomForestRegressor()
model_5 = HistGradientBoostingRegressor()

# クロスバリデーションを行なって精度の比較
print('Score baseline CatBoostRegressor: ', cross_val_score(model_1, X, y, cv=5, scoring='neg_root_mean_squared_error').mean())
print('Score baseline XGBRegressor: ', cross_val_score(model_2, X, y, cv=5, scoring='neg_root_mean_squared_error').mean())
print('Score baseline LGBMRegressor: ', cross_val_score(model_3, X, y, cv=5, scoring='neg_root_mean_squared_error').mean())
print('Score baseline RandomForestRegressor: ', cross_val_score(model_4, X, y, cv=5, scoring='neg_root_mean_squared_error').mean())
print('Score baseline HostGradientBoostingRegressor: ', cross_val_score(model_5, X, y, cv=5, scoring='neg_root_mean_squared_error').mean())

Score baseline CatBoostRegressor:  -0.6579840647475601
Score baseline XGBRegressor:  -0.7087934576099262
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13741
[LightGBM] [Info] Number of data points in the train set: 1976, number of used features: 88
[LightGBM] [Info] Start training from score 3.710273
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13741
[LightGBM] [Info] Number of data points in the train set: 1977, number of used features: 88
[LightGBM] [Info] Start training from score 3.709914
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003984 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13746
[LightGBM] [In

## (6) Data preparetion

In [None]:
def variance_threshold(df, th):
  var_thres = VarianceThreshold(threshold=th)
  var_thres.fit(df)
  new_cols = var_thres.get_support()
  return df.iloc[:, new_cols]

In [None]:
X_fit = variance_threshold(X, 0.2)
list_name = (X_fit.columns)
X_total_fit = X_total[list_name]

print('Shape test = ', X_total_fit.shape)
print('Shape train = ', X_fit.shape)

Shape test =  (3, 85)
Shape train =  (2471, 85)


In [None]:
X_fit = X_fit.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
X_total_fit = X_total_fit.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

print('Shape train = ', X_fit.shape)
print('Shape test = ', X_total_fit.shape)

Shape train =  (2471, 85)
Shape test =  (3, 85)


In [None]:
# 訓練データの標準化
std_ = StandardScaler()
X_std = std_.fit_transform(X_fit)
X_total_std = std_.fit_transform(X_total_fit)

## (7) Split and Train

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3, random_state=42)

In [None]:
def objective(trial):

  ### define params grid to search maximum accuracy ###
  n_estimators = trial.suggest_int('n_estimators', 50, 120)
  max_depth = trial.suggest_int('max_depth', 10, 16)
  max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 15, 25)
  criterion = trial.suggest_categorical('criterion', ['friedman_mse', 'absolute_error'])

  ### modeling with suggested params ###
  model = ExtraTreesRegressor(n_estimators = n_estimators,
                                 max_depth = max_depth,
                                 max_leaf_nodes = max_leaf_nodes,
                                 criterion = criterion,
                                 random_state = 0)

  ### fit ###
  model.fit(X_train, y_train)
  accuracy = mean_squared_error(y_test, model.predict(X_test))
  return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2)
study.best_value
study.best_trial.params

[I 2023-11-10 10:05:49,299] A new study created in memory with name: no-name-a498cf96-4bda-424d-9172-9e2556e0c38f
[I 2023-11-10 10:05:49,972] Trial 0 finished with value: 0.418816540813897 and parameters: {'n_estimators': 119, 'max_depth': 16, 'max_leaf_nodes': 23, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.418816540813897.
[I 2023-11-10 10:06:25,133] Trial 1 finished with value: 0.441582235234727 and parameters: {'n_estimators': 114, 'max_depth': 14, 'max_leaf_nodes': 25, 'criterion': 'absolute_error'}. Best is trial 1 with value: 0.441582235234727.


{'n_estimators': 114,
 'max_depth': 14,
 'max_leaf_nodes': 25,
 'criterion': 'absolute_error'}

In [None]:
def objective(trial):
  params = {
      "iterations": 1000,
      "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
      "depth": trial.suggest_int("depth", 1, 10),
      "subsample": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
      "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
      "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
  }

  model = CatBoostRegressor(**params, silent=True)
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  rmse = mean_squared_error(y_test, predictions, squared=False)
  return rmse

study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(objective, n_trials=3)

print('Best hyperparameters: ', study_cat.best_params)
print('Best RMSE: ', study_cat.best_value)

[I 2023-11-10 10:06:25,156] A new study created in memory with name: no-name-37e0d928-1ce3-47a7-b869-2882a920e8ad
[I 2023-11-10 10:06:27,159] Trial 0 finished with value: 0.6457820341553108 and parameters: {'learning_rate': 0.0060081409681709465, 'depth': 2, 'colsample_bylevel': 0.5642986475327946, 'min_data_in_leaf': 23}. Best is trial 0 with value: 0.6457820341553108.
[I 2023-11-10 10:06:31,226] Trial 1 finished with value: 0.7020509425030601 and parameters: {'learning_rate': 0.0015014572304396085, 'depth': 3, 'colsample_bylevel': 0.8954805257540955, 'min_data_in_leaf': 63}. Best is trial 0 with value: 0.6457820341553108.
[I 2023-11-10 10:07:10,696] Trial 2 finished with value: 0.634739864605731 and parameters: {'learning_rate': 0.005034806671186455, 'depth': 7, 'colsample_bylevel': 0.9763831265991908, 'min_data_in_leaf': 44}. Best is trial 2 with value: 0.634739864605731.


Best hyperparameters:  {'learning_rate': 0.005034806671186455, 'depth': 7, 'colsample_bylevel': 0.9763831265991908, 'min_data_in_leaf': 44}
Best RMSE:  0.634739864605731


In [None]:
# ランダムフォレストの最適パラメータを探索
def objective(trial):
  criterion = trial.suggest_categorical('criterion', ['friedman_mse', 'squared_error'])
  max_depth = trial.suggest_int('max_depth', 1, 1000)
  max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
  max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 1, 1000)
  n_estimators = trial.suggest_int('n_estimators', 1, 1000)
  min_samples_split = trial.suggest_int('min_samples_split', 2, 5)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

  regr = RandomForestRegressor(criterion=criterion,
                                 max_depth=max_depth, max_features=max_features,
                                 max_leaf_nodes=max_leaf_nodes, n_estimators=n_estimators,
                                 min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                 n_jobs=2)
  regr.fit(X_train, y_train)
  predictions = regr.predict(X_test)
  rmse = mean_squared_error(y_test, predictions, squared=False)

  return rmse

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective, n_trials=1)

print('Best hyperparameters: ', study_rf.best_params)
print('Best RMSE: ', study_rf.best_value)

[I 2023-11-10 10:07:10,713] A new study created in memory with name: no-name-4a9ab345-c9d6-4b31-845f-8c365262a5c8
[I 2023-11-10 10:07:12,365] Trial 0 finished with value: 0.6380175498456422 and parameters: {'criterion': 'squared_error', 'max_depth': 716, 'max_features': 'log2', 'max_leaf_nodes': 844, 'n_estimators': 299, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.6380175498456422.


Best hyperparameters:  {'criterion': 'squared_error', 'max_depth': 716, 'max_features': 'log2', 'max_leaf_nodes': 844, 'n_estimators': 299, 'min_samples_split': 4, 'min_samples_leaf': 2}
Best RMSE:  0.6380175498456422


In [None]:
# LightBGM回帰の最適パラメータを探索
def objective(trial):
  param = {
      'metric': 'rmse',
      'random_state': 48,
      'n_estimators': 20000,
      'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
      'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
      'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
      'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
      'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
      'max_depth': trial.suggest_categorical('max_depth', [10, 20, 100]),
      'num_leaves': trial.suggest_int('num_leaves', 1, 1000),
      'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
      'cat_smooth': trial.suggest_int('min_data_per_groups', 1, 100)
  }
  model = lgb.LGBMRegressor(**param)
  model.fit(X_train,y_train,eval_set=[(X_test,y_test)],callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False), lgb.log_evaluation(0)])
  preds = model.predict(X_test)

  rmse = mean_squared_error(y_test, preds, squared=False)
  return rmse

study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(objective, n_trials=5)
print('Number of finished trials: ', len(study_lgbm.trials))
print('Best trial: ', study_lgbm.best_trial.params)

[I 2023-11-10 10:07:12,392] A new study created in memory with name: no-name-f07d323c-998f-4f96-9006-ad99d8db2ea2


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12365
[LightGBM] [Info] Number of data points in the train set: 1729, number of used features: 83
[LightGBM] [Info] Start training from score 3.713997


[I 2023-11-10 10:07:13,773] Trial 0 finished with value: 0.6488196631346481 and parameters: {'reg_alpha': 6.43759060956123, 'reg_lambda': 0.04466217325035671, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 10, 'num_leaves': 794, 'min_child_samples': 141, 'min_data_per_groups': 2}. Best is trial 0 with value: 0.6488196631346481.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001862 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12365
[LightGBM] [Info] Number of data points in the train set: 1729, number of used features: 83
[LightGBM] [Info] Start training from score 3.713997


[I 2023-11-10 10:07:14,770] Trial 1 finished with value: 0.6492316475730191 and parameters: {'reg_alpha': 0.92037490096463, 'reg_lambda': 0.1229935292137859, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.014, 'max_depth': 100, 'num_leaves': 1000, 'min_child_samples': 145, 'min_data_per_groups': 45}. Best is trial 0 with value: 0.6488196631346481.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12365
[LightGBM] [Info] Number of data points in the train set: 1729, number of used features: 83
[LightGBM] [Info] Start training from score 3.713997


[I 2023-11-10 10:07:16,986] Trial 2 finished with value: 0.6434794793151798 and parameters: {'reg_alpha': 9.448896960211949, 'reg_lambda': 3.3926390104447286, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 10, 'num_leaves': 485, 'min_child_samples': 88, 'min_data_per_groups': 8}. Best is trial 2 with value: 0.6434794793151798.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12365
[LightGBM] [Info] Number of data points in the train set: 1729, number of used features: 83
[LightGBM] [Info] Start training from score 3.713997


[I 2023-11-10 10:07:18,452] Trial 3 finished with value: 0.6468848577822781 and parameters: {'reg_alpha': 0.07701660078025177, 'reg_lambda': 0.26987188289074027, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.014, 'max_depth': 100, 'num_leaves': 136, 'min_child_samples': 119, 'min_data_per_groups': 70}. Best is trial 2 with value: 0.6434794793151798.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002969 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12365
[LightGBM] [Info] Number of data points in the train set: 1729, number of used features: 83
[LightGBM] [Info] Start training from score 3.713997


[I 2023-11-10 10:07:21,894] Trial 4 finished with value: 0.6484734671118063 and parameters: {'reg_alpha': 4.280237734419415, 'reg_lambda': 0.009029127847699303, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.006, 'max_depth': 20, 'num_leaves': 971, 'min_child_samples': 154, 'min_data_per_groups': 58}. Best is trial 2 with value: 0.6434794793151798.


Number of finished trials:  5
Best trial:  {'reg_alpha': 9.448896960211949, 'reg_lambda': 3.3926390104447286, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 10, 'num_leaves': 485, 'min_child_samples': 88, 'min_data_per_groups': 8}


In [None]:
def objective(trial):

    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 500),
        'learning_rate':trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        'subsample': trial.suggest_float("subsample", 0.05, 1.0),
        'max_depth':trial.suggest_int("max_depth", 1, 10)
    }
    model = GradientBoostingRegressor(**param)

    model.fit(X_train,y_train)

    preds = model.predict(X_test)

    rmse = mean_squared_error(y_test, preds,squared=False)

    return rmse

study_gbr = optuna.create_study(direction='minimize')
study_gbr.optimize(objective, n_trials=5)
print('Number of finished trials:', len(study_gbr.trials))
print('Best trial:', study_gbr.best_trial.params)

[I 2023-11-10 10:07:21,927] A new study created in memory with name: no-name-23d79138-00f5-4e63-8c68-a80795576ea6
[I 2023-11-10 10:07:53,781] Trial 0 finished with value: 0.6459590669401909 and parameters: {'n_estimators': 392, 'learning_rate': 0.02069408302803245, 'subsample': 0.8282282218253705, 'max_depth': 10}. Best is trial 0 with value: 0.6459590669401909.
[I 2023-11-10 10:07:54,237] Trial 1 finished with value: 0.9865046534273014 and parameters: {'n_estimators': 31, 'learning_rate': 0.0011007929346515263, 'subsample': 0.28882980558630633, 'max_depth': 5}. Best is trial 0 with value: 0.6459590669401909.
[I 2023-11-10 10:08:02,678] Trial 2 finished with value: 0.7779998644267567 and parameters: {'n_estimators': 103, 'learning_rate': 0.46385693588268895, 'subsample': 0.7820468472367826, 'max_depth': 10}. Best is trial 0 with value: 0.6459590669401909.
[I 2023-11-10 10:08:04,587] Trial 3 finished with value: 10.115049273437888 and parameters: {'n_estimators': 69, 'learning_rate': 0.

Number of finished trials: 5
Best trial: {'n_estimators': 392, 'learning_rate': 0.02069408302803245, 'subsample': 0.8282282218253705, 'max_depth': 10}


In [None]:
r1 = ExtraTreesRegressor(**study.best_trial.params)
r2 = CatBoostRegressor(**study_cat.best_params, verbose = 0)
r3 = RandomForestRegressor(**study_rf.best_params)
r4 = lgb.LGBMRegressor(**study_lgbm.best_trial.params)
r5 = AdaBoostRegressor()
r6 = GradientBoostingRegressor(**study_gbr.best_trial.params)


r1.fit(X_train,y_train)
r2.fit(X_train,y_train)
r3.fit(X_train,y_train)
r4.fit(X_train,y_train)
r5.fit(X_train,y_train)
r6.fit(X_train,y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12365
[LightGBM] [Info] Number of data points in the train set: 1729, number of used features: 83
[LightGBM] [Info] Start training from score 3.713997


## (8) Best weights

In [None]:
# 各モデルのペストパラメータを見つける
def find_best_weight(preds, target):
  def _validate_func(weights):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = 0
    for weight, prediction in zip(weights, preds):
      final_prediction += weight * prediction
    return mean_squared_error(target, final_prediction)

  starting_values = [1/len(preds)] * len(preds)
  cons = ({'type':'eq', 'fun': lambda w: 1-sum(w)})
  # our weights are bound between 0 and 1
  bounds = [(0, 1)] * len(preds)

  res = minimize(_validate_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

  print('Ensemble Score: {best_score}'.format(best_score=(1-res['fun'])))
  print('Best Weights: {weights}'.format(weights=res['x']))

  return res

In [None]:
oof1 = r1.predict(X_test)
oof2 = r2.predict(X_test)
oof3 = r3.predict(X_test)
oof4 = r4.predict(X_test)
oof5 = r5.predict(X_test)
oof6 = r6.predict(X_test)



In [None]:
res = find_best_weight([oof1, oof2, oof3, oof4, oof5, oof6], y_test)

Ensemble Score: 0.5994537242629132
Best Weights: [4.21377747e-17 5.73776112e-01 2.24293731e-01 0.00000000e+00
 2.27332250e-02 1.79196932e-01]


In [None]:
yy = '123456'
dd = {yy[i]:k for i,k in enumerate(res['x'])}
print(dd)

{'1': 4.213777466295617e-17, '2': 0.5737761122510375, '3': 0.224293731120004, '4': 0.0, '5': 0.02273322496769512, '6': 0.1791969316612634}


## (9) Submit

In [None]:
sample = pd.read_csv('./data/sample_submission.csv')

In [None]:
sample['score'] = r1.predict(X_total_std)*dd['1'] + \
                  r2.predict(X_total_std)*dd['2'] + \
                  r3.predict(X_total_std)*dd['3'] + \
                  r4.predict(X_total_std)*dd['4'] + \
                  r5.predict(X_total_std)*dd['5'] + \
                  r6.predict(X_total_std)*dd['6']
sample



Unnamed: 0,id,score
0,0000aaaa,3.508954
1,2222bbbb,4.080879
2,4444cccc,4.174007


In [None]:
sample.to_csv('submission.csv', index=False)