### LightGBM

In [31]:
import lightgbm as lgb
import pandas as pd

In [32]:

import pickle
# Load the dataset used by every model below. The pickle must unpack into
# exactly two objects: the feature matrix and the target labels.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open this file if it comes from a trusted source.
with open('./market_data.pkl', 'rb') as f:
    market_data_x,market_data_y = pickle.load(f)

In [33]:
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

In [34]:
# Search-budget settings. Nothing in the visible cells reads these three names.
# Original note listed: num_leaves, min_child_samples, n_estimators,
# learning_rate, subsample_for_bin, min_split_gain, min_child_weight,
# subsample, reg_alpha, reg_lambda
DIMENSIONS = 25
ITERATIONS = 10
POPULATION = 15

# Bounds for the LightGBM hyperparameter search, written as (min, max) pairs.
# Tree size / ensemble size.
num_leaves_min, num_leaves_max = 10, 500
min_child_samples_min, min_child_samples_max = 10, 50
n_estimators_min, n_estimators_max = 20, 1000

# Learning and binning.
learning_rate_min, learning_rate_max = 0.001, 0.5
subsample_for_bin_min, subsample_for_bin_max = 50000, 500000

# Split constraints.
min_split_gain_min, min_split_gain_max = 0.01, 0.5
min_child_weight_min, min_child_weight_max = 0.001, 0.15

# NOTE(review): the LightGBM parameter docs constrain subsample (an alias of
# bagging_fraction) to (0, 1], so a [1.0, 2.0] range looks wrong. These two
# names are not read by objective() in the visible cells -- confirm before use.
subsample_min, subsample_max = 1.0, 2.0

# L1 / L2 regularisation strength.
reg_alpha_min, reg_alpha_max = 0.01, 100.0
reg_lambda_min, reg_lambda_max = 0.01, 100.0

In [35]:
# Bounds for the additional LightGBM parameters searched by objective(),
# written as (min, max) pairs. Integer-valued parameters are marked "int".
max_depth_min, max_depth_max = 1, 500  # int

# Row / column sub-sampling.
bagging_fraction_min, bagging_fraction_max = 0.0, 1.0
pos_bagging_fraction_min, pos_bagging_fraction_max = 0.0, 1.0
neg_bagging_fraction_min, neg_bagging_fraction_max = 0.0, 1.0
bagging_freq_min, bagging_freq_max = 0, 100  # int
feature_fraction_min, feature_fraction_max = 0.0, 1.0
feature_fraction_bynode_min, feature_fraction_bynode_max = 0.0, 1.0

# Boolean switches encoded as 0/1. objective() currently has the matching
# suggest calls commented out, so these four names are unused there.
extra_trees_min, extra_trees_max = 0, 1  # boolean
first_metric_only_min, first_metric_only_max = 0, 1  # boolean

max_delta_step_min, max_delta_step_max = -10.0, 10.0
linear_lambda_min, linear_lambda_max = 0.0, 10.0

# Categorical-feature handling.
min_data_per_group_min, min_data_per_group_max = 1, 200  # int
max_cat_threshold_min, max_cat_threshold_max = 1, 100  # int
cat_l2_min, cat_l2_max = 0.0, 20.0
cat_smooth_min, cat_smooth_max = 0.0, 20.0
max_cat_to_onehot_min, max_cat_to_onehot_max = 1, 20  # int

top_k_min, top_k_max = 1, 40  # int

In [36]:
import optuna
import sklearn
#optuna.logging.set_verbosity(optuna.logging.WARNING)
def objective(trial, metric = 'F1'):
    """Optuna objective: cross-validated score of a LightGBM classifier.

    Samples one value per hyperparameter from the module-level
    ``*_min`` / ``*_max`` bounds, builds an ``lgb.LGBMClassifier`` from them
    and scores it on ``market_data_x`` / ``market_data_y`` with a shuffled
    4-fold ``StratifiedKFold``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        metric: Which score to return. ``'Accuracy'`` uses the estimator's
            default scorer, ``'F1'`` uses ``'f1_weighted'`` and ``'AUC'``
            uses ``'roc_auc_ovr_weighted'``.

    Returns:
        Mean of the per-fold scores.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
            Previously the function fell through every branch and returned
            ``None``, which left the study without a usable value.
    """
    # None keeps cross_val_score on the estimator's own default scorer, which
    # is what the original 'Accuracy' branch did by omitting `scoring`.
    scoring_by_metric = {
        'Accuracy': None,
        'F1': 'f1_weighted',
        'AUC': 'roc_auc_ovr_weighted',
    }
    if metric not in scoring_by_metric:
        raise ValueError(
            'Unsupported metric {!r}; expected one of {}'.format(
                metric, sorted(scoring_by_metric)))

    # Dict literals evaluate in order, so the suggest_* calls run in the same
    # sequence as before. extra_trees and first_metric_only have bounds
    # defined at module level but are not searched here.
    # NOTE(review): per the LightGBM parameter docs some of these (e.g.
    # linear_lambda, top_k, the cat_* group, pos/neg_bagging_fraction) only
    # take effect under specific settings -- confirm they are worth searching.
    sampled = {
        'num_leaves': trial.suggest_int('num_leaves', num_leaves_min, num_leaves_max),
        'min_child_samples': trial.suggest_int('min_child_samples', min_child_samples_min, min_child_samples_max),
        'n_estimators': trial.suggest_int('n_estimators', n_estimators_min, n_estimators_max),
        'learning_rate': trial.suggest_float('learning_rate', learning_rate_min, learning_rate_max),
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', subsample_for_bin_min, subsample_for_bin_max),
        'min_split_gain': trial.suggest_float('min_split_gain', min_split_gain_min, min_split_gain_max),
        'min_child_weight': trial.suggest_float('min_child_weight', min_child_weight_min, min_child_weight_max),
        'reg_alpha': trial.suggest_float('reg_alpha', reg_alpha_min, reg_alpha_max),
        'reg_lambda': trial.suggest_float('reg_lambda', reg_lambda_min, reg_lambda_max),
        'max_depth': trial.suggest_int('max_depth', max_depth_min, max_depth_max),
        'bagging_fraction': trial.suggest_float('bagging_fraction', bagging_fraction_min, bagging_fraction_max),
        'pos_bagging_fraction': trial.suggest_float('pos_bagging_fraction', pos_bagging_fraction_min, pos_bagging_fraction_max),
        'neg_bagging_fraction': trial.suggest_float('neg_bagging_fraction', neg_bagging_fraction_min, neg_bagging_fraction_max),
        'bagging_freq': trial.suggest_int('bagging_freq', bagging_freq_min, bagging_freq_max),
        'feature_fraction': trial.suggest_float('feature_fraction', feature_fraction_min, feature_fraction_max),
        'feature_fraction_bynode': trial.suggest_float('feature_fraction_bynode', feature_fraction_bynode_min, feature_fraction_bynode_max),
        'max_delta_step': trial.suggest_float('max_delta_step', max_delta_step_min, max_delta_step_max),
        'linear_lambda': trial.suggest_float('linear_lambda', linear_lambda_min, linear_lambda_max),
        'min_data_per_group': trial.suggest_int('min_data_per_group', min_data_per_group_min, min_data_per_group_max),
        'max_cat_threshold': trial.suggest_int('max_cat_threshold', max_cat_threshold_min, max_cat_threshold_max),
        'cat_l2': trial.suggest_float('cat_l2', cat_l2_min, cat_l2_max),
        'cat_smooth': trial.suggest_float('cat_smooth', cat_smooth_min, cat_smooth_max),
        'max_cat_to_onehot': trial.suggest_int('max_cat_to_onehot', max_cat_to_onehot_min, max_cat_to_onehot_max),
        'top_k': trial.suggest_int('top_k', top_k_min, top_k_max),
    }

    final_model = lgb.LGBMClassifier(n_jobs = -1, **sampled)

    # No random_state is set, so every trial is scored on a different shuffle
    # of the folds and scores are not exactly reproducible between runs.
    kfold = StratifiedKFold(n_splits = 4, shuffle = True)

    fold_scores = sklearn.model_selection.cross_val_score(
        final_model,
        market_data_x,
        market_data_y,
        n_jobs = -1,
        scoring = scoring_by_metric[metric],
        cv = kfold,
    )
    return fold_scores.mean()

In [37]:
# Log at INFO level so Optuna prints one line per finished trial (see the
# captured output under the study runs below).
optuna.logging.set_verbosity(optuna.logging.INFO)

In [38]:
def run_optuna(n_trials=1024, objective_fn=None):
    """Run an in-memory Optuna study that maximizes an objective.

    Args:
        n_trials: Number of trials to run.
        objective_fn: Callable that takes an Optuna trial and returns the
            score to maximize. When omitted, the LightGBM ``objective`` is
            used, so existing ``run_optuna(n)`` calls behave as before.
            Passing a different objective avoids redefining this function
            for every model.

    Returns:
        Tuple ``(best_params, best_value, study)`` where the first two come
        from ``study.best_trial``.
    """
    if objective_fn is None:
        # Resolved at call time so the default tracks the current definition.
        objective_fn = objective

    study = optuna.create_study(direction='maximize')
    # n_jobs=-1 lets Optuna run trials concurrently.
    study.optimize(objective_fn, n_trials=n_trials, n_jobs=-1)

    best = study.best_trial
    print('Metric: {}'.format(best.value))
    print("Best hyperparameters: {}".format(best.params))
    return best.params, best.value, study

In [39]:
# Tune LightGBM for 350 trials. `params` feeds the re-evaluation cell further
# down and `study` feeds the two plots; later model sections rebind all three.
params, final_score, study = run_optuna(350)

[32m[I 2023-02-15 08:22:35,371][0m A new study created in memory with name: no-name-bf6d06ca-f327-4c96-a306-73e2f8b4b7ec[0m
[32m[I 2023-02-15 08:22:35,496][0m Trial 1 finished with value: 0.39310546304411337 and parameters: {'num_leaves': 288, 'min_child_samples': 49, 'n_estimators': 495, 'learning_rate': 0.06778966430164095, 'subsample_for_bin': 309490, 'min_split_gain': 0.3323753063851209, 'min_child_weight': 0.037296100510311396, 'reg_alpha': 15.669120148219895, 'reg_lambda': 11.688498413560241, 'max_depth': 113, 'bagging_fraction': 0.15400013408227764, 'pos_bagging_fraction': 0.72420571089578, 'neg_bagging_fraction': 0.3190827964530304, 'bagging_freq': 72, 'feature_fraction': 0.4202481525017775, 'feature_fraction_bynode': 0.11508362245061088, 'max_delta_step': 4.399013115600113, 'linear_lambda': 8.913145444549276, 'min_data_per_group': 17, 'max_cat_threshold': 66, 'cat_l2': 3.1740275332795798, 'cat_smooth': 18.92378164931728, 'max_cat_to_onehot': 17, 'top_k': 39}. Best is tria

Metric: 0.6550589262635675
Best hyperparameters: {'num_leaves': 161, 'min_child_samples': 11, 'n_estimators': 636, 'learning_rate': 0.10062275564036395, 'subsample_for_bin': 385223, 'min_split_gain': 0.21408527420940948, 'min_child_weight': 0.09618485416592447, 'reg_alpha': 2.907281344473718, 'reg_lambda': 8.063939094792097, 'max_depth': 483, 'bagging_fraction': 0.710449196813832, 'pos_bagging_fraction': 0.8065057799537635, 'neg_bagging_fraction': 0.8611970483686582, 'bagging_freq': 80, 'feature_fraction': 0.8100769087394183, 'feature_fraction_bynode': 0.7449109024236781, 'max_delta_step': -4.050687163559136, 'linear_lambda': 5.0106456743154615, 'min_data_per_group': 24, 'max_cat_threshold': 87, 'cat_l2': 12.563973689983793, 'cat_smooth': 4.5814492074654165, 'max_cat_to_onehot': 7, 'top_k': 4}


In [40]:
# Plot the optimization history of the LightGBM study.
optuna.visualization.plot_optimization_history(study)

In [41]:
# Slice plot of the LightGBM study.
optuna.visualization.plot_slice(study)

In [42]:
# Re-score the best LightGBM parameters on a fresh shuffled 4-fold split.
# The splitter has no random_state, so the folds differ from the ones used
# during tuning and the score need not match the study's best value.
# `kfold` stays a module-level name because objective_knn() reads it.
final_model = lgb.LGBMClassifier(**params)
kfold = StratifiedKFold(shuffle = True, n_splits = 4)
sklearn.model_selection.cross_val_score(
    final_model,
    market_data_x,
    market_data_y,
    cv=kfold,
    scoring='f1_weighted',
    n_jobs=-1,
).mean()

0.6189491211020381

### KNN

In [43]:
from sklearn.neighbors import KNeighborsClassifier
def objective_knn(trial):
    """Optuna objective: cross-validated weighted F1 of a KNN classifier.

    Samples ``n_neighbors`` (1-50), ``weights`` and ``metric`` from the
    trial, then scores a ``KNeighborsClassifier`` on ``market_data_x`` /
    ``market_data_y`` with a shuffled 4-fold ``StratifiedKFold``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ``'f1_weighted'`` score over the folds.
    """
    n_neighbors = trial.suggest_int("n_neighbors", 1, 50)
    weights = trial.suggest_categorical("weights", ['uniform', 'distance'])
    # NOTE(review): `p` is left at its default here; with scikit-learn's
    # default p=2, 'minkowski' is the same distance as 'euclidean', so the two
    # choices look redundant -- confirm before pruning the search space.
    metric = trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski'])
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric)

    # Build the splitter here rather than reading the module-level `kfold`
    # left behind by other cells: that made the result depend on which cells
    # had been run (NameError on a fresh kernel). Same configuration as the
    # splitter the other objectives create.
    kfold = StratifiedKFold(n_splits = 4, shuffle = True)
    return sklearn.model_selection.cross_val_score(knn, market_data_x,market_data_y, n_jobs=-1, scoring='f1_weighted', cv=kfold).mean()

In [44]:
def run_optuna(n_trials=1024, objective_fn=None):
    """Run an in-memory Optuna study that maximizes an objective.

    Args:
        n_trials: Number of trials to run.
        objective_fn: Callable that takes an Optuna trial and returns the
            score to maximize. When omitted, ``objective_knn`` is used, so
            existing ``run_optuna(n)`` calls in this section behave as
            before. Passing a different objective avoids redefining this
            function for every model.

    Returns:
        Tuple ``(best_params, best_value, study)`` where the first two come
        from ``study.best_trial``.
    """
    if objective_fn is None:
        # Resolved at call time so the default tracks the current definition.
        objective_fn = objective_knn

    study = optuna.create_study(direction='maximize')
    # n_jobs=-1 lets Optuna run trials concurrently.
    study.optimize(objective_fn, n_trials=n_trials, n_jobs=-1)

    best = study.best_trial
    print('Metric: {}'.format(best.value))
    print("Best hyperparameters: {}".format(best.params))
    return best.params, best.value, study

In [45]:
# Tune KNN for 350 trials. This rebinds `params`, `final_score` and `study`,
# replacing the LightGBM results held in those names.
params, final_score, study = run_optuna(350)

[32m[I 2023-02-15 08:23:21,251][0m A new study created in memory with name: no-name-680647b6-6e53-4846-9e5f-b5f3c1178924[0m
[32m[I 2023-02-15 08:23:21,378][0m Trial 4 finished with value: 0.5436514178193725 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 4 with value: 0.5436514178193725.[0m
[32m[I 2023-02-15 08:23:21,403][0m Trial 0 finished with value: 0.6533006697485783 and parameters: {'n_neighbors': 32, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.6533006697485783.[0m
[32m[I 2023-02-15 08:23:21,410][0m Trial 2 finished with value: 0.6343263059560482 and parameters: {'n_neighbors': 47, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.6533006697485783.[0m
[32m[I 2023-02-15 08:23:21,415][0m Trial 6 finished with value: 0.633830416615981 and parameters: {'n_neighbors': 30, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.6533006697485783.[0

Metric: 0.6925628784766271
Best hyperparameters: {'n_neighbors': 39, 'weights': 'uniform', 'metric': 'manhattan'}


In [46]:
# Plot the optimization history of the KNN study.
optuna.visualization.plot_optimization_history(study)

In [47]:
# Slice plot of the KNN study.
optuna.visualization.plot_slice(study)

### NaiveBayes

In [48]:
from sklearn.naive_bayes import GaussianNB

# Untuned baseline: GaussianNB with default settings, scored with weighted F1
# on a shuffled 4-fold stratified split (no random_state, so folds vary).
naiveBayes = GaussianNB()
kfold = StratifiedKFold(shuffle = True, n_splits = 4)
sklearn.model_selection.cross_val_score(
    naiveBayes,
    market_data_x,
    market_data_y,
    cv=kfold,
    scoring='f1_weighted',
    n_jobs=-1,
).mean()

0.6452524723877698

### SVM

In [49]:
from sklearn.svm import SVC
def objective_svm(trial):
    """Optuna objective: cross-validated weighted F1 of an SVC.

    Samples ``C`` (0.01-1000.0), ``degree`` (1-10) and ``kernel`` from the
    trial, then scores the resulting ``SVC`` on ``market_data_x`` /
    ``market_data_y`` with a shuffled 4-fold ``StratifiedKFold``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ``'f1_weighted'`` score over the folds.
    """
    penalty = trial.suggest_float("C", 0.01, 1000.0)
    # NOTE(review): scikit-learn documents `degree` as used by the 'poly'
    # kernel only; it is still sampled for every kernel here.
    poly_degree = trial.suggest_int("degree", 1, 10)
    kernel_name = trial.suggest_categorical(
        "kernel", ['rbf', 'poly', 'sigmoid', 'linear'])

    splitter = StratifiedKFold(n_splits = 4, shuffle = True)
    classifier = SVC(C=penalty, degree=poly_degree, kernel=kernel_name)

    fold_scores = sklearn.model_selection.cross_val_score(
        classifier,
        market_data_x,
        market_data_y,
        n_jobs=-1,
        scoring='f1_weighted',
        cv=splitter,
    )
    return fold_scores.mean()

In [50]:
def run_optuna(n_trials=1024, objective_fn=None):
    """Run an in-memory Optuna study that maximizes an objective.

    Args:
        n_trials: Number of trials to run.
        objective_fn: Callable that takes an Optuna trial and returns the
            score to maximize. When omitted, ``objective_svm`` is used, so
            existing ``run_optuna(n)`` calls in this section behave as
            before. Passing a different objective avoids redefining this
            function for every model.

    Returns:
        Tuple ``(best_params, best_value, study)`` where the first two come
        from ``study.best_trial``.
    """
    if objective_fn is None:
        # Resolved at call time so the default tracks the current definition.
        objective_fn = objective_svm

    study = optuna.create_study(direction='maximize')
    # n_jobs=-1 lets Optuna run trials concurrently.
    study.optimize(objective_fn, n_trials=n_trials, n_jobs=-1)

    best = study.best_trial
    print('Metric: {}'.format(best.value))
    print("Best hyperparameters: {}".format(best.params))
    return best.params, best.value, study

In [52]:
# Tune the SVM for 350 trials. This rebinds `params`, `final_score` and
# `study`, replacing the KNN results held in those names.
params, final_score, study = run_optuna(350)

[32m[I 2023-02-15 08:24:12,518][0m A new study created in memory with name: no-name-d0230196-8595-4d2c-8a09-ac4c4635b922[0m
[32m[I 2023-02-15 08:24:12,576][0m Trial 1 finished with value: 0.45391441121202974 and parameters: {'C': 775.6506629284683, 'degree': 6, 'kernel': 'poly'}. Best is trial 1 with value: 0.45391441121202974.[0m
[32m[I 2023-02-15 08:24:12,584][0m Trial 0 finished with value: 0.5286156044486059 and parameters: {'C': 162.43903055170733, 'degree': 8, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.5286156044486059.[0m
[32m[I 2023-02-15 08:24:12,620][0m Trial 8 finished with value: 0.5138011165528447 and parameters: {'C': 154.12557461647123, 'degree': 5, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.5286156044486059.[0m
[32m[I 2023-02-15 08:24:12,622][0m Trial 9 finished with value: 0.39310546304411337 and parameters: {'C': 662.5591204047851, 'degree': 6, 'kernel': 'poly'}. Best is trial 0 with value: 0.5286156044486059.[0m
[32m[I 2023-02-15 08:

Metric: 0.6760092863444682
Best hyperparameters: {'C': 442.2401801990492, 'degree': 8, 'kernel': 'linear'}
