### LightGBM

In [1]:
import lightgbm as lgb
import pandas as pd

In [2]:

import pickle
# Load the pickled pair used by every model below (presumably the feature
# matrix and the class labels - confirm against the code that wrote the file).
# pickle.load can execute arbitrary code, so only open files you trust.
with open('./market_data.pkl', 'rb') as pkl_file:
    market_data_x, market_data_y = pickle.load(pkl_file)

In [3]:
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

In [4]:
# NOTE(review): DIMENSIONS, ITERATIONS and POPULATION are not read by any cell
# visible here (the Optuna objective suggests 24 parameters, not 25) -
# presumably left over from an earlier optimizer; confirm before relying on them.
DIMENSIONS = 25
ITERATIONS = 10
POPULATION = 15

# Inclusive [min, max] search bounds for the core LGBMClassifier parameters.

# Integer-valued parameters.
num_leaves_min = 10
num_leaves_max = 500
min_child_samples_min = 10
min_child_samples_max = 50
n_estimators_min = 20
n_estimators_max = 1000
subsample_for_bin_min = 50000
subsample_for_bin_max = 500000

# Real-valued parameters.
learning_rate_min = 0.001
learning_rate_max = 0.5
min_split_gain_min = 0.01
min_split_gain_max = 0.5
min_child_weight_min = 0.001
min_child_weight_max = 0.15
# subsample is a row fraction, so it has to lie in (0, 1]; the previous
# 1.0-2.0 range was outside what LightGBM accepts. The objective does not
# search it directly (it searches bagging_fraction, LightGBM's name for it).
subsample_min = 0.01
subsample_max = 1.0
reg_alpha_min = 0.01
reg_alpha_max = 100.0
reg_lambda_min = 0.01
reg_lambda_max = 100.0

In [5]:
# Inclusive [min, max] search bounds for the additional LightGBM parameters
# sampled by the Optuna objective. Each minimum sits next to its maximum.

max_depth_min = 1
max_depth_max = 500  # int

# LightGBM requires every sampling fraction to satisfy 0.0 < x <= 1.0, so the
# lower bounds are a small positive value rather than 0.0 (0.01 matches the
# smallest non-zero floor used for the other real-valued bounds in this file).
bagging_fraction_min = 0.01
bagging_fraction_max = 1.0
pos_bagging_fraction_min = 0.01
pos_bagging_fraction_max = 1.0
neg_bagging_fraction_min = 0.01
neg_bagging_fraction_max = 1.0
bagging_freq_min = 0
bagging_freq_max = 100  # int
feature_fraction_min = 0.01
feature_fraction_max = 1.0
feature_fraction_bynode_min = 0.01
feature_fraction_bynode_max = 1.0

# Boolean switches encoded as 0/1. The objective currently has the matching
# suggest calls commented out, so these two pairs are not searched.
extra_trees_min = 0  # boolean
extra_trees_max = 1  # boolean
first_metric_only_min = 0  # boolean
first_metric_only_max = 1  # boolean

max_delta_step_min = -10.0
max_delta_step_max = 10.0
linear_lambda_min = 0.0
linear_lambda_max = 10.0

min_data_per_group_min = 1
min_data_per_group_max = 200  # int
max_cat_threshold_min = 1
max_cat_threshold_max = 100  # int
cat_l2_min = 0.0
cat_l2_max = 20.0
cat_smooth_min = 0.0
cat_smooth_max = 20.0
max_cat_to_onehot_min = 1
max_cat_to_onehot_max = 20  # int
top_k_min = 1
top_k_max = 40  # int

In [6]:
import optuna
import sklearn
#optuna.logging.set_verbosity(optuna.logging.WARNING)
def objective(trial, metric='F1'):
    """Score one Optuna trial of LightGBM hyper-parameters with 4-fold CV.

    Samples 24 LGBMClassifier parameters from the module-level ``*_min`` /
    ``*_max`` bounds, builds the classifier and returns its mean
    cross-validated score on ``market_data_x`` / ``market_data_y``.

    Args:
        trial: The ``optuna`` trial used to sample the parameters.
        metric: Which score to return: ``'Accuracy'`` (the estimator's
            default scorer), ``'F1'`` (``f1_weighted``) or ``'AUC'``
            (``roc_auc_ovr_weighted``).

    Returns:
        The mean score over the four stratified folds.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
            Previously an unknown name fell through and returned ``None``.

    Note:
        The folds are shuffled without a fixed ``random_state``, so every
        trial is scored on a different split.
    """
    # scoring=None makes cross_val_score use the estimator's own score method,
    # which is what the original 'Accuracy' branch relied on.
    scoring_by_metric = {
        'Accuracy': None,
        'F1': 'f1_weighted',
        'AUC': 'roc_auc_ovr_weighted',
    }
    # Validate before sampling so a typo fails immediately instead of
    # producing a study full of failed trials.
    if metric not in scoring_by_metric:
        raise ValueError(
            "Unknown metric {!r}; expected one of {}".format(
                metric, sorted(scoring_by_metric)
            )
        )

    # The dict literal is evaluated top to bottom, so parameters are suggested
    # in the same order (and under the same names) as before.
    # extra_trees and first_metric_only have bounds defined but are
    # deliberately not searched.
    params = {
        'num_leaves': trial.suggest_int('num_leaves', num_leaves_min, num_leaves_max),
        'min_child_samples': trial.suggest_int('min_child_samples', min_child_samples_min, min_child_samples_max),
        'n_estimators': trial.suggest_int('n_estimators', n_estimators_min, n_estimators_max),
        'learning_rate': trial.suggest_float('learning_rate', learning_rate_min, learning_rate_max),
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', subsample_for_bin_min, subsample_for_bin_max),
        'min_split_gain': trial.suggest_float('min_split_gain', min_split_gain_min, min_split_gain_max),
        'min_child_weight': trial.suggest_float('min_child_weight', min_child_weight_min, min_child_weight_max),
        'reg_alpha': trial.suggest_float('reg_alpha', reg_alpha_min, reg_alpha_max),
        'reg_lambda': trial.suggest_float('reg_lambda', reg_lambda_min, reg_lambda_max),
        'max_depth': trial.suggest_int('max_depth', max_depth_min, max_depth_max),
        'bagging_fraction': trial.suggest_float('bagging_fraction', bagging_fraction_min, bagging_fraction_max),
        'pos_bagging_fraction': trial.suggest_float('pos_bagging_fraction', pos_bagging_fraction_min, pos_bagging_fraction_max),
        'neg_bagging_fraction': trial.suggest_float('neg_bagging_fraction', neg_bagging_fraction_min, neg_bagging_fraction_max),
        'bagging_freq': trial.suggest_int('bagging_freq', bagging_freq_min, bagging_freq_max),
        'feature_fraction': trial.suggest_float('feature_fraction', feature_fraction_min, feature_fraction_max),
        'feature_fraction_bynode': trial.suggest_float('feature_fraction_bynode', feature_fraction_bynode_min, feature_fraction_bynode_max),
        'max_delta_step': trial.suggest_float('max_delta_step', max_delta_step_min, max_delta_step_max),
        'linear_lambda': trial.suggest_float('linear_lambda', linear_lambda_min, linear_lambda_max),
        'min_data_per_group': trial.suggest_int('min_data_per_group', min_data_per_group_min, min_data_per_group_max),
        'max_cat_threshold': trial.suggest_int('max_cat_threshold', max_cat_threshold_min, max_cat_threshold_max),
        'cat_l2': trial.suggest_float('cat_l2', cat_l2_min, cat_l2_max),
        'cat_smooth': trial.suggest_float('cat_smooth', cat_smooth_min, cat_smooth_max),
        'max_cat_to_onehot': trial.suggest_int('max_cat_to_onehot', max_cat_to_onehot_min, max_cat_to_onehot_max),
        'top_k': trial.suggest_int('top_k', top_k_min, top_k_max),
    }

    # NOTE(review): the model and cross_val_score both use n_jobs=-1; together
    # with a parallel study this may oversubscribe the CPUs - worth measuring.
    final_model = lgb.LGBMClassifier(n_jobs=-1, **params)

    kfold = StratifiedKFold(n_splits=4, shuffle=True)

    fold_scores = sklearn.model_selection.cross_val_score(
        final_model,
        market_data_x,
        market_data_y,
        n_jobs=-1,
        scoring=scoring_by_metric[metric],
        cv=kfold,
    )
    return fold_scores.mean()

In [7]:
# Log at INFO level so Optuna prints a line for every finished trial.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [8]:
def run_optuna(n_trials=1024, objective_fn=None, n_jobs=-1):
    """Run a maximizing Optuna study and report its best trial.

    Args:
        n_trials: Number of trials to run.
        objective_fn: Callable that takes an Optuna trial and returns the
            score to maximize. Defaults to the LightGBM ``objective``, so
            existing ``run_optuna(n)`` calls behave as before; passing another
            objective removes the need to redefine this function per model.
        n_jobs: Parallelism handed to ``study.optimize``. Defaults to -1, the
            value that was previously hard-coded.

    Returns:
        A ``(best_params, best_value, study)`` tuple.
    """
    # Resolved at call time so the default does not depend on definition order.
    if objective_fn is None:
        objective_fn = objective

    study = optuna.create_study(direction='maximize')
    study.optimize(objective_fn, n_trials=n_trials, n_jobs=n_jobs)

    best = study.best_trial
    print('Metric: {}'.format(best.value))
    print("Best hyperparameters: {}".format(best.params))
    return best.params, best.value, study

In [9]:
# Run 350 trials and keep the best parameters, the best score and the study
# object (the study is what the plotting cells read).
params, final_score, study = run_optuna(350)

[32m[I 2023-02-13 22:41:40,211][0m A new study created in memory with name: no-name-68152bca-06f0-4d67-8e30-b3ada53e2b42[0m
[32m[I 2023-02-13 22:41:42,942][0m Trial 1 finished with value: 0.31172006326179724 and parameters: {'num_leaves': 93, 'min_child_samples': 37, 'n_estimators': 46, 'learning_rate': 0.15093561876361128, 'subsample_for_bin': 108988, 'min_split_gain': 0.0366407645747806, 'min_child_weight': 0.045877616568608864, 'reg_alpha': 79.70379911359292, 'reg_lambda': 69.06281417885802, 'max_depth': 482, 'bagging_fraction': 0.7270360047186235, 'pos_bagging_fraction': 0.290796590076811, 'neg_bagging_fraction': 0.7635243876317528, 'bagging_freq': 49, 'feature_fraction': 0.021773778220072204, 'feature_fraction_bynode': 0.7646096849767063, 'max_delta_step': 0.03820574521611597, 'linear_lambda': 6.555798076750986, 'min_data_per_group': 103, 'max_cat_threshold': 74, 'cat_l2': 11.363285940104584, 'cat_smooth': 8.02241450071449, 'max_cat_to_onehot': 1, 'top_k': 14}. Best is trial 

Metric: 0.637568830357082
Best hyperparameters: {'num_leaves': 462, 'min_child_samples': 13, 'n_estimators': 782, 'learning_rate': 0.33555840726183195, 'subsample_for_bin': 127442, 'min_split_gain': 0.37948520042276485, 'min_child_weight': 0.10725828568645597, 'reg_alpha': 4.092874623926836, 'reg_lambda': 21.73073731421249, 'max_depth': 95, 'bagging_fraction': 0.709489125393079, 'pos_bagging_fraction': 0.1684026916710285, 'neg_bagging_fraction': 0.2609373040557499, 'bagging_freq': 99, 'feature_fraction': 0.7832856690456551, 'feature_fraction_bynode': 0.8113363295716135, 'max_delta_step': -8.168215958765362, 'linear_lambda': 5.574562851071012, 'min_data_per_group': 33, 'max_cat_threshold': 87, 'cat_l2': 13.750801517155896, 'cat_smooth': 11.279586023815252, 'max_cat_to_onehot': 7, 'top_k': 27}


In [10]:
# Plot the objective value of each trial in the most recent study.
optuna.visualization.plot_optimization_history(study)

In [11]:
# Slice plot: objective value against each sampled hyper-parameter.
optuna.visualization.plot_slice(study)

In [43]:
# Re-score an LGBMClassifier built from `params` (presumably the best
# parameters of the LightGBM study - depends on which cell ran last) on a
# fresh shuffled, stratified 4-fold split, reporting the mean weighted F1.
kfold = StratifiedKFold(n_splits=4, shuffle=True)
final_model = lgb.LGBMClassifier(**params)
cross_val_score(
    estimator=final_model,
    X=market_data_x,
    y=market_data_y,
    scoring='f1_weighted',
    cv=kfold,
    n_jobs=-1,
).mean()

0.6229217098051381

### KNN

In [55]:
from sklearn.neighbors import KNeighborsClassifier
def objective_knn(trial):
    """Score one Optuna trial of k-nearest-neighbours settings with 4-fold CV.

    Samples ``n_neighbors`` (1-50), ``weights`` and ``metric`` and returns the
    mean ``f1_weighted`` score of the resulting KNeighborsClassifier on
    ``market_data_x`` / ``market_data_y``.

    Args:
        trial: The ``optuna`` trial used to sample the parameters.

    Returns:
        The mean weighted F1 over the four stratified folds.
    """
    n_neighbors = trial.suggest_int("n_neighbors", 1, 50)
    weights = trial.suggest_categorical("weights", ['uniform', 'distance'])
    # NOTE(review): with the classifier's default p=2, 'minkowski' should be
    # the same distance as 'euclidean', which would make one choice redundant
    # - confirm before pruning the search space.
    metric = trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski'])
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric)

    # Build the splitter locally, as objective_svm does. This function used to
    # read a notebook-global `kfold`, so it only worked if an unrelated cell
    # had already been run.
    kfold = StratifiedKFold(n_splits=4, shuffle=True)

    return sklearn.model_selection.cross_val_score(
        knn,
        market_data_x,
        market_data_y,
        n_jobs=-1,
        scoring='f1_weighted',
        cv=kfold,
    ).mean()

In [56]:
def run_optuna(n_trials=1024, objective_fn=None, n_jobs=-1):
    """Run a maximizing Optuna study and report its best trial.

    Args:
        n_trials: Number of trials to run.
        objective_fn: Callable that takes an Optuna trial and returns the
            score to maximize. Defaults to ``objective_knn``, so existing
            ``run_optuna(n)`` calls behave as before; passing another
            objective removes the need to redefine this function per model.
        n_jobs: Parallelism handed to ``study.optimize``. Defaults to -1, the
            value that was previously hard-coded.

    Returns:
        A ``(best_params, best_value, study)`` tuple.
    """
    # Resolved at call time so the default does not depend on definition order.
    if objective_fn is None:
        objective_fn = objective_knn

    study = optuna.create_study(direction='maximize')
    study.optimize(objective_fn, n_trials=n_trials, n_jobs=n_jobs)

    best = study.best_trial
    print('Metric: {}'.format(best.value))
    print("Best hyperparameters: {}".format(best.params))
    return best.params, best.value, study

In [57]:
# Run 350 trials and keep the best parameters, the best score and the study.
# This rebinds the `params`, `final_score` and `study` names set by earlier cells.
params, final_score, study = run_optuna(350)

[32m[I 2023-02-13 22:48:48,084][0m A new study created in memory with name: no-name-d304b49e-1e96-417c-be79-681445235fee[0m
[32m[I 2023-02-13 22:48:48,157][0m Trial 3 finished with value: 0.5024077009662038 and parameters: {'n_neighbors': 5, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 3 with value: 0.5024077009662038.[0m
[32m[I 2023-02-13 22:48:48,166][0m Trial 1 finished with value: 0.5124470813859501 and parameters: {'n_neighbors': 14, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 1 with value: 0.5124470813859501.[0m
[32m[I 2023-02-13 22:48:48,179][0m Trial 0 finished with value: 0.630061349336727 and parameters: {'n_neighbors': 45, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.630061349336727.[0m
[32m[I 2023-02-13 22:48:48,184][0m Trial 8 finished with value: 0.4939991421504887 and parameters: {'n_neighbors': 17, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 0.630061349336727.[0

Metric: 0.6520569973253888
Best hyperparameters: {'n_neighbors': 44, 'weights': 'uniform', 'metric': 'euclidean'}


In [58]:
# Plot the objective value of each trial in the most recent study.
optuna.visualization.plot_optimization_history(study)

In [59]:
# Slice plot: objective value against each sampled hyper-parameter.
optuna.visualization.plot_slice(study)

### NaiveBayes

In [60]:
from sklearn.naive_bayes import GaussianNB

# Untuned baseline: mean weighted F1 of a default GaussianNB over a shuffled,
# stratified 4-fold split. This rebinds the notebook-level `kfold`.
kfold = StratifiedKFold(n_splits=4, shuffle=True)
naiveBayes = GaussianNB()
cross_val_score(
    estimator=naiveBayes,
    X=market_data_x,
    y=market_data_y,
    scoring='f1_weighted',
    cv=kfold,
    n_jobs=-1,
).mean()

0.650686493062949

### SVM

In [69]:
from sklearn.svm import SVC
def objective_svm(trial):
    """Score one Optuna trial of SVC settings with 4-fold CV.

    Samples ``C`` (0.01-1000), ``degree`` (1-10) and ``kernel`` from the
    trial and returns the mean ``f1_weighted`` score of the resulting SVC on
    ``market_data_x`` / ``market_data_y`` over a shuffled, stratified
    4-fold split.
    """
    # Evaluated top to bottom, so the suggest order stays C, degree, kernel.
    search_point = {
        "C": trial.suggest_float("C", 0.01, 1000.0),
        "degree": trial.suggest_int("degree", 1, 10),
        "kernel": trial.suggest_categorical("kernel", ['rbf', 'poly', 'sigmoid', 'linear']),
    }
    splitter = StratifiedKFold(n_splits=4, shuffle=True)
    fold_scores = cross_val_score(
        SVC(**search_point),
        market_data_x,
        market_data_y,
        scoring='f1_weighted',
        cv=splitter,
        n_jobs=-1,
    )
    return fold_scores.mean()

In [70]:
def run_optuna(n_trials=1024, objective_fn=None, n_jobs=-1):
    """Run a maximizing Optuna study and report its best trial.

    Args:
        n_trials: Number of trials to run.
        objective_fn: Callable that takes an Optuna trial and returns the
            score to maximize. Defaults to ``objective_svm``, so existing
            ``run_optuna(n)`` calls behave as before; passing another
            objective removes the need to redefine this function per model.
        n_jobs: Parallelism handed to ``study.optimize``. Defaults to -1, the
            value that was previously hard-coded.

    Returns:
        A ``(best_params, best_value, study)`` tuple.
    """
    # Resolved at call time so the default does not depend on definition order.
    if objective_fn is None:
        objective_fn = objective_svm

    study = optuna.create_study(direction='maximize')
    study.optimize(objective_fn, n_trials=n_trials, n_jobs=n_jobs)

    best = study.best_trial
    print('Metric: {}'.format(best.value))
    print("Best hyperparameters: {}".format(best.params))
    return best.params, best.value, study

In [71]:
# Run 350 trials and keep the best parameters, the best score and the study.
# This rebinds the `params`, `final_score` and `study` names set by earlier cells.
params, final_score, study = run_optuna(350)

[32m[I 2023-02-13 22:59:29,586][0m A new study created in memory with name: no-name-b4cd0c6a-9209-461d-a614-93f7817e88ab[0m
[32m[I 2023-02-13 22:59:29,900][0m Trial 0 finished with value: 0.47376946392130087 and parameters: {'C': 91.19282542845787, 'degree': 10, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.47376946392130087.[0m
[32m[I 2023-02-13 22:59:29,937][0m Trial 2 finished with value: 0.48870768202581594 and parameters: {'C': 709.2950853085794, 'degree': 7, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.48870768202581594.[0m
[32m[I 2023-02-13 22:59:30,249][0m Trial 3 finished with value: 0.376738927265698 and parameters: {'C': 688.3273648014004, 'degree': 5, 'kernel': 'poly'}. Best is trial 2 with value: 0.48870768202581594.[0m
[32m[I 2023-02-13 22:59:30,557][0m Trial 4 finished with value: 0.6137605855953634 and parameters: {'C': 126.50048822229114, 'degree': 9, 'kernel': 'linear'}. Best is trial 4 with value: 0.6137605855953634.[0m
[32m[I 2023-02-13 

Metric: 0.6584954040480869
Best hyperparameters: {'C': 845.1694788930546, 'degree': 2, 'kernel': 'linear'}
