In [8]:
import sys
sys.path.append('..')

import os

import pandas as pd
import tqdm
from sklearn.metrics import make_scorer

from arguments import *
from constants import *
from data import *
import models as supported_models
from training_tools import *
from utils import *



# Initialization: cloud storage, experiment arguments, and data loaders

In [9]:
# Cloud-storage handle for this run; project, bucket and credential path come
# from the star-imported project constants.
storage = GCStorage.get_CloudFS(
    project_name=PROJECT_NAME,
    bucket_name=GC_BUCKET,
    credential_path=CREDENTIAL_PATH,
)

Unique instance for GCStorage has been created


In [10]:
# Experiment configuration, spelled as command-line flags (one flag per
# literal; adjacent literals are concatenated into a single string).
all_args = parse_from_string(
    '--exp_name=ml '
    '--device=cpu '
    '--currency_pair=EURUSD '
    '--num_candles=16 '
    '--candle_interval=60 '
    '--num_workers=1 '
    '--log_level=4 '
    '--exp_setting=simple_binary'
)

# The parsed arguments carry the experiment logger; keep a short alias.
logger = all_args.misc_args.logger

# Build both splits' loaders and index them by split name.
train_loader, valid_loader = get_dataloaders(all_args)
loaders = dict(train=train_loader, valid=valid_loader)

Starting new experiment at 2020-06-02 00:49:34
User: jingbo
Host: pg-cpu-1
{'misc_args': {'exp_name': 'jingbo_ml_3', 'exp_setting': 'simple_binary', 'log_level': 4, 'fast_debug': False, 'save_dir': 'experiments/2020-06-01/jingbo_ml_3', 'log_file': 'experiments/2020-06-01/jingbo_ml_3/run_log.txt'}, 'data_args': {'candle_interval': 60, 'num_candles': 16, 'num_iterval_ahead': 4, 'currency_pair': 'EURUSD', 'num_workers': 1}, 'train_args': {'loss_func': 'ce', 'device': 'cpu', 'disp_steps': 10, 'eval_steps': 99999, 'max_epochs': 100, 'batch_size': 256, 'learning_rate': 0.0001, 'weight_decay': 0.9, 'clipping_value': 1.0}, 'model_args': {'model_type': 'DummyModel', 'emb_size': 32, 'hidden_size': 64, 'num_layers': 3}}
Step 0/False Texts
	[setup/command_line: /opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py --exp_name=ml --device=cpu --currency_pair=EURUSD --num_candles=16 --candle_interval=60 --num_workers=1 --log_level=4 --exp_setting=simple_binary]
	[setup/arguments: {   'data

In [11]:
def gether_data(loader):
    """Collect every sample of ``loader.dataset`` into two parallel lists.

    The dataset is indexed directly (``dataset[idx]``), one sample at a time,
    rather than iterated through the loader itself.

    Args:
        loader: object exposing a ``dataset`` attribute that supports ``len()``
            and integer indexing, each item being an ``(x, gt)`` pair whose
            ``x`` has a ``flatten()`` method.

    Returns:
        ``(all_x, all_gt)``: a list with the flattened ``x`` of every sample
        and a list with the matching ``gt`` values, both in dataset order.
    """
    dataset = loader.dataset
    n_samples = len(dataset)
    features, targets = [], []

    for idx in range(n_samples):
        sample_x, sample_gt = dataset[idx]

        # Progress line for the first sample and every 500th one after it.
        if not idx % 500:
            logger.log_stdout(f'# [{idx + 1}]/{n_samples}')

        features.append(sample_x.flatten())
        targets.append(sample_gt)

    return features, targets

# Materialise the training split as two parallel lists (flattened inputs and
# their ground-truth values); these are what the grid search is fitted on.
train_x, train_gt = gether_data(train_loader)

# [1]/19800
# [501]/19800
# [1001]/19800
# [1501]/19800
# [2001]/19800
# [2501]/19800
# [3001]/19800
# [3501]/19800
# [4001]/19800
# [4501]/19800
# [5001]/19800
# [5501]/19800
# [6001]/19800
# [6501]/19800
# [7001]/19800
# [7501]/19800
# [8001]/19800
# [8501]/19800
# [9001]/19800
# [9501]/19800
# [10001]/19800
# [10501]/19800
# [11001]/19800
# [11501]/19800
# [12001]/19800
# [12501]/19800
# [13001]/19800
# [13501]/19800
# [14001]/19800
# [14501]/19800
# [15001]/19800
# [15501]/19800
# [16001]/19800
# [16501]/19800
# [17001]/19800
# [17501]/19800
# [18001]/19800
# [18501]/19800
# [19001]/19800
# [19501]/19800


# Re-train the model

In [15]:
import pprint as pp
from sklearn.model_selection import GridSearchCV

# Registries exposed by the project's `models` module: estimators keyed by a
# short name, and metric entries for binary classification.
classifier_list = supported_models.classifiers
metric_list = supported_models.binary_classification_metrics
all_models = {name: classifier_list[name] for name in classifier_list}

# Estimator to re-train (the GridSearchCV repr printed by this cell shows that
# 'lgb' resolves to an LGBMClassifier).
model_name = 'lgb'
model = all_models[model_name]

# Single-point grid. The values mirror a row pasted from an earlier results
# table:
#   balanced  8  0  0  128  127  0.01  0.7  {'class_weight': 'balanced', 'max_depth': 8, ...}  0.746114
# NOTE(review): 0.746114 is presumably that configuration's score -- confirm.
param_dict = dict(
    min_child_samples=[0],
    class_weight=['balanced'],
    max_depth=[8],
    num_leaves=[127],
    min_split_gain=[0],
    reg_alpha=[0.01],
    reg_lambda=[0.7],
    n_estimators=[128],
)

# Each metric entry is indexed as [0] -> score function and
# [1] -> value passed as `greater_is_better`.
f1_entry = metric_list['f1']
scoring_fn = make_scorer(f1_entry[0], greater_is_better=f1_entry[1])

# 5-fold cross-validated search; refit=True re-trains the best configuration
# on all of the training data once the search finishes.
model = GridSearchCV(
    estimator=model,
    param_grid=param_dict,
    scoring=scoring_fn,
    cv=5,
    refit=True,
    n_jobs=28,
    verbose=10,
)
print(model)
model.fit(train_x, train_gt)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambd...
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='warn', n_jobs=28,
             param_grid={'class_weight': ['balanced'], 'max_depth': [8],
                         'min

[Parallel(n_jobs=28)]: Using backend LokyBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done   2 out of   5 | elapsed:   17.4s remaining:   26.1s
[Parallel(n_jobs=28)]: Done   3 out of   5 | elapsed:   18.5s remaining:   12.3s
[Parallel(n_jobs=28)]: Done   5 out of   5 | elapsed:   21.0s remaining:    0.0s
[Parallel(n_jobs=28)]: Done   5 out of   5 | elapsed:   21.0s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambd...
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='warn', n_jobs=28,
             param_grid={'class_weight': ['balanced'], 'max_depth': [8],
                         'min

# Simulate trading

In [18]:
# Day ranges for each split, taken from the project constants.
train_days = TRAIN_DAYS
valid_days = VALID_DAYS

# Shortcut: fetch the dataset class by name from the notebook namespace
# instead of going through the data-loader factory.
dataset_class = globals()['SingleFXDatasetBase']

# Validation dataset built with the experiment's own logger and data arguments.
valid_dataset = dataset_class(
    valid_days,
    all_args.misc_args.logger,
    all_args.data_args,
)

In [24]:
# Peek at the first validation sample only (nothing is printed when the
# dataset is empty).
if len(valid_dataset) > 0:
    i = 0
    sample = valid_dataset.getitem(i)
    print(sample[0])
    print(sample[1])

[[  1.13389   1.13383   1.13379   1.13377   1.13377   1.1339    1.13395
    1.13401   1.134     1.13418   1.13438   1.13432   1.13415   1.1341
    1.13409   1.13409]
 [  1.13417   1.13389   1.13385   1.13381   1.13377   1.13392   1.13395
    1.13403   1.13401   1.13418   1.13439   1.13441   1.13432   1.13419
    1.13411   1.13409]
 [  1.13387   1.1338    1.13377   1.13377   1.13377   1.13377   1.1339
    1.13392   1.134     1.13397   1.13415   1.13431   1.13414   1.1341
    1.13409   1.13406]
 [  1.13417   1.13388   1.13382   1.13381   1.13377   1.13378   1.13391
    1.13395   1.13401   1.13399   1.13417   1.13438   1.13431   1.13416
    1.13411   1.13409]
 [167.       44.       87.       15.        0.       66.       38.
   72.        6.       74.      180.       68.       58.       62.
    9.       21.     ]
 [  1.13398   1.1339    1.13391   1.13384   1.13384   1.13398   1.13401
    1.13408   1.13406   1.13425   1.13444   1.13437   1.13423   1.13417
    1.13416   1.13416]
 [  1.13424