# MARS pjt tutorial 

In [1]:
# Auto reloading 
%load_ext autoreload 
%autoreload 2

## 1. Data crawling

In [2]:
from lib import Crawler
if False: 
    ticker_list = [
        'QQQ', 'UVXY', 'XOP', 'IEF', 'SHY'
    ] 
    from_date = '2001-01-01' 
    to_date = '2021-12-12'

    crawler = Crawler(from_date = from_date) 
    crawler.run_etf(ticker_list) 


### 2. Data preprocessing

In [3]:
from lib import Prep

# Params
dataset_name = 'tutorial'
target_period = 5 
scaling_price = 'Close'
scaling_amount = 'Lvolume'
long_period = 100
mid_period = 45
short_period = 10
test_size = 0.1
valid_size = 0.2
random_seed = 42

proc = Prep( 
    dataset_name = dataset_name, 
    target_period = target_period, 
    scaling_price = scaling_price,
    scaling_amount=scaling_amount ,  
    long_period=long_period ,  
    mid_period= mid_period,  
    short_period= short_period,  
    test_size=test_size ,  
    valid_size=valid_size ,  
    random_seed = random_seed 
) 
proc.run()


[32mdatasets saved at ./data/tutorial[0m
[32m# of train sample : 1959[0m
[32m# of valid sample : 490[0m
[32m# of test sample : 273[0m


In [4]:
from lib import StockDataFrame
dg = StockDataFrame()
dg.list_datagroup()

['tutorial']

In [5]:
dg.load_datagroup('tutorial')
dg

[32mdataset is loaded from ./data/tutorial[0m
Data group "tutorial" Spec.
___________________
[32mycol[0m
[34m['label|IEF', 'label|QQQ', 'label|GLD', 'label|SHY', 'label|UVXY', 'label|XOP'][0m
[32mxcol[0m
[34m['num|Open_IEF', 'num|High_IEF', 'num|Low_IEF', 'num|Adj Close_IEF', 'num|Volume_IEF', 'num|Lhigh_IEF', 'num|Llow_IEF', 'num|Lmean_IEF', 'num|Mhigh_IEF', 'num|Mlow_IEF', 'num|Mmean_IEF', 'num|Shigh_IEF', 'num|Slow_IEF', 'num|Smean_IEF', 'num|Mvolume_IEF', 'num|Svolume_IEF', 'num|Open_QQQ', 'num|High_QQQ', 'num|Low_QQQ', 'num|Adj Close_QQQ', 'num|Volume_QQQ', 'num|Lhigh_QQQ', 'num|Llow_QQQ', 'num|Lmean_QQQ', 'num|Mhigh_QQQ', 'num|Mlow_QQQ', 'num|Mmean_QQQ', 'num|Shigh_QQQ', 'num|Slow_QQQ', 'num|Smean_QQQ', 'num|Mvolume_QQQ', 'num|Svolume_QQQ', 'num|Open_GLD', 'num|High_GLD', 'num|Low_GLD', 'num|Adj Close_GLD', 'num|Volume_GLD', 'num|Lhigh_GLD', 'num|Llow_GLD', 'num|Lmean_GLD', 'num|Mhigh_GLD', 'num|Mlow_GLD', 'num|Mmean_GLD', 'num|Shigh_GLD', 'num|Slow_GLD', 'num|Smean_GLD

_________________________

In [6]:
dg.to_lgbm() 
dg.lgbm_datagroup

{'train': {'label|IEF': <lightgbm.basic.Dataset at 0x15a9a6190>,
  'label|QQQ': <lightgbm.basic.Dataset at 0x15a9addf0>,
  'label|GLD': <lightgbm.basic.Dataset at 0x15a9adb80>,
  'label|SHY': <lightgbm.basic.Dataset at 0x15a9ad8b0>,
  'label|UVXY': <lightgbm.basic.Dataset at 0x15a9ad610>,
  'label|XOP': <lightgbm.basic.Dataset at 0x15a9ad370>},
 'valid': {'label|IEF': <lightgbm.basic.Dataset at 0x15a9ad070>,
  'label|QQQ': <lightgbm.basic.Dataset at 0x1476b9c40>,
  'label|GLD': <lightgbm.basic.Dataset at 0x15a9b4b80>,
  'label|SHY': <lightgbm.basic.Dataset at 0x15a9b48e0>,
  'label|UVXY': <lightgbm.basic.Dataset at 0x15a9b4640>,
  'label|XOP': <lightgbm.basic.Dataset at 0x15a9b43a0>},
 'test': {'label|IEF': <lightgbm.basic.Dataset at 0x15a9b40a0>,
  'label|QQQ': <lightgbm.basic.Dataset at 0x15a9b9dc0>,
  'label|GLD': <lightgbm.basic.Dataset at 0x15a9b9b20>,
  'label|SHY': <lightgbm.basic.Dataset at 0x15a9b9880>,
  'label|UVXY': <lightgbm.basic.Dataset at 0x15a9b95e0>,
  'label|XOP': <l

## 3. Model training 

In [7]:
from lib import LgbmTrainInstance
instance = LgbmTrainInstance(instance_name= 'test_instance') 
instance

Instance test_instance Spec.
_____________________
[32mis trained?: False[0m
[32mis tuned?: False[0m
[32mdatagroup name: [0m
[34mparams: None[0m


______________________

In [8]:
# train instance 
params = {
    'boosting_type' : 'gbdt',
    "n_estimators" : 10000,
    'max_depth': 8, 
    'num_leaves': 6, 
    'random_state': 17,
    'metric': 'mae', 
}
instance.train(dg , **params)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23988
[LightGBM] [Info] Number of data points in the train set: 1959, number of used features: 96
[LightGBM] [Info] Start training from score 1.000081
Training until validation scores don't improve for 300 rounds
[100]	valid_0's l1: 0.00327833
[200]	valid_0's l1: 0.00314041
[300]	valid_0's l1: 0.00304484
[400]	valid_0's l1: 0.0029813
[500]	valid_0's l1: 0.00293112
[600]	valid_0's l1: 0.00289992
[700]	valid_0's l1: 0.00287722
[800]	valid_0's l1: 0.00286542
[900]	valid_0's l1: 0.00286226
[1000]	valid_0's l1: 0.00285129
[1100]	valid_0's l1: 0.00283936
[1200]	valid_0's l1: 0.00282876
[1300]	valid_0's l1: 0.00282003
[1400]	valid_0's l1: 0.00281439
[1500]	valid_0's l1: 0.00281295
[1600]	valid_0's l1: 0.00281106
[1700]	valid_0's l1: 0.0028111
[1800]	valid_0's l1: 0.00281172
[1900]	valid_0's l1: 0.00281051
[2000]	valid_0's l1: 0.00281039
[2100]	valid_0's l1: 0.00281087
[2200]	valid_0's l1: 0.00281002
[2300]	

______________________

In [9]:
# Save instance 
instance.save_instance() 

Instance test_instance Spec.
_____________________
[32mis trained?: True[0m
[32mis tuned?: False[0m
[32mdatagroup name: tutorial[0m
[34mparams: {'boosting_type': 'gbdt', 'n_estimators': 10000, 'max_depth': 8, 'num_leaves': 6, 'random_state': 17, 'metric': 'mae'}[0m


In [14]:
# Load train instance 
instance.load_instance(instance_name='test_instance')

Instance test_instance Spec.
_____________________
[32mis trained?: True[0m
[32mis tuned?: False[0m
[32mdatagroup name: tutorial[0m
[34mparams: {'boosting_type': 'gbdt', 'n_estimators': 10000, 'max_depth': 8, 'num_leaves': 6, 'random_state': 17, 'metric': 'mae'}[0m
______________________


In [10]:
# Tune 

instance.tune(_param_space) 

def _param_space(trial): 
    param_space= {
        'boosting_type' : 'gbdt',
        "n_estimators" : 10000,
        'max_depth':trial.suggest_int('max_depth', 4, 16),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 8, 32),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 16, 64),
        'verbose' : -1, 
    }
    return param_space

NameError: name 'trial' is not defined

## 4. Model predict

In [None]:
# predict 



## 5. Demonstration