### How to simulate
1. Retrain the model every n days.
2. Retrain each model on all historical data available up to the retrain point (prices are normalized by the first day's open price for each game).
3. Roll the account balance forward from one trading window to the next.

In [1]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably so that the `test_env` package imported further
# down resolves — confirm against the repository layout.
sys.path.append('../')

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load both price-history CSV files and stack them into a single frame that
# covers the whole simulation period.
stock_history_df = pd.concat(
    list(map(pd.read_csv, ('./ETH_hist_test.csv', './ETH_hist.csv')))
)

In [4]:
# Order the rows chronologically, then replace every missing value with a
# tiny positive placeholder (1e-10).
stock_history_df = stock_history_df.sort_values(by='time').fillna(1e-10)

In [5]:
# Preview the first two rows of the sorted, NaN-filled history frame.
stock_history_df.head(2)

Unnamed: 0,time,open,high,low,close,adjcp,volume,tic,cci_30,rsi_30,rsi_14,rsi_6,dx_30,dx_14
0,2019-12-31,132.612274,133.732681,128.798157,129.610855,129.610855,8936866000.0,ETH-USD,1e-10,1e-10,1e-10,1e-10,1e-10,1e-10
1,2020-01-01,129.630661,132.835358,129.198288,130.802002,130.802002,7935230000.0,ETH-USD,66.66667,100.0,100.0,100.0,1e-10,1e-10


In [6]:
# Row index at which trading starts; the rows before it train the first model.
start_trade_index = 250
# Number of rows between two consecutive model retrains.
model_retrain_interval = 30

# Technical-indicator columns handed to the environment as `tech_array`.
tech_indicators = ['cci_30', 'rsi_30', 'rsi_14', 'rsi_6', 'dx_30', 'dx_14']

# Path template for saved models; `%i` is filled with the retrain row index.
cwd = './CryptoModel/model_%i.pkl'
reward_on_value = True
lookback_n = 7

# One episode spans exactly one retrain interval.
config_max_step = model_retrain_interval

# The reward scale depends on which reward mode is selected.
reward_scaling = 2 ** -10 if reward_on_value else 2 ** -5

In [7]:
from test_env.single_crypto_env import CryptoTradingEnv

from stable_baselines3 import PPO, DDPG
from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan, VecNormalize
from stable_baselines3.common.logger import configure

In [8]:
# Directory that receives the stable-baselines3 training logs.
tmp_path = "./tmp/sb3_log/"
# Logger writing to stdout and to a CSV file inside `tmp_path`; it is attached
# to each model that gets trained below.
new_logger = configure(folder=tmp_path, format_strings=["stdout", "csv"])

Logging to ./tmp/sb3_log/


In [9]:
def modelTraining(time_idx, px_df, total_timesteps=5000):
    """Train a DDPG agent on the first ``time_idx`` rows of ``px_df`` and save it.

    Parameters
    ----------
    time_idx : int
        Exclusive upper row bound of the training window
        (``px_df.iloc[:time_idx]``). It is also substituted into the ``cwd``
        path template to name the saved model.
    px_df : pandas.DataFrame
        Price history containing the columns 'open', 'adjcp', 'low', 'high'
        and every column listed in ``tech_indicators``.
    total_timesteps : int, default 5000
        Number of environment steps to train for. Coerced to ``int`` because
        stable-baselines3 documents this argument as an integer.

    Returns
    -------
    str
        Path the trained model was saved to.
    """
    from pathlib import Path

    # Slice the training window once and reuse it for both arrays.
    train_window = px_df.iloc[:time_idx]

    config = dict()
    config['price_array'] = train_window[['open', 'adjcp', 'low', 'high']].values
    config['tech_array'] = train_window[tech_indicators].values
    # Training flags: non-sequential episodes with randomized initial values.
    # NOTE(review): exact semantics are defined by CryptoTradingEnv — confirm there.
    config['if_sequence'] = False
    config['if_randomV'] = True
    config['if_value'] = reward_on_value
    config['lookback_n'] = lookback_n

    initial_capital = 1e-5
    initial_stocks = np.array([40.0])
    max_step = config_max_step

    crypto_env = CryptoTradingEnv(config,
                                  initial_capital=initial_capital,
                                  initial_stocks=initial_stocks,
                                  max_step=max_step,
                                  reward_scaling=reward_scaling)

    # Wrap the single env for SB3 and fail fast if NaN/inf values show up.
    env_train = DummyVecEnv([lambda: crypto_env])
    env_train = VecCheckNan(env_train, raise_exception=True)

    model = DDPG("MlpPolicy", env_train, learning_rate=0.00025,
                 batch_size=128, gamma=0.99, seed=312)
    model.set_logger(new_logger)

    model.learn(total_timesteps=int(total_timesteps), tb_log_name='ddpg', log_interval=1000)
    print('Training finished!')

    # Build the path once and make sure its directory exists before saving.
    model_path = cwd % time_idx
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save(model_path)
    print('Trained model saved in ' + str(model_path))
    return model_path

In [10]:
def modelRun(start_idx, px_df, input_amount, input_stocks, last_model):
    """Back-test a saved DDPG model on the window that starts at ``start_idx``.

    Parameters
    ----------
    start_idx : int
        Row index at which the trading window starts.
    px_df : pandas.DataFrame
        Price history containing the columns 'open', 'adjcp', 'low', 'high'
        and every column listed in ``tech_indicators``.
    input_amount : float
        Cash balance handed to the environment as ``initial_capital``.
    input_stocks : numpy.ndarray
        Holdings handed to the environment as ``initial_stocks``.
    last_model : str
        Path of the saved model to load with ``DDPG.load``.

    Returns
    -------
    tuple
        ``(amount, stocks)`` read from the environment after the episode ends,
        or the unchanged inputs when the window holds no tradable step.
    """
    # The window is capped by both the retrain interval and the remaining rows.
    max_step = min(config_max_step, px_df.shape[0] - start_idx) - 1
    if max_step < 1:
        # Nothing left to trade on (e.g. the retrain point is the final row):
        # hand the balance back untouched instead of starting an empty episode.
        return input_amount, input_stocks

    # The environment receives history up to the end of this trading window.
    window = px_df.iloc[:(start_idx + config_max_step)]

    test_config = dict()
    test_config['price_array'] = window[['open', 'adjcp', 'low', 'high']].values
    test_config['tech_array'] = window[tech_indicators].values
    # Back-test flags: sequential mode on, randomized initial values off.
    # NOTE(review): exact semantics are defined by CryptoTradingEnv — confirm there.
    test_config['if_sequence'] = True
    test_config['if_randomV'] = False
    test_config['if_value'] = reward_on_value
    test_config['lookback_n'] = lookback_n

    print ('Run model from ', start_idx, ' to ', start_idx + max_step)

    test_env = CryptoTradingEnv(test_config,
                                initial_capital=input_amount,
                                max_step=max_step,
                                initial_stocks=input_stocks,
                                reward_scaling=reward_scaling,
                                start_idx=start_idx)
    state = test_env.reset()

    # Keep the algorithm object and its policy under separate names; the
    # policy is switched to evaluation mode before being queried.
    test_model = DDPG.load(last_model)
    policy = test_model.policy.eval()

    done = False
    while not done:
        # Request deterministic actions explicitly so the back-test stays
        # reproducible even if the algorithm is swapped for a stochastic one.
        action = policy.predict(state, deterministic=True)[0]
        state, reward, done, _ = test_env.step(action)

    return test_env.amount, test_env.stocks

### Train the first model, then retrain and trade walk-forward

In [11]:
# Starting cash and holdings handed to the first trading window.
test_amount, test_stocks = 0.01, np.array([40.0])

# Walk forward: at every retrain point fit a model on the history so far,
# then trade the following window and carry the balance into the next one.
for t in range(start_trade_index, len(stock_history_df), model_retrain_interval):
    print ('Training model at time ', t)
    model_file = modelTraining(t, stock_history_df)

    print ('Applying model')
    test_amount, test_stocks = modelRun(
        start_idx=t,
        px_df=stock_history_df,
        input_amount=test_amount,
        input_stocks=test_stocks,
        last_model=model_file,
    )

Training model at time  250
Training finished!
Trained model saved in ./CryptoModel/model_250.pkl
Applying model
Run model from  250  to  279
initial stock: [40.] inital amount:  0.01
initial asset:  14106.949453125
[Day 251] BUY: 0.0
[Day 252] SELL: 15.5364
[Day 254] SELL: 24.460700000000003
[Day 255] SELL: 0.0028
[Day 256] BUY: 37.1059
[Day 257] BUY: 0.015600000000000001
[Day 258] BUY: 0.0001
[Day 259] BUY: 0.0
[Day 260] BUY: 0.0
[Day 261] BUY: 0.0
[Day 262] BUY: 0.0
[Day 263] BUY: 0.0
[Day 266] BUY: 0.0
[Day 267] BUY: 0.0
[Day 269] BUY: 0.0
[Day 270] BUY: 0.0
[Day 271] BUY: 0.0
[Day 272] BUY: 0.0
[Day 273] BUY: 0.0
[Day 274] BUY: 0.0
[Day 276] BUY: 0.0
[Day 277] BUY: 0.0
[Day 279] BUY: 0.0
Episode Return:  0.8973805758585037
Training model at time  280
Training finished!
Trained model saved in ./CryptoModel/model_280.pkl
Applying model
Run model from  280  to  309
initial stock: [37.121696] inital amount:  0.019220727116744753
initial asset:  12700.085626977117
[Day 281] BUY: 0.0
[D

KeyboardInterrupt: 