In [None]:
import os
# Step the working directory up one level before the project imports below run.
# NOTE(review): presumably this makes the repository root the cwd so that the
# `mmd` / `agent` packages and the relative './dataset/...' paths resolve - confirm.
# The path is relative, so re-running this cell without a kernel restart moves
# up one more directory each time.
os.chdir('..')

import warnings
# Suppress every warning of category UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
import pickle
import pandas_market_calendars as mcal
from tqdm import tqdm
import torch
# Select the 'high' float32 matmul precision mode (see the PyTorch documentation of
# torch.set_float32_matmul_precision for what each mode permits on a given backend).
torch.set_float32_matmul_precision('high')
from gymnasium import spaces
from mmd.env import GenLSTM, MMDSimulator, load_generator
from mmd.train import start_writer, get_params_from_events, get_params_dicts, get_robustq_params_dicts, train_robustdqn, training_info
from mmd.evaluation import simulate_agent_spx
from agent.q import QFunc
from agent.DQN import PORDQN

In [3]:
# Simulation configuration.
# NOTE: later cells pass a snapshot of the notebook namespace (vars().copy()) to
# get_params_dicts / get_robustq_params_dicts, so names defined here may be read
# by those helpers even when no visible cell references them directly.

# burn_in and state_len are passed to MMDSimulator; total_length equals their sum.
# NOTE(review): total_length is not referenced directly in the visible cells - confirm
# it is consumed through the namespace snapshot.
total_length = 560
burn_in = 500
state_len = 60
# Date range and exchange calendar name; all three are also passed to MMDSimulator.
cal_start_date = '1995-01-01'
cal_end_date = '2023-12-31'
trading_calendar = 'NYSE'
# Calendar object for that exchange and its schedule over the date range above.
calendar = mcal.get_calendar(trading_calendar)
schedule = calendar.schedule(start_date=cal_start_date, end_date=cal_end_date)

# Interest rate and transaction cost passed to MMDSimulator.
# NOTE(review): units are not visible from this notebook - confirm against MMDSimulator.
int_rate = 0.024
trans_cost = 0.0025 # standard cost = 0.0005
# Evaluation settings; not used by any cell visible in this section of the notebook.
eval_batch_size = 1000
eval_seed = 12345

In [4]:
# Load the pickled object that is later passed to MMDSimulator as `ma_model_params`.
# The path is relative to the working directory set by os.chdir('..') in the first cell.
# SECURITY: pickle.load can execute arbitrary code while unpickling - only load this
# file if it comes from a trusted source.
with open('./dataset/lstm/ma_params.pkl', 'rb') as f:
    ma_model_params = pickle.load(f)

In [5]:
# Read the parameters stored under `events_path` and publish each one as a
# notebook-level variable, then build the three parameter dictionaries from a
# snapshot of the notebook namespace.
events_path = './dataset/lstm/'
params = get_params_from_events(events_path)
# `params` is consumed as a mapping of mappings: only the inner name -> value pairs
# are published. Distinct loop names are used so the inner loop no longer shadows
# the outer loop's variables.
for param_group in params.values():
    for param_name, param_value in param_group.items():
        # Names that already exist in the notebook take precedence over stored values.
        if param_name not in globals():
            globals()[param_name] = param_value
# NOTE(review): get_params_dicts receives a copy of the whole namespace, including the
# loop variables above - presumably it picks out only the keys it knows; confirm.
data_params, model_params, train_params = get_params_dicts(vars().copy())

In [6]:
# Agent / environment configuration.
# NOTE: a snapshot of the notebook namespace is passed to get_robustq_params_dicts in a
# later cell, so names defined here may be read by that helper even when no visible
# cell references them directly.

# Passed to MMDSimulator.
batch_size = 8
device = 'cpu'
# Discrete action space with 9 actions, and the 9 evenly spaced values in [-1, 1]
# that accompany it; both are passed to MMDSimulator.
action_space = spaces.Discrete(9)
action_values = torch.linspace(-1., 1., 9, device=device)
num_actions = len(action_values)
# nu_scale and nu_df are passed to PORDQN; nu_dist is not referenced directly in the
# visible cells.
# NOTE(review): 't' presumably selects a Student-t distribution with nu_df degrees of
# freedom - confirm against PORDQN.
nu_dist = 't'
nu_scale = 0.03
nu_df = 2
# Observation size = price-history window length plus one entry per extra state variable.
other_state_vars = ['log_wealth', 'positions', 'dt']
obs_dim = state_len + len(other_state_vars)

# Hyper-parameters passed positionally to PORDQN in the training cells.
discount = 0.99
eps_greedy = 0.1 # epsilon greedy parameter
buffer_max_length = int(1e5)
clone_steps = 50
train_steps = 1
agent_batch_size = 128
n_batches = 1
n_epochs = 1
robustq_lr = 1e-4
# Passed to QFunc as its second argument.
architecture = [64, 64]
# pre_train_Q and n_episodes are not referenced directly in the visible cells.
# NOTE(review): presumably read from the params dictionaries by train_robustdqn (the
# recorded outputs show 3 episodes per run) - confirm.
pre_train_Q = False
n_episodes = 3

# Settings for the lambda optimisation, passed to PORDQN.
norm_ord = 1
lamda_init = 0. # initial lambda
lamda_max_iter = 100
lamda_step_size = 10 # step size for learning rate scheduler
lamda_gamma = None # gamma for learning rate scheduler
# List of lamda_step_size learning rates: 0.02, 0.2, ..., 0.02 * 10**(lamda_step_size - 1).
# NOTE(review): the list length is tied to the scheduler step size - confirm this is intended.
lamda_lr = [0.02 * (10**i) for i in range(lamda_step_size)]
n_outer = 1 # not used in this algorithm but used in logging by writer
n_inner = 1000 # number of samples from nu to calc inner expectations

In [7]:
# Both values are passed to PORDQN in the training cells.
delta = 1e-4 # regularisation parameter for Sinkhorn distance
epsilon = 3e-3 # Sinkhorn distance

# also change these if delta and epsilon are changed
# (string forms of delta / epsilon, interpolated into the run names of some training cells)
delta_str = "1e-4"
eps_str = "3e-3"

# Build the parameter dictionaries from a snapshot of the notebook namespace.
# This rebinds `model_params`, replacing the value returned earlier by get_params_dicts.
simulator_params, model_params = get_robustq_params_dicts(vars().copy())

# One training cell below per seed. Defined after the snapshot above, so `seeds`
# is not part of the namespace copy passed to get_robustq_params_dicts.
seeds = [0,1,2,3,4]

In [8]:
# Training run for the first seed; the writer is registered under the run name "0".
seed = seeds[0]
name = "0"
writer = start_writer(simulator_params, model_params, model_name=name)

# Seed torch's global RNG before the networks are created.
torch.manual_seed(seed)

# Generator restored from the files under `events_path`.
generator = load_generator(GenLSTM(4, 1, 60), events_path)
# Q-network: obs_dim inputs (state_len + len(other_state_vars)), one output per action.
robustq = QFunc(obs_dim, architecture, num_actions).to(device)

env = MMDSimulator(
    generator, ma_model_params,
    trading_calendar, cal_start_date, cal_end_date,
    state_len, burn_in, int_rate, trans_cost,
    batch_size, action_space, action_values, device,
)
robustdqn_agent = PORDQN(
    obs_dim, num_actions, discount,
    nu_scale, nu_df, action_values,
    epsilon, delta, n_inner,
    lamda_init, lamda_lr, lamda_max_iter, lamda_step_size, lamda_gamma, norm_ord,
    robustq, eps_greedy, buffer_max_length, clone_steps, train_steps,
    agent_batch_size, n_batches, n_epochs, robustq_lr,
    device=device, seed=seed, writer=writer,
)

# Train from scratch (no checkpoint argument) and keep the returned agent.
robustdqn_agent = train_robustdqn(robustdqn_agent, env, writer, simulator_params, model_params)

Episode 1: 100%|██████████| 6739/6739 [22:39<00:00,  4.96it/s]


Episode 1 mean of summed rewards: -1.937


Episode 2: 100%|██████████| 6739/6739 [24:22<00:00,  4.61it/s]


Episode 2 mean of summed rewards: -0.554


Episode 3: 100%|██████████| 6739/6739 [22:17<00:00,  5.04it/s]


Episode 3 mean of summed rewards: -0.261


In [9]:
# Training run for the second seed; the writer is registered under the run name "1".
seed = seeds[1]
name = "1"
writer = start_writer(simulator_params, model_params, model_name=name)

# Seed torch's global RNG before the networks are created.
torch.manual_seed(seed)

# Generator restored from the files under `events_path`.
generator = load_generator(GenLSTM(4, 1, 60), events_path)
# Q-network: obs_dim inputs (state_len + len(other_state_vars)), one output per action.
robustq = QFunc(obs_dim, architecture, num_actions).to(device)

env = MMDSimulator(
    generator, ma_model_params,
    trading_calendar, cal_start_date, cal_end_date,
    state_len, burn_in, int_rate, trans_cost,
    batch_size, action_space, action_values, device,
)
robustdqn_agent = PORDQN(
    obs_dim, num_actions, discount,
    nu_scale, nu_df, action_values,
    epsilon, delta, n_inner,
    lamda_init, lamda_lr, lamda_max_iter, lamda_step_size, lamda_gamma, norm_ord,
    robustq, eps_greedy, buffer_max_length, clone_steps, train_steps,
    agent_batch_size, n_batches, n_epochs, robustq_lr,
    device=device, seed=seed, writer=writer,
)

# Train from scratch (no checkpoint argument) and keep the returned agent.
robustdqn_agent = train_robustdqn(robustdqn_agent, env, writer, simulator_params, model_params)

Episode 1: 100%|██████████| 6739/6739 [22:36<00:00,  4.97it/s]


Episode 1 mean of summed rewards: -1.928


Episode 2: 100%|██████████| 6739/6739 [22:42<00:00,  4.95it/s]


Episode 2 mean of summed rewards: -0.700


Episode 3: 100%|██████████| 6739/6739 [22:16<00:00,  5.04it/s]


Episode 3 mean of summed rewards: -0.187


In [None]:
# Training run for the third seed.
seed = seeds[2]
# The transaction-cost tag is derived from `trans_cost` instead of being typed by hand,
# so the run label cannot drift from the cost the simulator actually uses
# (0.0005 -> "txn_0.05", 0.0025 -> "txn_0.25").
# NOTE(review): assumes the txn_ tag is the cost expressed in percent, as the existing
# labels suggest - confirm.
name = f"newlr_delta_{delta_str}_eps_{eps_str}_seed_{seed}_txn_{trans_cost * 100:g}"
writer = start_writer(simulator_params, model_params, model_name=name)

# Seed torch's global RNG before the networks are created.
torch.manual_seed(seed)

# Generator restored from the files under `events_path`.
generator = GenLSTM(4, 1, 60)
generator = load_generator(generator, events_path)
# Q-network: obs_dim inputs (state_len + len(other_state_vars)), one output per action.
robustq = QFunc(obs_dim, architecture, num_actions).to(device)

env = MMDSimulator(
    generator, ma_model_params,
    trading_calendar, cal_start_date, cal_end_date,
    state_len, burn_in, int_rate, trans_cost,
    batch_size, action_space, action_values, device,
)
robustdqn_agent = PORDQN(
    obs_dim, num_actions, discount,
    nu_scale, nu_df, action_values,
    epsilon, delta, n_inner,
    lamda_init, lamda_lr, lamda_max_iter, lamda_step_size, lamda_gamma, norm_ord,
    robustq, eps_greedy, buffer_max_length, clone_steps, train_steps,
    agent_batch_size, n_batches, n_epochs, robustq_lr,
    device=device, seed=seed, writer=writer,
)

# Train from scratch (no checkpoint argument) and keep the returned agent.
robustdqn_agent = train_robustdqn(robustdqn_agent, env, writer, simulator_params, model_params)

In [8]:
# Training run for the fourth seed, started from an existing checkpoint file.
seed = seeds[3]
# The transaction-cost tag is derived from `trans_cost` instead of being typed by hand,
# so the run label cannot drift from the cost the simulator actually uses
# (0.0025 -> "txn_0.25", the same label as before at the current setting).
# NOTE(review): assumes the txn_ tag is the cost expressed in percent - confirm.
name = f"newlr_delta_{delta_str}_eps_{eps_str}_seed_{seed}_txn_{trans_cost * 100:g}"
writer = start_writer(simulator_params, model_params, model_name=name)

# Seed torch's global RNG before the networks are created.
torch.manual_seed(seed)

# Generator restored from the files under `events_path`.
generator = GenLSTM(4, 1, 60)
generator = load_generator(generator, events_path)
# Q-network: obs_dim inputs (state_len + len(other_state_vars)), one output per action.
robustq = QFunc(obs_dim, architecture, num_actions).to(device)

env = MMDSimulator(
    generator, ma_model_params,
    trading_calendar, cal_start_date, cal_end_date,
    state_len, burn_in, int_rate, trans_cost,
    batch_size, action_space, action_values, device,
)
robustdqn_agent = PORDQN(
    obs_dim, num_actions, discount,
    nu_scale, nu_df, action_values,
    epsilon, delta, n_inner,
    lamda_init, lamda_lr, lamda_max_iter, lamda_step_size, lamda_gamma, norm_ord,
    robustq, eps_greedy, buffer_max_length, clone_steps, train_steps,
    agent_batch_size, n_batches, n_epochs, robustq_lr,
    device=device, seed=seed, writer=writer,
)

# Checkpoint file handed to train_robustdqn as its sixth positional argument.
# NOTE(review): this path names a "txn_0.05" run while the run started here is labelled
# with the current trans_cost, and delta / eps / seed are hard-coded in the path rather
# than taken from delta_str / eps_str / seed. Confirm that resuming from this particular
# checkpoint is intended.
resume_checkpoint = "runs/newlr_delta_1e-4_eps_3e-3_seed_3_txn_0.05/checkpoint.pt"
robustdqn_agent = train_robustdqn(
    robustdqn_agent, env, writer, simulator_params, model_params, resume_checkpoint
)

Episode 3: 100%|██████████| 6739/6739 [24:25<00:00,  4.60it/s]


Episode 3 mean of summed rewards: -1.064


In [14]:
# Training run for the fifth seed; the writer is registered under the run name "4".
seed = seeds[4]
name = "4"
writer = start_writer(simulator_params, model_params, model_name=name)

# Seed torch's global RNG before the networks are created.
torch.manual_seed(seed)

# Generator restored from the files under `events_path`.
generator = load_generator(GenLSTM(4, 1, 60), events_path)
# Q-network: obs_dim inputs (state_len + len(other_state_vars)), one output per action.
robustq = QFunc(obs_dim, architecture, num_actions).to(device)

env = MMDSimulator(
    generator, ma_model_params,
    trading_calendar, cal_start_date, cal_end_date,
    state_len, burn_in, int_rate, trans_cost,
    batch_size, action_space, action_values, device,
)
robustdqn_agent = PORDQN(
    obs_dim, num_actions, discount,
    nu_scale, nu_df, action_values,
    epsilon, delta, n_inner,
    lamda_init, lamda_lr, lamda_max_iter, lamda_step_size, lamda_gamma, norm_ord,
    robustq, eps_greedy, buffer_max_length, clone_steps, train_steps,
    agent_batch_size, n_batches, n_epochs, robustq_lr,
    device=device, seed=seed, writer=writer,
)

# Train from scratch (no checkpoint argument) and keep the returned agent.
robustdqn_agent = train_robustdqn(robustdqn_agent, env, writer, simulator_params, model_params)

Episode 1: 100%|██████████| 6739/6739 [22:21<00:00,  5.02it/s]


Episode 1 mean of summed rewards: -1.458


Episode 2: 100%|██████████| 6739/6739 [24:57<00:00,  4.50it/s]


Episode 2 mean of summed rewards: -0.724


Episode 3: 100%|██████████| 6739/6739 [22:24<00:00,  5.01it/s]


Episode 3 mean of summed rewards: -0.224
