In [1]:
import os
import random
from typing import Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ray
from ray import tune, air
from ray.tune import JupyterNotebookReporter
from ray.tune.logger import TBXLoggerCallback
from ray.rllib.algorithms import AlgorithmConfig
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.env import BaseEnv
from ray.rllib.evaluation import Episode, RolloutWorker
from ray.rllib.evaluation.episode_v2 import EpisodeV2
from ray.rllib.policy import Policy
from scipy.special import softmax
import seaborn as sns
import torch

from stocktradingv2.agent.mysac import MySAC, MySACConfig
from stocktradingv2.env.MultiStockTradingEnv import MultiStockTradingEnv

In [2]:
# Start the local Ray runtime. ignore_reinit_error=True keeps this cell
# re-runnable: without it, executing the cell a second time in the same kernel
# raises because Ray is already initialised.
ray.init(ignore_reinit_error=True)

2023-03-13 23:07:55,374	INFO worker.py:1553 -- Started a local Ray instance.


0,1
Python version:,3.8.10
Ray version:,2.3.0


In [3]:
DATA_DIR = "./datasets/SSE50/"


def load_normalized_frames(data_dir):
    """Read every CSV under ``data_dir`` and z-score its feature columns.

    Each file is read with ``pd.read_csv``. Every column except ``date`` and
    ``change`` is replaced by ``(value - mean) / std`` using that file's own
    mean and standard deviation, and a ``tic`` column holding the part of the
    file name before the first ``.`` is added.

    Args:
        data_dir: directory to walk (sub-directories are included).

    Returns:
        A ``(frames, stats)`` tuple where ``frames`` is a list of
        ``(file_name, DataFrame)`` pairs and ``stats`` maps
        ``tic -> {column: (mean, std)}`` so the scaling can be undone later.
    """
    frames = []
    stats = {}
    for root, dirs, files in os.walk(data_dir):
        # Prune hidden folders (e.g. notebook checkpoint dirs) and fix the
        # traversal order; os.walk honours in-place edits of `dirs`.
        dirs[:] = sorted(d for d in dirs if not d.startswith("."))
        for file in sorted(files):
            # Anything that is not a CSV (README, .DS_Store, ...) is not data.
            if not file.lower().endswith(".csv"):
                continue
            # Join with `root`, not the base directory, so files found in
            # sub-directories resolve to their real location.
            path = os.path.join(root, file)
            df = pd.read_csv(path)
            tic = file.split(".")[0]
            stats[tic] = {}
            for col in df.columns:
                if col != "date" and col != "change":
                    mean = df[col].mean()
                    std = df[col].std()
                    stats[tic][col] = (mean, std)
                    df[col] = (df[col] - mean) / std
            df["tic"] = tic
            frames.append((file, df))
    return frames, stats


# NOTE(review): the statistics are computed over each file's full history, so
# they include dates that the later split cell assigns to the test/trade
# periods -- confirm this look-ahead is acceptable for the experiment.
dfs, norm_dict = load_normalized_frames(DATA_DIR)

In [4]:
# Each entry of `dfs` is a (file_name, frame) pair; preview the first frame.
dfs[0][1].head(n=5)

Unnamed: 0,date,open,high,low,close,volume,cr-ma3,cci,cr-ma1,cr-ma2,change,ppoh,kdjj_9,ppo,ppos,stochrsi,tic
0,2008-01-04,0.134484,0.120257,0.189359,0.111248,0.145689,-0.041534,0.891217,-0.03618,-0.038733,0.084145,0.037506,0.865201,0.102169,0.096538,-1.35427,600028
1,2008-01-07,0.107899,0.08754,0.129928,0.053939,0.097942,-0.041534,0.379385,-0.03618,-0.038733,-0.84033,-7.5e-05,0.589577,0.090313,0.096549,-1.35427,600028
2,2008-01-08,0.144799,0.226761,0.209668,0.19458,1.045147,-0.041534,1.230587,0.352515,-0.038733,2.627095,0.096666,0.747043,0.133801,0.110381,-0.014931,600028
3,2008-01-09,0.167244,0.158962,0.209217,0.20815,-0.543675,-0.041534,0.901763,1.064898,-0.038733,0.660593,0.159138,0.988005,0.172228,0.130368,0.154131,600028
4,2008-01-10,0.201566,0.087709,0.079126,0.021312,1.081671,-0.041534,0.045699,1.128115,0.369917,-2.789157,0.025173,0.324555,0.132636,0.133262,-1.35427,600028


In [5]:
# SEED = 114
# random.seed(SEED)
# np.random.seed(SEED)
# torch.manual_seed(SEED)

In [6]:
# Randomly keep 10 of the loaded (file_name, frame) pairs.
# NOTE(review): the seeding cell above is commented out, so this selection
# differs between kernel runs.
np.random.shuffle(dfs)
dfs = dfs[:10]
# The first element of each pair is the CSV file name, not the bare ticker.
tics = " ".join(file_name for file_name, _frame in dfs)
print(tics)

600519.SS.csv 600809.SS.csv 601601.SS.csv 600048.SS.csv 600028.SS.csv 601398.SS.csv 600036.SS.csv 600585.SS.csv 600030.SS.csv 600276.SS.csv


In [7]:
# Boundary dates: rows before `test_start` are training data, rows from
# `test_start` up to (not including) `trade_start` are test data, and rows
# from `trade_start` onward are trade data.
test_start = pd.to_datetime('2017-01-01', format='%Y-%m-%d')
trade_start = pd.to_datetime('2020-01-01', format='%Y-%m-%d')

# split
dfs_train, dfs_test, dfs_trade = [], [], []
for tic, df in dfs:
    # Parse the date strings once so the comparisons below are on timestamps.
    df["date"] = pd.to_datetime(df["date"], format='%Y-%m-%d')
    dates = df["date"]
    before_test = dates < test_start
    within_test = (dates >= test_start) & (dates < trade_start)
    from_trade = dates >= trade_start
    # Copy each slice so later edits never write back into the source frame.
    dfs_train.append(df.loc[before_test].sort_index(ascending=True).copy())
    dfs_test.append(df.loc[within_test].sort_index(ascending=True).copy())
    dfs_trade.append(df.loc[from_trade].sort_index(ascending=True).copy())

# ensemble
def ensemble(dfs_t):
    """Merge per-ticker frames into one frame indexed by ``(date, tic)``.

    The frames are concatenated and pivoted to one row per date with one
    column per (feature, tic) pair. Dates with a missing value for any pair
    are dropped, and the result is stacked back to long form.

    Args:
        dfs_t: sequence of DataFrames, each carrying ``date`` and ``tic``
            columns.

    Returns:
        A DataFrame sorted by, and indexed on, ``['date', 'tic']``.
    """
    combined = pd.concat(dfs_t)
    wide = combined.pivot_table(index=['date'], columns=['tic'])
    # A NaN anywhere in a wide row means some ticker lacks data on that date.
    complete_dates = wide.dropna()
    long_form = complete_dates.stack().reset_index()
    return long_form.sort_values(['date', 'tic']).set_index(['date', 'tic'])
# Collapse each list of per-ticker frames into a single (date, tic)-indexed
# frame; the three names are rebound from lists to DataFrames here.
dfs_train, dfs_test, dfs_trade = map(ensemble, (dfs_train, dfs_test, dfs_trade))
dfs_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cci,change,close,cr-ma1,cr-ma2,cr-ma3,high,kdjj_9,low,open,ppo,ppoh,ppos,stochrsi,volume
date,tic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2008-01-04,600028,0.891217,0.084145,0.111248,-0.03618,-0.038733,-0.041534,0.120257,0.865201,0.189359,0.134484,0.102169,0.037506,0.096538,-1.35427,0.145689
2008-01-04,600030,0.531851,4.105417,0.028456,0.675933,0.70744,0.785949,-0.012022,0.70486,-0.034453,-0.136636,0.051929,0.058243,0.036222,1.396025,-0.333455
2008-01-04,600036,-0.289434,1.239447,-0.201373,0.98351,1.031276,1.140823,-0.261869,-0.352393,-0.230817,-0.316123,-0.053209,-0.00387,-0.055171,1.38765,-0.270492
2008-01-04,600048,0.895265,6.017374,-0.081115,-1.134979,-1.194946,-1.319297,-0.086628,1.229594,-0.19058,-0.229146,0.003833,0.108724,-0.031894,1.400567,0.476755
2008-01-04,600276,-1.048409,-4.133497,-0.645581,-0.646653,-0.67266,-0.727341,-0.512934,-0.75378,-0.599402,-0.46008,-0.248435,-0.113372,-0.227642,-1.405323,1.285805


In [9]:
class MyCallbacks(DefaultCallbacks):
    """RLlib callbacks that publish the trading env's end-of-episode data."""

    def __init__(self, legacy_callbacks_dict: Dict[str, callable] = None):
        super().__init__(legacy_callbacks_dict)
        # Only the disabled figure-saving code in on_episode_end uses these.
        self._train_counter = 0
        self._eval_counter = 0

    def on_episode_end(
        self,
        *,
        worker: RolloutWorker,
        base_env: BaseEnv,
        policies: Dict[str, Policy],
        episode: EpisodeV2,
        env_index: int,
        **kwargs
    ):
        """Copy statistics from the finished sub-environment onto the episode.

        ``asset_memory`` and ``reward_memory`` go into ``episode.hist_data``.
        ``win_count``, ``total_cost`` and the last ``asset_memory`` entry go
        into ``episode.custom_metrics``.
        """
        finished_env = base_env.get_sub_environments()[env_index]

        episode.hist_data.update(
            asset_memory=finished_env.asset_memory,
            reward_memory=finished_env.reward_memory,
        )
        episode.custom_metrics.update(
            win_count=finished_env.win_count,
            total_cost=finished_env.total_cost,
            final_asset=finished_env.asset_memory[-1],
        )

        # Disabled: stack-plot of the softmaxed action history, saved per episode.
        # a = np.array(env.action_memory).transpose()
        # a = softmax(a, axis=0)
        # fig, ax = plt.subplots(figsize=(16, 7))
        # plt.stackplot(np.arange(a.shape[1]), a)

        # in_eval =  worker.policy_config["in_evaluation"]
        # dqn_type = worker.policy_config["q_model_config"]["type"]
        # if in_eval:
        #     fig.savefig(f"./{dqn_type}_{self._eval_counter}_{episode.episode_id:05d}.png")
        #     self._eval_counter += 1
        # else:
        #     if self._train_counter % 10 == 0:
        #         fig.savefig(f"./{dqn_type}_{self._train_counter}_{episode.episode_id:05d}.png")
        #     self._train_counter += 1

In [10]:
# Start from the algorithm's default config and override the experiment
# settings. The two `tune.grid_search` entries below are expanded by Tune into
# 2 x 2 = 4 trials.
param_space = MySACConfig().to_dict()
param_space.update(
    {
        "framework": "torch",
        "num_gpus": 0.25,
        "num_workers": 0,
        #"num_cpus_per_worker": 1,

        # NOTE(review): the env is referenced by its registered name, but no
        # registration call is visible in this notebook (cell 8 is missing) --
        # confirm "MultiStockTrading" is registered before this cell runs.
        "env": "MultiStockTrading",
        "env_config": {
            "df": dfs_train,
        # "verbose": True,
        },
        #"seed": 114,

        "policy_model_config": {
            "lstm_dim": 64,
            "net_arch": tune.grid_search([[128, 128], [64, 64]]),
        },
        "q_model_config": {
            "type": "iqn",
            "lstm_dim": 64,
            "num_atoms": 50,
            "net_arch": tune.grid_search([[128, 128], [64, 64]]),
            "num_critics": 1,
            # cqn
            "vmin": -10.0,
            "vmax": 10.,
            # iqn
            "risk_distortion_measure": None,
            "cos_embedding_dim": 64,
        },

        #"training_intensity": 1,
        "train_batch_size": 256,
        "tau": 0.01,
        "target_network_update_freq": 1,
        "target_entropy": "auto",
        "n_step": 1,
        "grad_clip": 40,
        "min_sample_timesteps_per_iteration": 200,
        # This key was listed twice with the same value; kept once.
        "num_steps_sampled_before_learning_starts": 256,
        "metrics_num_episodes_for_smoothing": 5,
        "callbacks": MyCallbacks,

        "optimization": {
            "actor_learning_rate": 0.003,
            "critic_learning_rate": 0.003,
            "entropy_learning_rate": 0.003,
        },

        # Evaluation runs on the held-out test frame with exploration off.
        "evaluation_interval": 3,
        "evaluation_duration": 1,
        "evaluation_duration_unit": "episodes",
        "evaluation_num_workers": 1,
        "evaluation_config": {
            "explore": False,
            "env_config": {
                "df": dfs_test,
            },
        }
    }
)
#param_space

In [11]:
# Stop criteria passed to Ray Tune as a dict of result-metric thresholds.
stop_criteria = {
    "episode_reward_mean": 10,
    "timesteps_total": 1000000,
}

# NOTE(review): "disgonal" in the experiment name looks like a typo for
# "diagonal"; it is left unchanged because the name is a runtime value.
run_config = air.RunConfig(
    name="norm_net_arch_with_rsample_disgonal_gaussian",
    progress_reporter=JupyterNotebookReporter(),
    stop=stop_criteria,
    callbacks=[TBXLoggerCallback()],
)

# num_samples=1 runs each grid-search combination of `param_space` once.
tuner = tune.Tuner(
    MySAC,
    param_space=param_space,
    tune_config=tune.TuneConfig(num_samples=1),
    run_config=run_config,
)
results = tuner.fit()

2023-03-14 00:03:52,136	ERROR tune.py:794 -- Trials did not complete: [MySAC_MultiStockTrading_0ab24_00000, MySAC_MultiStockTrading_0ab24_00001, MySAC_MultiStockTrading_0ab24_00002, MySAC_MultiStockTrading_0ab24_00003]
2023-03-14 00:03:52,137	INFO tune.py:798 -- Total run time: 3267.04 seconds (3265.84 seconds for the tuning loop).


In [10]:
# result = results.get_best_result(metric="episode_reward_mean", mode="max")
# cp = result.best_checkpoints[0][0]
# algo = MySAC.from_checkpoint(cp)