In [1]:
# Change the working directory two levels up (presumably to the repository root, so
# that the project imports and relative paths in later cells resolve — confirm).
%cd ../..

E:\システムトレード入門\trade_system_git_workspace


In [2]:
from tqdm.notebook import tqdm

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytz import timezone
import datetime
from pathlib import Path
import pandas as pd
import numpy as np
import pfrl

In [4]:
from scipy.special import softmax

In [5]:
from get_stock_price import StockDatabase

In [6]:
from portfolio.trade_transformer import PortfolioTransformer, PortfolioRestrictorIdentity, FeeCalculatorFree
from portfolio.price_supply import StockDBPriceSupplier

In [7]:
from portfolio.rl_base.envs import TradeEnv, TickerSampler, DatetimeSampler, SamplerManager, PortfolioVectorSampler
from portfolio.rl_base.basis_func import ComposeFunction, PriceNormalizeConst, MeanCostPriceNormalizeConst, State2Feature

In [27]:
from visualization import visualize_portfolio_transform_bokeh

## グローバルパラメーター

In [8]:
# Timezone used to localize the sampling range below.
jst = timezone("Asia/Tokyo")
# Datetime range handed to DatetimeSampler (both ends localized to Asia/Tokyo).
start_datetime = jst.localize(datetime.datetime(2020,11,10,0,0,0))
end_datetime = jst.localize(datetime.datetime(2020,11,20,0,0,0))
# Number of tickers drawn by TickerSampler; the networks are built with ticker_number+1 outputs.
ticker_number = 9
# Passed to TradeEnv as `window`; presumably the look-back offsets of an observation — TODO confirm.
window = np.arange(0,20)
# Passed to DatetimeSampler and StockDBPriceSupplier as the episode length.
episode_length = 300
# Frequency string passed to DatetimeSampler and StockDBPriceSupplier;
# presumably a pandas offset alias meaning 5 minutes — TODO confirm.
freq_str = "5T"

## 環境の作成 

In [9]:
# Path of the stock database that is passed to StockDatabase.
db_path = Path("db/sub_stock_db/nikkei_255_stock.db")

# Read the ticker list and keep its "code" column as a list of strings.
ticker_codes_df = pd.read_csv(Path("portfolio/rl_base/nikkei225_modified.csv"), header=0)  # hand-made file
ticker_codes = ticker_codes_df["code"].values.astype(str).tolist()

In [10]:
# Stock database handle shared by the price supplier below.
stock_db = StockDatabase(db_path)

# Samplers: which tickers to trade and (together with episode_length / freq_str)
# the datetime settings of an episode.
ticker_names_sampler = TickerSampler(all_ticker_names=ticker_codes,
                                     sampling_ticker_number=ticker_number)



start_datetime_sampler = DatetimeSampler(start_datetime=start_datetime,
                                         end_datetime=end_datetime,
                                         episode_length=episode_length,
                                         freq_str=freq_str
                                        )

sampler_manager = SamplerManager(ticker_names_sampler=ticker_names_sampler,
                                 datetime_sampler=start_datetime_sampler
                                )


# PriceSupplier setup
price_supplier = StockDBPriceSupplier(stock_db,
                                      [],  # no ticker codes are specified at first
                                      episode_length,
                                      freq_str,
                                      interpolate=True
                                     )

# PortfolioTransformer setup (identity restrictor, fee-free calculator, "Close" prices)
portfolio_transformer = PortfolioTransformer(price_supplier,
                                             portfolio_restrictor=PortfolioRestrictorIdentity(),
                                             use_ohlc="Close",
                                             initial_all_assets=1e6,  # irrelevant to training
                                             fee_calculator=FeeCalculatorFree()
                                            )

# TradeEnv setup: combines the transformer, the samplers and the observation window.
# NOTE(review): fee_const=0.0025 is given here while the transformer uses
# FeeCalculatorFree — presumably the fee is only applied by the environment; confirm.
trade_env = TradeEnv(portfolio_transformer,
                     sampler_manager,
                     window=window,
                     fee_const=0.0025
                    )

In [11]:
# reset() returns a 4-tuple; only the first element (the portfolio state) is kept here.
portfolio_state, _,_,_ = trade_env.reset()

In [12]:
# Sanity check of the price window shape; the recorded output is (10, 20),
# presumably (ticker_number + 1, len(window)) — confirm.
portfolio_state.price_array.shape

(10, 20)

## 前処理の設定 

In [13]:
# Preprocessing pipeline applied to every portfolio state before it reaches the agent.
# Both normalizers are created with None as their constant; the training loop assigns
# `const_array` on `price_normalizer` and `mca_normalizer` after each env reset.
state_transform = ComposeFunction({"price_normalizer":PriceNormalizeConst(None),
                                   "mca_normalizer":MeanCostPriceNormalizeConst(None),
                                   "state2feature":State2Feature()
                                  })

##  モデルの定義 

今回は決定論的な方策を利用する

In [14]:
class DPolicy(nn.Module):
    """Deterministic policy: a small CNN that maps an observation to portfolio weights.

    The network is conv -> batch-norm -> ReLU -> conv -> global average pooling,
    followed by a softmax so that the ``out_number`` outputs are non-negative and
    sum to one. The result is wrapped by ``pfrl.policies.DeterministicHead``.

    Args:
        in_channels: Number of channels of the observation tensor.
        out_number: Number of outputs (length of the action vector).
    """

    def __init__(self, in_channels=3, out_number=10):
        super().__init__()
        self.out_number = out_number

        self.conv1 = nn.Conv2d(in_channels, 6, kernel_size=5, padding=2, stride=2)
        self.bn1 = nn.BatchNorm2d(6)
        self.conv2 = nn.Conv2d(6, self.out_number, kernel_size=(4,6), padding=2, stride=(2,3))
        # Global average pooling. For a (N, C, 10, 20) input the conv stack yields a
        # 3x3 map, so this equals the former AvgPool2d(kernel_size=3); unlike the fixed
        # kernel it also reduces any other spatial size to 1x1.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.head = pfrl.policies.DeterministicHead()

    def forward(self, x):
        """Compute the action distribution for a batch of observations.

        Args:
            x: Tensor of shape (N, in_channels, H, W).

        Returns:
            The output of ``pfrl.policies.DeterministicHead`` applied to the
            (N, out_number) softmax weights.
        """
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.conv2(x)
        x = self.avgpool(x)
        # Flatten from dim 1 so the batch axis is always preserved; reshaping with
        # (-1, out_number) would silently fold leftover spatial cells into the batch.
        x = torch.flatten(x, start_dim=1)
        x = F.softmax(x, dim=-1)
        return self.head(x)

In [15]:
# Smoke-test input: 30 random observations with 3 channels of size 10x20,
# and a policy with 10 outputs.
random_x = torch.randn((30, 3, 10, 20))
policy = DPolicy(3, 10)

In [16]:
# Run the policy once and inspect the shapes of the returned distribution and of a sample.
out = policy(random_x)
print(out.batch_shape, out.event_shape)
print(out.rsample(torch.Size([])).shape)

torch.Size([30]) torch.Size([10])
torch.Size([30, 10])


In [17]:
class QFunc(nn.Module):
    """Critic: estimates a scalar Q-value for an (observation, action) pair.

    The observation goes through a CNN (two conv + batch-norm + ReLU stages and a
    global average pooling) that yields ``out_number`` features. These are
    concatenated with the action and fed to a two-layer MLP.

    Args:
        in_channels: Number of channels of the observation tensor.
        out_number: Number of observation features. The first linear layer takes
            ``out_number * 2`` inputs, i.e. the action is expected to have
            ``out_number`` elements as well.
    """

    def __init__(self, in_channels=3, out_number=10):
        super().__init__()
        self.out_number = out_number

        self.conv1 = nn.Conv2d(in_channels, 6, kernel_size=5, padding=2, stride=2)
        self.bn1 = nn.BatchNorm2d(6)
        self.conv2 = nn.Conv2d(6, self.out_number, kernel_size=(4,6), padding=2, stride=(2,3))
        self.bn2 = nn.BatchNorm2d(self.out_number)
        # Global average pooling. For a (N, C, 10, 20) input the conv stack yields a
        # 3x3 map, so this equals the former AvgPool2d(kernel_size=3); unlike the fixed
        # kernel it also reduces any other spatial size to 1x1.
        self.avgpool = nn.AdaptiveAvgPool2d(1)

        self.concat = pfrl.nn.ConcatObsAndAction()
        self.fc1 = nn.Linear(self.out_number*2, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.head = nn.Linear(64, 1)

    def forward(self, obs_and_action):
        """Compute Q-values.

        Args:
            obs_and_action: Pair ``(obs, action)`` where ``obs`` has shape
                (N, in_channels, H, W) and ``action`` has shape (N, out_number).

        Returns:
            Tensor of shape (N, 1).
        """
        obs, action = obs_and_action
        x = F.relu(self.bn1(self.conv1(obs)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.avgpool(x)
        # Flatten from dim 1 so the batch axis always matches the action batch;
        # reshaping with (-1, out_number) would fold leftover spatial cells into it.
        x = torch.flatten(x, start_dim=1)
        x = self.concat((x, action))
        x = F.relu(self.bn3(self.fc1(x)))
        return self.head(x)

In [18]:
# Smoke-test inputs for the critic: 30 random observations and 30 random 10-element actions.
random_obs = torch.randn(30, 3, 10, 20)
random_action = torch.randn(30, 10)

In [19]:
# Build a critic and evaluate it once on the random (observation, action) batch.
qfunc = QFunc(3, 10)
out = qfunc((random_obs, random_action))

In [20]:
# One Q-value per sample; the recorded output is torch.Size([30, 1]).
out.shape

torch.Size([30, 1])

## エージェントの作成 

In [21]:
# NOTE(review): this binds the `sample` method itself (it is not called), and `a` is not
# used by any other visible cell — looks like a leftover experiment.
a = PortfolioVectorSampler(ticker_number+1).sample

In [22]:
# Actor and critic sized with ticker_number+1 outputs / action elements.
# Note: `policy` rebinds the name used by the smoke-test cells above.
policy = DPolicy(in_channels=3, out_number=ticker_number+1)
q_func = QFunc(in_channels=3, out_number=ticker_number+1)

# One Adam optimizer (default hyperparameters) per network.
opt_p = torch.optim.Adam(policy.parameters())
opt_q = torch.optim.Adam(q_func.parameters())

# Replay buffer constructed with 10 ** 6 as its size argument.
rbuf = pfrl.replay_buffers.ReplayBuffer(10 ** 6)

# Disabled alternative explorer (additive Gaussian noise), kept for reference:
#explorer = pfrl.explorers.AdditiveGaussian(
#    scale=np.array([0.1]*(ticker_number+1)), low=np.array([0]*(ticker_number+1)), high=np.array([1]*(ticker_number+1))
#)

def action_sample():
    """Draw a random portfolio vector for epsilon-greedy exploration.

    A fresh ``PortfolioVectorSampler`` for ``ticker_number + 1`` elements is built
    on every call and its sample is returned as a float32 array.
    """
    sampler = PortfolioVectorSampler(ticker_number + 1)
    return sampler.sample().astype(np.float32)
    

# Epsilon-greedy exploration with a constant epsilon of 0.3; random actions come
# from action_sample().
explorer = pfrl.explorers.ConstantEpsilonGreedy(epsilon=0.3,
                                                random_action_func=action_sample
                                               )

def burnin_action_func():
    """Select random actions until model is updated one or more times.

    Draws ``ticker_number + 1`` values uniformly from [0, 1) and passes them
    through a softmax, returning a float32 vector that sums to one.
    """
    n_assets = ticker_number + 1
    logits = np.random.uniform(np.zeros(n_assets), np.ones(n_assets))
    return softmax(logits).astype(np.float32)
    
# Device id handed to the agent; presumably -1 selects the CPU in PFRL — confirm.
gpu = -1

# Observation preprocessor given to the agent: cast to float32 (copy=False, so
# NumPy may return the input itself when no conversion is needed).
phi = lambda x: x.astype(np.float32, copy=False)

# DDPG agent built from the actor/critic, their optimizers, the replay buffer and
# the epsilon-greedy explorer defined above.
ddpg_agent = pfrl.agents.DDPG(
    policy,
    q_func,
    opt_p,
    opt_q,
    rbuf,
    phi=phi,
    gamma=0.99,
    explorer=explorer,
    replay_start_size=10000,
    target_update_method="soft",
    target_update_interval=1,
    update_interval=1,
    soft_update_tau=5e-3,
    n_times_update=1,
    gpu=gpu,
    minibatch_size=16,
    burnin_action_func=burnin_action_func,
)

## 学習のための関数 

In [32]:
def episode(env,
            agent,
            state_transform,
            return_state_reward=True,
            field_list=("now_price_array", "portfolio_vector", "mean_cost_price_array", "all_assets", "datetime"),
            seed=None,
            print_span=None,
            is_observe=True):
    """Run a single episode of ``agent`` on ``env`` and optionally record it.

    Args:
        env: Environment whose ``reset(seed)`` and ``step(action)`` return a
            ``(portfolio_state, reward, done, info)`` tuple.
        agent: Agent exposing ``act(obs)`` and ``observe(obs, reward, done, reset)``.
        state_transform: Callable turning a portfolio state into an observation.
            If it exposes ``price_normalizer`` / ``mca_normalizer`` attributes,
            their ``const_array`` is set to the initial ``now_price_array`` of the
            episode, as the training loop of this notebook does.
        return_state_reward: When true, return the recorded states and rewards;
            otherwise return ``None``.
        field_list: Field names passed to ``portfolio_state.partial`` for every
            recorded state.
        seed: Seed forwarded to ``env.reset``.
        print_span: If not ``None``, print ``all_assets`` at the start, every
            ``print_span`` steps and at the end.
        is_observe: When true, feed every transition to ``agent.observe``.

    Returns:
        ``(state_list, reward_list)`` if ``return_state_reward`` is true, else ``None``.
        Both lists include the entry produced by ``env.reset``.
    """
    state_list = []
    reward_list = []

    portfolio_state, reward, _, _ = env.reset(seed)

    state_list.append(portfolio_state.partial(*field_list))
    reward_list.append(reward)

    # The normalizers are relative to the first prices of the episode. Without this
    # refresh they would still hold the constants of whatever episode ran last.
    for normalizer_name in ("price_normalizer", "mca_normalizer"):
        normalizer = getattr(state_transform, normalizer_name, None)
        if normalizer is not None:
            normalizer.const_array = portfolio_state.now_price_array

    t = 1
    # This loop never truncates an episode itself, so the `reset` flag passed to
    # agent.observe is always False. It is defined locally so that the function
    # does not depend on a global of the same name.
    reset = False

    obs = state_transform(portfolio_state)

    if print_span is not None:
        print("initial:, all_assets:{}".format(portfolio_state.all_assets))

    while True:
        action = agent.act(obs)
        portfolio_state, reward, done, info = env.step(action)

        state_list.append(portfolio_state.partial(*field_list))
        reward_list.append(reward)

        t += 1

        # Preprocess the new state into the next observation.
        obs = state_transform(portfolio_state)

        if is_observe:  # let the agent observe (and learn from) the transition
            agent.observe(obs, reward, done, reset)

        if done or reset:
            break
        if print_span is not None:
            if t%print_span==0:
                print("\tt={}:, all_assets:{}".format(t,portfolio_state.all_assets))

    if print_span is not None:
        print("finished(t={}):, all_assets:{}".format(t, portfolio_state.all_assets))

    if return_state_reward:
        return state_list, reward_list

## 学習のパラメーター

In [23]:
# Number of training episodes run by the training loop.
n_episodes = 1000

## 学習のイテレーション 

In [24]:
for i in tqdm(range(1, n_episodes + 1)):
    # Start a fresh episode.
    portfolio_state, reward, done, _ = trade_env.reset()

    # Both normalizers use the first prices of the episode as their constant.
    state_transform.price_normalizer.const_array = portfolio_state.now_price_array
    state_transform.mca_normalizer.const_array = portfolio_state.now_price_array

    # Initial observation.
    obs = state_transform(portfolio_state)

    R, t = 0, 0  # return (sum of rewards) and time step
    reset = False
    done = False  # the flag returned by reset() is ignored; at least one step is taken
    while not done:
        action = ddpg_agent.act(obs)
        portfolio_state, reward, done, _ = trade_env.step(action)
        R += reward
        t += 1

        # Preprocess the new state and let the agent learn from the transition.
        obs = state_transform(portfolio_state)
        ddpg_agent.observe(obs, reward, done, reset)

    if i % 50 == 0:
        print("episode:{}, return:{}".format(i, R))
    if i % 100 == 0:
        print("statistics:", ddpg_agent.get_statistics())

print("Finshed")

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

episode:50, return:-2.5373073159471984
episode:100, return:2.8179287579007335
statistics: [('average_q', 1190.3986), ('average_actor_loss', -1182.85576171875), ('average_critic_loss', 156036.4828692627), ('n_updates', 20001)]
episode:150, return:-7.096078257791479
episode:200, return:1.0051607679437173
statistics: [('average_q', 11705.611), ('average_actor_loss', -11696.452080078125), ('average_critic_loss', 3356313.3971484373), ('n_updates', 50001)]
episode:250, return:-1.2572282760410673
episode:300, return:-5.675385110644565
statistics: [('average_q', 31812.72), ('average_actor_loss', -31818.08611328125), ('average_critic_loss', 35860090.96359375), ('n_updates', 80001)]
episode:350, return:1.5167041286263223
episode:400, return:4.601706630777761
statistics: [('average_q', 51735.85), ('average_actor_loss', -51737.36796875), ('average_critic_loss', 59193258.204375), ('n_updates', 110001)]
episode:450, return:-0.7664046710155491
episode:500, return:0.11649969583892476
statistics: [('av

## 学習結果の可視化

In [35]:
# Roll out one episode inside the agent's eval_mode() context without feeding
# transitions to the agent (is_observe=False), recording the listed state fields.
with ddpg_agent.eval_mode():
    state_list, reward_list = episode(trade_env, 
                                      ddpg_agent,
                                      state_transform,
                                      return_state_reward=True,
                                      field_list=["names", "now_price_array", "portfolio_vector", "mean_cost_price_array", "all_assets", "datetime"],
                                      is_observe=False
                                     )
    
# Plot the recorded states with the project's bokeh helper.
visualize_portfolio_transform_bokeh(state_list)