In [1]:
import sys
# Extend the import search path so the modules imported in later cells can be found.
# NOTE(review): both entries are machine-specific absolute paths (presumably a local
# pfrl checkout and the project workspace) — adjust them when running elsewhere.
sys.path.append(r"E:\システムトレード入門\tutorials\rl\pfrl")
sys.path.append(r"E:\システムトレード入門\trade_system_git_workspace")

In [2]:
import numpy as np
import collections
from bokeh.io import output_notebook
output_notebook()

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
import datetime
from pytz import timezone

In [5]:
from pathlib import Path

In [48]:
from contextlib import contextmanager

In [6]:
import pfrl

In [7]:
from get_stock_price import StockDatabase

In [8]:
from envs_ver2 import OneStockEnv, NormalizeState, NormalizeReward

In [9]:
from visualize_trading_process_ver1 import plot_trading_process_matplotlib, plot_trading_process_bokeh

In [10]:
# Path of the stock price database file, then the database wrapper built on it.
# NOTE(review): absolute, machine-specific location.
db_path = Path("E:/システムトレード入門/trade_system_git_workspace/db/stock_db") / "stock.db"
stock_db = StockDatabase(db_path)

### 環境の作成 

In [11]:
# Data period handed to the environment, expressed in Japan Standard Time.
jst_timezone = timezone("Asia/Tokyo")
start_datetime = jst_timezone.localize(datetime.datetime(2020, 11, 1))
end_datetime = jst_timezone.localize(datetime.datetime(2020, 12, 1))
# Alternative kept from the original (one week of business days, i.e. 5 days):
# end_datetime = get_next_workday_jp(start_datetime, days=11)

# Stock code(s) passed to the environment; other candidates kept for reference.
stock_names = "4755"
# stock_names = "9984"
# stock_names = ["6502", "4755"]
# stock_list = ["4755", "9984", "6701", "7203", "7267"]

# Price column used by the environment (the closing price).
use_ohlc = "Close"

In [12]:
# Starting portfolio handed to the environment.
initial_cash = 1.e6  # seed money: 1,000,000 yen
initial_unit = 50

# Sampling frequency string and number of steps in one episode.
freq_str = "5T"
episode_length = 12 * 5 * 7  # one week, per the original note

# Earlier, shorter variant kept for reference:
# state_time_list = [0, 1, 12, 12*3, 12*5, 12*5*3]  # now, next step, 1 h, 3 h, 5 h (1 day), 15 h (3 days)

# Time offsets (in sampling steps) of the prices that make up the state.
# The labels follow the original notes ("N later"); whether an offset is a lag
# or a lead is decided by the environment, not here.
state_time_list = [
    0,           # now
    1,           # 5 minutes
    2,           # 10 minutes
    6,           # 30 minutes
    12,          # 1 hour
    12 * 2,      # 2 hours
    12 * 3,      # 3 hours
    12 * 4,      # 4 hours
    12 * 5,      # 1 day (5 hours)
    12 * 5 * 2,  # 2 days
    12 * 5 * 3,  # 3 days
    12 * 5 * 4,  # 4 days
    12 * 5 * 5,  # 5 days
]

one_unit_stocks = 20          # custom trading-unit size
max_units_number = 5          # custom units that can be traded at once
stay_penalty_unit_bound = 30  # "stay" is penalised at or below this unit count


# Build the single-stock trading environment from the settings defined above.
env = OneStockEnv(
    stock_db,
    stock_names=stock_names,
    start_datetime=start_datetime,
    end_datetime=end_datetime,
    # Use the configured frequency rather than repeating the "5T" literal, so that
    # changing freq_str in the configuration cell actually takes effect.
    freq_str=freq_str,
    episode_length=episode_length,  # one week
    state_time_list=state_time_list,
    use_ohlc=use_ohlc,  # use the closing price
    initial_cash=initial_cash,  # seed money: 1,000,000 yen
    initial_unit=initial_unit,
    use_view=False,
    one_unit_stocks=one_unit_stocks,  # custom trading-unit size
    max_units_number=max_units_number,  # custom units that can be traded at once
    low_limmit=1.e4,  # episode ends when total assets fall to or below this value
    interpolate=True,
    stay_penalty_unit_bound=stay_penalty_unit_bound,  # "stay" at or below this unit count is penalised
)

### ルールベースの方策を持つエージェント 

In [54]:
class RuleAgent():
    """Rule-based trading agent that chooses an action by simple voting.

    Two rules each cast a vote (+1 = buy, -1 = sell, 0 = stay):

    * the mean acquisition price compared with the current price, and
    * the least-squares slope of ``observe.price_array`` against
      ``state_time_list`` compared with ``gradient_bound``.

    A positive total selects the maximum-buy action, a negative total the
    maximum-sell action, anything else the middle ("stay") action.
    """

    def __init__(self, state_time_list, action_size):
        """Set up the action indices and the fixed part of the trend regression.

        Args:
            state_time_list: time offsets, one per entry of ``observe.price_array``.
            action_size: number of discrete actions; must be odd so that a
                middle "stay" action exists.

        Raises:
            ValueError: if ``action_size`` is even.
            numpy.linalg.LinAlgError: if the normal matrix built from
                ``state_time_list`` is singular (e.g. fewer than two distinct
                offsets).
        """
        if action_size % 2 == 0:  # an even size has no middle action
            raise ValueError("action size must be odd")

        self.state_time_array = np.array(state_time_list)

        self.sell_index = 0  # index of "sell as much as possible"
        self.stay_index = action_size // 2  # middle index
        self.buy_index = action_size - 1  # index of "buy as much as possible"

        self.gradient_bound = 0.05

        # The design matrix [t, 1] and the inverse of its normal matrix depend
        # only on the time offsets, so they are built once here instead of on
        # every call to act().
        self._design_matrix = np.stack(
            [self.state_time_array, np.ones(len(self.state_time_array))], axis=1
        )
        self._inv_normal_matrix = np.linalg.inv(
            np.dot(self._design_matrix.T, self._design_matrix)
        )

    def _trend_slope(self, price_array):
        """Return the least-squares slope of ``price_array`` over the time offsets."""
        Atb = np.dot(self._design_matrix.T, price_array)
        x_hat = np.dot(self._inv_normal_matrix, Atb)
        return x_hat[0]

    def act(self, observe):
        """Return the action index (``np.int32``) chosen for ``observe``.

        ``observe`` must provide ``cash``, ``unit_number``, ``mean_cost_price``,
        ``now_price`` and ``price_array``.
        """
        cash = observe.cash
        unit_number = observe.unit_number
        vote_value = 0  # running total of the votes

        # Vote 1: mean acquisition price versus the current price.
        mean_cost_price = observe.mean_cost_price
        now_price = observe.now_price
        if mean_cost_price > now_price:  # vote to buy
            vote_value += 1
        elif mean_cost_price < now_price:  # vote to sell
            vote_value -= 1
        # equal prices: vote to stay (no change)

        # Vote 2: trend estimated by linear regression.
        slope = self._trend_slope(observe.price_array)
        bound = abs(self.gradient_bound)
        if slope > bound:  # rising trend: vote to buy
            vote_value += 1
        elif slope < -bound:  # falling trend: vote to sell
            vote_value -= 1
        # otherwise: vote to stay (no change)

        # Turn the total into an action.
        # NOTE(review): the buy guard only checks that cash is positive, not that
        # a maximum-size purchase is affordable; the recorded run in this notebook
        # shows cash going negative — confirm whether that is intended.
        if vote_value > 0 and not cash <= 0:  # buy
            return np.int32(self.buy_index)
        elif vote_value < 0 and not unit_number <= 0:  # sell
            return np.int32(self.sell_index)
        else:
            return np.int32(self.stay_index)

    def observe(self, observe, reward, done, reset):
        """Do nothing; this agent does not learn.

        The signature matches the ``agent.observe(obs, reward, done, reset)``
        call made by the episode loop in this notebook.
        """
        pass

    @contextmanager
    def eval_mode(self):
        """Context manager with no effect; yields ``None``."""
        yield None

### エージェントの作成 

In [55]:
# Rule-based agent sized to the environment's discrete action space.
rule_agent = RuleAgent(
    state_time_list=state_time_list,
    action_size=env.action_space.n,
)

### 描画する関数 

In [56]:
def _print_episode_status(t, obs, info):
    """Print one tab-indented status line for step ``t``."""
    print("\tt:{},all_property:{}, unit_number:{}, price:{}, penalty:{}, cash:{}".format(t,
                                                                                         info["all_property"],
                                                                                         obs.unit_number,
                                                                                         obs.now_price,
                                                                                         info["penalty"],
                                                                                         obs.cash
                                                                                        ))


def episode(env, agent, state_transform=None, reward_transform=None, print_span=None, is_observe=True):
    """Run one episode of ``agent`` on ``env`` and record what happened.

    Args:
        env: environment whose ``reset()`` returns ``(obs, _, _, info)`` and
            whose ``step(action)`` returns ``(obs, reward, done, info)``.
        agent: object with ``act(obs)`` and ``observe(obs, reward, done, reset)``.
        state_transform: optional callable applied to each observation before
            it is given to the agent; the raw observation is what gets recorded.
        reward_transform: optional callable applied to each reward before it is
            given to ``agent.observe``.
        print_span: if not ``None``, print a status line at the start, at every
            step ``t`` with ``t % print_span == 0``, and at the end.
        is_observe: if true, call ``agent.observe`` after every step.

    Returns:
        ``(state_list, info_list, action_list)``: the raw observations and infos
        (initial one included, so one longer than the actions) and the actions
        taken.
    """
    verbose = print_span is not None

    state_list = []
    info_list = []
    action_list = []

    obs, _, _, info = env.reset()
    state_list.append(obs)
    info_list.append(info)

    t = 1
    if verbose:
        _print_episode_status(t, obs, info)

    normalized_obs = obs if state_transform is None else state_transform(obs)

    while True:
        action = agent.act(normalized_obs)
        action_list.append(action)
        obs, reward, done, info = env.step(action)
        t += 1
        # Always False: this loop has no external reset/truncation signal.
        reset = False

        # Pre-process the state and the reward.
        normalized_obs = obs if state_transform is None else state_transform(obs)
        normalized_reward = reward if reward_transform is None else reward_transform(reward)

        if is_observe:  # let the agent observe (learn from) the transition
            agent.observe(normalized_obs, normalized_reward, done, reset)

        state_list.append(obs)
        info_list.append(info)

        if done or reset:
            break
        if verbose and t % print_span == 0:
            _print_episode_status(t, obs, info)
            print("\taction_counter:",collections.Counter(action_list))

    if verbose:
        _print_episode_status(t, obs, info)
        print("\taction_counter:",collections.Counter(action_list))
        print("finished. episode length: {}".format(t))
    return state_list, info_list, action_list

In [69]:
# Run one seeded episode with the rule-based agent; no transforms, no observe calls.
env.seed(10)
with rule_agent.eval_mode():
    state_list, info_list, _ = episode(
        env,
        rule_agent,
        state_transform=None,
        reward_transform=None,
        print_span=100,
        is_observe=False,
    )

	t:1,all_property:2122000.0, unit_number:50, price:1122.0, penalty:0, cash:1000000.0
	t:100,all_property:2141400.0, unit_number:50, price:1097.0, penalty:0, cash:1044400.0
	action_counter: Counter({5: 53, 0: 23, 10: 23})
	t:200,all_property:2142100.0, unit_number:0, price:1130.0, penalty:-200000.0, cash:2142100.0
	action_counter: Counter({5: 143, 0: 33, 10: 23})
	t:300,all_property:2143500.0, unit_number:0, price:1123.0, penalty:-200000.0, cash:2143500.0
	action_counter: Counter({5: 225, 0: 42, 10: 32})
	t:400,all_property:2124500.0, unit_number:100, price:1100.0, penalty:-200000.0, cash:-75500.0
	action_counter: Counter({5: 297, 10: 56, 0: 46})
	t:420,all_property:2122500.0, unit_number:100, price:1099.0, penalty:-200000.0, cash:-75500.0
	action_counter: Counter({5: 317, 10: 56, 0: 46})
finished. episode length: 420


### 売買過程を描画 

In [70]:
# Plot the trading process recorded in state_list / info_list with the bokeh helper
# (is_save=False is passed; presumably this skips writing the figure to disk — confirm).
plot_trading_process_bokeh(state_list, info_list, env, is_save=False)

### 強化学習のエージェント

In [59]:
class QFunction(nn.Module):
    """Fully connected Q-network.

    Three Linear -> BatchNorm1d -> ReLU stages (widths 32, 128, 256) followed
    by a linear head with one output per action. ``forward`` wraps the head
    output in ``pfrl.action_value.DiscreteActionValue``.
    """

    def __init__(self, obs_size, n_actions):
        super().__init__()
        # Attribute names (fc1, bn1, ..., fc4) and their creation order are kept
        # fixed: they define the state_dict keys and the parameter order.
        widths = (obs_size, 32, 128, 256)
        for stage, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, "fc{}".format(stage), nn.Linear(n_in, n_out))
            setattr(self, "bn{}".format(stage), nn.BatchNorm1d(n_out))
        self.fc4 = nn.Linear(widths[-1], n_actions)

    def forward(self, x):
        """Return the action values for the batch ``x``."""
        hidden = x
        stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, batch_norm in stages:
            hidden = F.relu(batch_norm(linear(hidden)))

        q_values = self.fc4(hidden)
        return pfrl.action_value.DiscreteActionValue(q_values)

In [60]:
# Network dimensions are read from the environment's spaces.
obs_size = env.observation_space.low.size
n_actions = env.action_space.n
print("observation size:", obs_size)
print("action size:", n_actions)

q_func = QFunction(obs_size, n_actions)

observation size: 16
action size: 11


In [62]:
# Optimiser over the Q-network parameters (Adam with eps=1e-3).
optimizer = torch.optim.Adam(q_func.parameters(), eps=1e-3)

# Value handed to the agent as `gamma`.
gamma = 0.95

# Constant epsilon-greedy exploration that samples random actions from the env.
init_episilon = 0.3  # (sic) name kept as in the original
init_explorer = pfrl.explorers.ConstantEpsilonGreedy(
    epsilon=init_episilon,
    random_action_func=env.action_space.sample,
)

replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=10**6)


def phi_func(observe):
    """Convert an observation to float32 via ``observe.to_numpy()``.

    ``astype`` is called with ``copy=False``.
    """
    return observe.to_numpy().astype(np.float32, copy=False)


# Observation pre-processor and device selection for the agent.
phi = phi_func
gpu = -1  # -1 is cpu

# Double DQN agent assembled from the pieces defined above.
rl_agent = pfrl.agents.DoubleDQN(
    # model and optimisation
    q_function=q_func,
    optimizer=optimizer,
    gamma=gamma,
    # experience and exploration
    replay_buffer=replay_buffer,
    explorer=init_explorer,
    # update schedule
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
    # input handling and device
    phi=phi,
    gpu=gpu,
)

#### エージェントのロード 

In [63]:
# Load a previously saved agent from agents/<folder_name> (relative to the working directory).
folder_name = "2020_12_25__04_06_26"
load_path = Path("agents") / folder_name
rl_agent.load(load_path)

#### 前処理用のクラス 

In [64]:
# Pre-processing objects for observations and rewards, built from fixed constants.
state_transform = NormalizeState(
    cash_const=initial_cash,
    unit_const=100,
    price_const=1.e4,
)

reward_transform = NormalizeReward(reward_const=5.e3)

In [67]:
# Run one seeded episode with the loaded RL agent in evaluation mode.
# Observations go through state_transform; rewards are not transformed and
# agent.observe is not called.
env.seed(10)
with rl_agent.eval_mode():
    state_list, info_list, _ = episode(
        env,
        rl_agent,
        state_transform=state_transform,
        reward_transform=None,
        print_span=100,
        is_observe=False,
    )

	t:1,all_property:2122000.0, unit_number:50, price:1122.0, penalty:0, cash:1000000.0
	t:100,all_property:2077640.0, unit_number:97, price:1097.0, penalty:-500000.0, cash:-50540.0
	action_counter: Counter({6: 77, 0: 14, 10: 8})
	t:200,all_property:2123780.0, unit_number:25, price:1130.0, penalty:0, cash:1558780.0
	action_counter: Counter({6: 114, 0: 42, 7: 24, 10: 10, 2: 9})
	t:300,all_property:2142440.0, unit_number:19, price:1123.0, penalty:0, cash:1715700.0
	action_counter: Counter({6: 153, 0: 69, 7: 40, 10: 24, 2: 13})
	t:400,all_property:2109260.0, unit_number:95, price:1100.0, penalty:0, cash:19260.0
	action_counter: Counter({6: 192, 0: 93, 10: 53, 7: 46, 2: 14, 8: 1})
	t:420,all_property:2107520.0, unit_number:97, price:1099.0, penalty:-500000.0, cash:-24540.0
	action_counter: Counter({6: 209, 0: 96, 10: 53, 7: 46, 2: 14, 8: 1})
finished. episode length: 420


In [68]:
# Plot the trading process of the most recent episode (state_list / info_list were
# overwritten by the RL-agent run in the previous cell) with the bokeh helper.
plot_trading_process_bokeh(state_list, info_list, env, is_save=False)