<a href="https://colab.research.google.com/github/eunsun53/ReinforcementLearning_predict-of-stock/blob/main/Reinforcement_Learning_prediction_of_stock_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reinforcement Learning을 이용한 주식 가격 예측
> * model-free RL, Q-Learning 적용
> * agent가 (state, action)에 따른 reward를 피드백으로 받아 Q 함수를 업데이트하고, 이를 통해 최대의 보상을 얻을 수 있는 policy를 찾아내도록 하는 것

In [1]:
# Mount Google Drive into the Colab runtime under /content/gdrive/
from google.colab import drive 
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
cd /content/gdrive/MyDrive/dataset

/content/gdrive/MyDrive/dataset


In [3]:
cd DQL_trading

/content/gdrive/MyDrive/dataset/DQL_trading


In [4]:
import os 
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler 

In [5]:
import tensorflow as tf 
import matplotlib.pyplot as plt 

In [6]:
# load data: read only the 'close' column from each ticker's daily CSV
msft = pd.read_csv('daily_MSFT.csv', usecols = ['close']) # closing price only
ibm = pd.read_csv('daily_IBM.csv', usecols = ['close'])
qcom = pd.read_csv('daily_QCOM.csv', usecols=['close'])

In [7]:
msft.head(5)

Unnamed: 0,close
0,85.52
1,85.4
2,85.51
3,85.5
4,85.52


In [8]:
msft['close'].values[::-1] # view of the close column with the row order reversed

array([116.56, 112.62, 113.81, ...,  85.51,  85.4 ,  85.52])

In [9]:
# Stack the row-reversed close series of the three tickers into one 2-D array,
# one row per ticker in the order MSFT, IBM, QCOM.
data = np.array([frame['close'].values[::-1] for frame in (msft, ibm, qcom)])
data

array([[116.56  , 112.62  , 113.81  , ...,  85.51  ,  85.4   ,  85.52  ],
       [116.    , 112.06  , 116.    , ..., 152.5   , 152.83  , 153.0385],
       [179.3   , 162.1   , 158.    , ...,  64.73  ,  64.3   ,  64.52  ]])

In [10]:
data = np.around(data) # round every price to 0 decimals
data.shape #(3, 4526)

# Split along the column axis: the first 3526 columns are used for training,
# the remaining columns for testing.
train_data = data[:, :3526] # all rows, columns [0, 3526)
test_data = data[:, 3526:] # all rows, columns [3526, end)

# Building Environment
> - state: [(해당 주식 개수, 해당 주식 1주당 가격), 계좌 잔액]
> - action: sell(0), hold(1), buy(2)
> - reward: 보상을 주는 policy는 상황에 맞게 다양하게 정의할 수 있음
>> - 1. action 이후의 포트폴리오 가치가 높아지면 +1, 낮아지면 -1 feedback
>> - 2. action 이후의 포트폴리오 가치가 높아지면 +1, 낮아지면 -100 feedback
>> - 3. action 이후의 포트폴리오 가치의 차이만큼 feedback

In [16]:
train_data.shape

(3, 3526)

In [25]:
# Row-wise maximum of train_data (reduces along axis 1, leaving one value per row)
new = train_data.max(axis = 1)
print(new.shape)

(3,)


In [26]:
print(new)

[117. 216. 179.]


In [None]:
class TradingEnv(gym.Env):
  """Multi-stock trading environment driven by a fixed price history.

  Observation: ``[shares owned per stock..., price per stock..., cash in hand]``
  (length ``2 * n_stock + 1``).
  Action: an integer in ``[0, 3 ** n_stock)`` that indexes one combination of
  per-stock decisions, where 0 = sell, 1 = hold and 2 = buy.
  Reward: portfolio value after the step minus portfolio value before it.

  Args:
    train_data: 2-D array of prices with shape ``(n_stock, n_step)``.
    init_invest: cash available at the start of every episode.
  """

  def __init__(self, train_data, init_invest = 20000):
    self.stock_price_history = np.around(train_data)
    # rows = number of stocks, columns = number of time steps
    self.n_stock, self.n_step = self.stock_price_history.shape

    self.init_invest = init_invest
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None

    # Action space: every combination of sell/hold/buy across all stocks.
    self.action_space = spaces.Discrete(3**self.n_stock)
    # Built once here instead of on every trade; index = action id.
    self._action_combos = [
        list(combo)
        for combo in itertools.product([0, 1, 2], repeat=self.n_stock)]

    # Observation space bounds derived from the price history.
    # NOTE(review): assumes gym >= 0.10, where MultiDiscrete takes the number
    # of discrete values per dimension (nvec); hence "upper bound + 1".
    stock_max_price = self.stock_price_history.max(axis = 1) # shape (n_stock,)
    stock_range = [int(init_invest * 2 // mx) + 1 for mx in stock_max_price]
    price_range = [int(mx) + 1 for mx in stock_max_price]
    cash_in_hand_range = [int(init_invest * 2) + 1]
    self.observation_space = spaces.MultiDiscrete(
        stock_range + price_range + cash_in_hand_range)

    self._seed()
    self._reset()

  # Public gym API. Older gym releases dispatched reset()/step()/seed() to the
  # underscore-prefixed methods automatically; newer ones call these directly.
  def seed(self, seed=None):
    """Seed the environment's random number generator."""
    return self._seed(seed)

  def reset(self):
    """Start a new episode and return the initial observation."""
    return self._reset()

  def step(self, action):
    """Advance one time step; returns ``(obs, reward, done, info)``."""
    return self._step(action)

  def _seed(self, seed=None):
    self.np_random, seed = seeding.np_random(seed)
    return [seed]

  def _reset(self):
    self.cur_step = 0
    self.stock_owned = [0] * self.n_stock
    self.stock_price = self.stock_price_history[:, self.cur_step]
    self.cash_in_hand = self.init_invest
    return self._get_obs()

  def _step(self, action):
    assert self.action_space.contains(action)
    prev_val = self._get_val()
    self.cur_step += 1
    self.stock_price = self.stock_price_history[:, self.cur_step] # update price
    self._trade(action)
    cur_val = self._get_val()
    reward = cur_val - prev_val
    # The episode ends when the last column of the price history is reached.
    done = self.cur_step == self.n_step - 1
    info = {'cur_val': cur_val}
    return self._get_obs(), reward, done, info

  def _get_obs(self):
    """Return the observation as a flat list: owned, prices, cash."""
    obs = []
    obs.extend(self.stock_owned)
    obs.extend(list(self.stock_price))
    obs.append(self.cash_in_hand)
    return obs

  def _get_val(self):
    """Return the portfolio value: market value of holdings plus cash."""
    holdings = np.asarray(self.stock_owned) * self.stock_price
    return np.sum(holdings) + self.cash_in_hand

  def _trade(self, action):
    """Apply the sell/hold/buy decision encoded by ``action``."""
    # Decode the action id into one decision (0/1/2) per stock.
    action_vec = self._action_combos[action]

    sell_index = [i for i, a in enumerate(action_vec) if a == 0]
    buy_index = [i for i, a in enumerate(action_vec) if a == 2]

    # two passes: sell first, then buy; might be naive in real-world settings
    for i in sell_index:
      # Liquidate the whole position at the current price.
      self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
      self.stock_owned[i] = 0
    if buy_index:
      # Buy one share at a time, round-robin over the stocks to buy, and stop
      # after the first pass in which some stock can no longer be afforded.
      can_buy = True
      while can_buy:
        for i in buy_index:
          if self.cash_in_hand > self.stock_price[i]:
            self.stock_owned[i] += 1 # buy one share
            self.cash_in_hand -= self.stock_price[i]
          else:
            can_buy = False


In [28]:
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential 
from tensorflow.keras.optimizers import Adam

# modeling 
def mlp(n_obs, n_action, n_hidden_layer=1, n_neuron_per_layer=32,
        activation='relu', loss='mse'):
  """Build and compile a multi-layer perceptron.

  The network has one input Dense layer, ``n_hidden_layer`` further Dense
  layers of the same width, and a linear output layer with ``n_action`` units.

  Args:
    n_obs: size of the input vector (passed to Dense as ``input_dim``).
    n_action: number of output units.
    n_hidden_layer: number of Dense layers added after the input layer.
    n_neuron_per_layer: width of every non-output Dense layer.
    activation: activation used by every non-output Dense layer.
    loss: loss passed to ``model.compile``.

  Returns:
    The compiled Sequential model (Adam optimizer with default settings).
  """
  model = Sequential()
  model.add(Dense(n_neuron_per_layer, input_dim=n_obs, activation=activation))
  for _ in range(n_hidden_layer):
    model.add(Dense(n_neuron_per_layer, activation=activation))
  model.add(Dense(n_action, activation='linear'))
  model.compile(loss=loss, optimizer=Adam())
  # summary() prints the table itself and returns None, so wrapping it in
  # print() only added a stray "None" line to the output.
  model.summary()
  return model

In [29]:
import os 
from sklearn.preprocessing import StandardScaler # 평균 _, 분산_ 로 표준화 

def get_scaler(env):
  """Return a StandardScaler fitted on the two extreme observations of env.

  The scaler is fitted on exactly two rows: an all-zero lower bound and an
  upper bound made of (max shares per stock, max price per stock, max cash).
  """
  n_features = 2 * env.n_stock + 1
  price_history = env.stock_price_history

  # Cash ceiling is three times the initial investment (factor chosen by hand).
  cash_cap = env.init_invest * 3
  # Most shares that could be held if all of that cash bought at the lowest price.
  shares_cap = cash_cap // price_history.min(axis=1)

  lower = [0] * n_features
  upper = [*shares_cap, *price_history.max(axis=1), cash_cap]

  fitted = StandardScaler()
  fitted.fit([lower, upper]) # learns per-feature mean and std from the two rows
  return fitted

In [None]:
from collections import deque 
import random 

# Deep-Q agent 
class DQNAgent(object):
  """Deep Q-learning agent: epsilon-greedy policy plus experience replay."""

  def __init__(self, state_size, action_size):
    self.state_size, self.action_size = state_size, action_size
    self.memory = deque(maxlen=2000) # replay buffer, oldest entries dropped first
    self.gamma = 0.95 # discount rate
    self.epsilon = 1.0 # current exploration probability
    self.epsilon_min = 0.01 # exploration stops decaying at this value
    self.epsilon_decay = 0.995 # multiplicative decay applied per replay call
    self.model = mlp(state_size, action_size)

  def remember(self, state, action, reward, next_state, done):
    """Append one (state, action, reward, next_state, done) transition."""
    transition = (state, action, reward, next_state, done)
    self.memory.append(transition)

  def act(self, state):
    """Pick an action: random with probability epsilon, otherwise greedy."""
    explore = np.random.rand() <= self.epsilon
    if explore:
      return random.randrange(self.action_size)
    q_values = self.model.predict(state)[0]
    return np.argmax(q_values)

  def replay(self, batch_size=32):
    """Fit the model on one random minibatch drawn from the replay buffer."""
    batch = random.sample(self.memory, batch_size)

    # Split the transitions into per-field columns; stored states carry a
    # leading batch axis of size 1, which is dropped with [0].
    states, actions, rewards, next_states, done = [], [], [], [], []
    for s, a, r, s_next, finished in batch:
      states.append(s[0])
      actions.append(a)
      rewards.append(r)
      next_states.append(s_next[0])
      done.append(finished)
    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    next_states = np.array(next_states)
    done = np.array(done)

    # Bootstrapped target: r + gamma * max_a Q(s', a)
    next_q = self.model.predict(next_states)
    target = rewards + self.gamma * np.amax(next_q, axis=1)
    # Terminal transitions have no lookahead: the target is the reward itself.
    target[done] = rewards[done]

    # Start from the current predictions so that only the entry of the action
    # actually taken contributes to the loss.
    target_f = self.model.predict(states)
    target_f[np.arange(batch_size), actions] = target

    self.model.fit(states, target_f, epochs=1, verbose=0)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    """Load model weights from the file ``name``."""
    self.model.load_weights(name)

  def save(self, name):
    """Save model weights to the file ``name``."""
    self.model.save_weights(name)


In [31]:
import gym 
from gym import spaces 
from gym.utils import seeding 
import itertools
import argparse

In [None]:
# Command-line options for a training or evaluation run.
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--episode', type=int, default=2000,
                    help='number of episode to run')
parser.add_argument('-b', '--batch_size', type=int, default=32,
                    help='batch size for experience replay')
parser.add_argument('-i', '--initial_invest', type=int, default=20000,
                    help='initial investment amount')
parser.add_argument('-m', '--mode', type=str, required=True,
                    help='either "train" or "test"')
parser.add_argument('-w', '--weights', type=str, help='a trained model weights')
args = parser.parse_args()


# Output directories for model checkpoints and portfolio-value histories.
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory between the check and the call.
os.makedirs('weights', exist_ok=True)
os.makedirs('portfolio_val', exist_ok=True)

In [None]:
## main ##
import re
from datetime import datetime

# Run identifier embedded in the output file names: 12 digits (YYYYmmddHHMM),
# the same pattern that is extracted from a weights file name in test mode.
timestamp = datetime.now().strftime('%Y%m%d%H%M')

env = TradingEnv(train_data, args.initial_invest)
# An observation is [shares owned..., prices..., cash], i.e. 2 * n_stock + 1
# values; the network builder needs plain integers for both sizes.
state_size = env.n_stock * 2 + 1
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size) # create the deep Q-learning agent

scaler = get_scaler(env) # scaler for observations, fitted on the training env

portfolio_value = []

# In test mode, rebuild the environment on test_data and load trained weights.
if args.mode == 'test':
    if not args.weights:
        raise ValueError('--weights is required when --mode is "test"')
    # remake the env with test data
    env = TradingEnv(test_data, args.initial_invest)
    # load trained weights
    agent.load(args.weights)
    # when test, the timestamp is same as time when weights was trained;
    # if the file name carries no 12-digit stamp, keep the current one.
    found = re.findall(r'\d{12}', args.weights)
    if found:
        timestamp = found[0]

In [None]:
import pickle
# Run args.episode episodes; in 'train' mode the agent also learns from them.
# NOTE(review): `timestamp` is read below but not assigned in this cell; it must
# already be defined when this cell runs.
for e in range(args.episode):
  state = env.reset()
  state = scaler.transform([state])
  # NOTE(review): the loop variable `time` is never read; it would shadow the
  # stdlib `time` module if that were imported.
  for time in range(env.n_step):
    action = agent.act(state) # choose an action for the current state
    next_state, reward, done, info = env.step(action) # apply the action
    next_state = scaler.transform([next_state]) # scale the next state
    if args.mode == 'train':
      agent.remember(state, action, reward, next_state, done)
    state = next_state # advance to the next state
    if done:
      print("episode: {}/{}, episode end value: {}".format(
        e + 1, args.episode, info['cur_val']))
      portfolio_value.append(info['cur_val']) # record the portfolio value at the end of the episode
      break
    if args.mode == 'train' and len(agent.memory) > args.batch_size: # train once the replay memory holds more than batch_size transitions
      agent.replay(args.batch_size)
  if args.mode == 'train' and (e + 1) % 10 == 0:  # checkpoint the weights every 10 episodes
    agent.save('weights/{}-dqn.h5'.format(timestamp))

  # Save the portfolio value history; this sits inside the episode loop, so the
  # file is rewritten after every episode.
  with open('portfolio_val/{}-{}.p'.format(timestamp, args.mode), 'wb') as fp:
    pickle.dump(portfolio_value, fp)

[참고 깃허브](https://github.com/llSourcell/Q-Learning-for-Trading)