# Algorithm Implement

In [91]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
time_period = 100
class Q_Network(nn.Module):
    '''
    The input of this network should have shape (num_frame, 80, 80)
    '''

    def __init__(self, num_frame, num_action, N, Vmin, Vmax):
        super(Q_Network, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=num_frame, out_channels=32, kernel_size=(2,1), stride=1, padding=2)  # 16, 20, 20
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv5 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,2), stride=1)  # 32, 9, 9
        self.pool = nn.AvgPool2d(kernel_size=(2,1))
        self.fc1 = nn.Linear(480*12, 256)
        self.fc2 = nn.Linear(256, num_action*N)
        self.action_size = num_action
        self.N = N
        self.values = torch.linspace(Vmin, Vmax, N).view(1, 1, -1).to('cuda')

    def forward(self, image):
        x = F.relu(self.pool(self.conv1(image)))
        x = F.relu(self.pool(self.conv2(x)))
        x = F.relu(self.pool(self.conv3(x)))
        x = x.view(-1, 480*12)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.view(-1, self.action_size, self.N)
        log_probs = F.log_softmax(x, dim=2)  # (batch_size, action_size, N)
        Q_values = log_probs.exp() * self.values
        Q_values = Q_values.sum(dim=2, keepdims=False)
        return log_probs, Q_values

In [75]:
from torchsummary import summary

# Data Loading

In [48]:
pwd

'E:\\lab\\CP\\Reinforcement\\C51'

In [49]:
data = pd.read_csv('../../min_data_adjust.csv')

In [50]:
stock_data = data[data['symbol']=='AAPL']

In [51]:
stock_data['timestamp'][0]

'2022-01-03 09:00:00+00:00'

In [52]:
stock_df = stock_data[stock_data['timestamp']>'2023-08-31']

In [105]:
stock_df['time'] = pd.to_datetime(stock_df['timestamp'])

In [114]:
stock_df['time'] = stock_df['time'].dt.tz_convert('us/eastern').dt.strftime('%Y-%m-%d %H:%M:%S')

In [133]:
stock_df['hour'] = [x[11:] for x in stock_df['time']]

In [134]:
stock_df['hour']

326926    05:28:00
326927    05:36:00
326928    05:41:00
326929    05:43:00
326930    05:45:00
            ...   
343432    19:52:00
343433    19:53:00
343435    19:57:00
343436    19:58:00
343437    19:59:00
Name: hour, Length: 15149, dtype: object

In [138]:
idx = (stock_df['hour']>='09:29:59') & (stock_df['hour']<='16:00:00')

In [54]:
stock_df['pctchange'] = (stock_df['close'] - stock_df['open'])/stock_df['open']

In [140]:
stock_df = stock_df[idx]

# Technical Indicators

In [55]:
from finta import TA

In [56]:
stock_df['SMA42'] = TA.SMA(stock_df, 42)
stock_df['SMA5'] = TA.SMA(stock_df, 5)
stock_df['SMA15'] = TA.SMA(stock_df, 15)
stock_df['AO'] = TA.AO(stock_df)
stock_df['OVB'] = TA.OBV(stock_df)
stock_df[['VW_MACD','MACD_SIGNAL']] = TA.VW_MACD(stock_df)
stock_df['RSI'] = TA.RSI(stock_df)
stock_df['CMO'] = TA.CMO(stock_df)

In [57]:
stock_df = stock_df.dropna()

In [58]:
stock_df.columns

Index(['symbol', 'timestamp', 'open', 'high', 'low', 'close', 'volume',
       'trade_count', 'vwap', 'pctchange', 'SMA42', 'SMA5', 'SMA15', 'AO',
       'OVB', 'VW_MACD', 'MACD_SIGNAL', 'RSI', 'CMO'],
      dtype='object')

In [149]:
stock_df_train = stock_df[stock_df['timestamp']<'2023-09-23']
stock_df_test = stock_df[stock_df['timestamp']>='2023-09-23']

# Replay Buffer

In [60]:
from collections import deque

test = deque(maxlen=5)
for i in range(10):
    test.append(i)
    print(test)

deque([0], maxlen=5)
deque([0, 1], maxlen=5)
deque([0, 1, 2], maxlen=5)
deque([0, 1, 2, 3], maxlen=5)
deque([0, 1, 2, 3, 4], maxlen=5)
deque([1, 2, 3, 4, 5], maxlen=5)
deque([2, 3, 4, 5, 6], maxlen=5)
deque([3, 4, 5, 6, 7], maxlen=5)
deque([4, 5, 6, 7, 8], maxlen=5)
deque([5, 6, 7, 8, 9], maxlen=5)


In [152]:
# from networks import *

import random
from collections import deque
import torch
import torch.optim as optim
import numpy as np


class Agent:

    def __init__(self, state_size, action_size, bs, lr, tau, gamma, N, Vmin, Vmax, device, visual=False):
        '''
        When dealing with visual inputs, state_size should work as num_of_frame
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.bs = bs
        self.lr = lr
        self.tau = tau
        self.gamma = gamma
        self.device = device
        self.N = N
        self.Vmin = Vmin
        self.Vmax = Vmax
        self.vals = torch.linspace(Vmin, Vmax, N).to(device)
        self.unit = (Vmax - Vmin) / (N - 1)

        self.Q_local = Q_Network(self.state_size, self.action_size, N, Vmin, Vmax).to(self.device)
        self.Q_target = Q_Network(self.state_size, self.action_size, N, Vmin, Vmax).to(self.device)

        self.soft_update(1)
        self.optimizer = optim.Adam(self.Q_local.parameters(), self.lr)
        self.memory = deque(maxlen=100000)

    def act(self, state, eps=0):
        if random.random() > eps:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            with torch.no_grad():
                _, action_values = self.Q_local(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        experiences = random.sample(self.memory, self.bs)
        states = torch.from_numpy(np.vstack([e[0] for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e[1] for e in experiences])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e[2] for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e[3] for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e[4] for e in experiences]).astype(np.uint8)).float().to(self.device)

        log_probs, _ = self.Q_local(states) #(batch_size, action_size, N)
        log_probs = torch.gather(input=log_probs, dim=1, index=actions.unsqueeze(1).repeat(1, 1, self.N)) #(batch_size, 1, N)

        with torch.no_grad():
            log_probs_targets, Q_targets = self.Q_target(next_states)
            _, actions_target = torch.max(input=Q_targets, dim=1, keepdim=True)#(batch_size, 1) the same size as actions
            log_probs_targets = torch.gather(input=log_probs_targets, dim=1, index=actions_target.unsqueeze(1).repeat(1, 1, self.N))
            # print(log_probs_targets.shape)
            target_distribution = self.update_distribution(log_probs_targets.exp(), rewards, dones) #(batch_size, 1, N)

        loss = -target_distribution*log_probs #D_KL(target||local)
        #loss = -log_probs.exp()*((target_distribution+1e-9).log() - log_probs) #D_KL(local||target)

        loss = loss.sum(dim=2, keepdims=False).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_distribution(self, old_distribution, reward, dones):
        with torch.no_grad():
            reward = reward.view(-1, 1)
            batch_size = reward.size(0)
            assert old_distribution.size(0) == batch_size
            new_vals = self.vals.view(1, -1) * self.gamma * (1-dones) + reward
            new_vals = torch.clamp(new_vals, self.Vmin, self.Vmax)
            lower = torch.floor((new_vals - self.Vmin) / self.unit).long().to(self.device)
            upper = torch.min(lower + 1, other=torch.tensor(self.N - 1)).to(self.device)
            lower_vals = self.vals[lower]
            lower_probs = 1 - torch.min((new_vals - lower_vals) / self.unit, other=torch.tensor(1, dtype=torch.float32)).to(self.device)
            transit = torch.zeros((batch_size, self.N, self.N)).to(self.device)
            first_dim = torch.tensor(range(batch_size), dtype=torch.long).view(-1, 1).repeat(1, self.N).view(-1).to(self.device)
            second_dim = torch.tensor(range(self.N), dtype=torch.long).repeat(batch_size).to(self.device)
            transit[first_dim, second_dim, lower.view(-1)] += lower_probs.view(-1)
            transit[first_dim, second_dim, upper.view(-1)] += 1 - lower_probs.view(-1)
            if len(old_distribution.size()) == 2:
                old_distribution = old_distribution.unsqueeze(1)
            return torch.bmm(old_distribution, transit)

    def soft_update(self, tau):
        for target_param, local_param in zip(self.Q_target.parameters(), self.Q_local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

In [153]:
# indicators = ['open', 'high', 'low', 'close', 'volume', 'positive', 'neutral', 'negative','SMA42', 'SMA5', 'SMA15', 'AO', 'OVB','VW_MACD',
#        'MACD_SIGNAL', 'RSI', 'CMO']

indicators = ['pctchange', 'volume', 'SMA42', 'SMA5', 'SMA15', 'AO', 'OVB','VW_MACD',
       'MACD_SIGNAL', 'RSI', 'CMO']

In [154]:
class Stock_Env:
    def __init__(self, initial_asset, data, cost):
        self.asset = initial_asset
        self.cash = initial_asset
        self.stock = 0
        self.data = data
        self.time = data.iloc[time_period]['timestamp']
        self.cost = cost
        self.history=[]
        self.total_cost = 0
        self.initial_asset = initial_asset
        self.rowid = time_period
        self.action_space = np.array(list(range(11)))
    
    def reset(self):
        self.asset = self.initial_asset
        self.cash = self.initial_asset
        self.stock = 0
        self.time = self.data.iloc[time_period]['timestamp']
        self.history=[]
        self.total_cost = 0    
        self.rowid = time_period
        return self.data[:time_period][indicators].values
    
    def step(self, action):
        done = False
        states = self.data.iloc[self.rowid]        
        self.rowid +=1
        if self.rowid == len(self.data)-1:
            done = True
        next_state = self.data.iloc[self.rowid]
        last_asset = self.asset
        price = next_state['open']
        old_asset = self.cash + self.stock*price
        self.asset = old_asset
        target_value = action*0.1*self.asset
        distance = target_value - self.stock*price
        stock_distance = int(distance/(price*(1+self.cost)))
        self.stock += stock_distance
        self.cash = self.cash - distance - np.abs(stock_distance*self.cost*price)
        self.asset = self.cash+self.stock*price
        market_value = self.stock * next_state['close']
        self.asset = market_value + self.cash
        reward = (self.asset - last_asset)/last_asset
        self.time = next_state['timestamp']
        # self.stock = stock
        return (self.data[self.rowid-time_period:self.rowid][indicators].values, reward, done)

In [144]:
#env = gym.make()
env = Stock_Env(1000000, stock_df_train, 0.002)
env_test = Stock_Env(1000000, stock_df_test, 0.002)
num_episode = 5
max_t = 10000
reward_log = []

for _ in range(num_episode):
    
    # initialize
    env.reset()
    t = 0
    episodic_reward = 0
    
    for t in range(max_t):
        
        #env.render()
        action = np.random.randint(11) # random action
        _, reward, done = env.step(action)
        episodic_reward += reward
        if done:
            break
    
    reward_log.append(episodic_reward)

In [155]:
agent = Agent(1, len(env.action_space), 64, 0.001, 0.001, 0.99, 51, -0.1, 0.1, 'cuda', True)

In [156]:
import warnings
warnings.filterwarnings('ignore')
#env = gym.make()
num_episode = 20000
max_t = 1000
reward_log = []
average_log = [] # monitor training process
eps = 1
eps_decay = 0.997
eps_min = 0.01
C = 4 # update weights every C steps

def validation(env, agent):
    rewards_log = []
    average_log = []
    episodic_reward = 0
    done = False
    t = 0
    state = env.reset()
    while not done and t < max_t:
        t += 1
        action = agent.act(state, eps)
        frame, reward, done = env.step(action)
        next_state = frame
        state = next_state.copy()
        episodic_reward += reward
    return env.asset

def train(env, agent, num_episode, eps_init, eps_decay, eps_min, max_t, num_frame=1, constant=0):
    rewards_log = []
    average_log = []
    eps = eps_init

    for i in range(1, 1 + num_episode):

        episodic_reward = 0
        done = False
        frame = env.reset()
        state_deque = deque(maxlen=num_frame)
        for _ in range(num_frame):
            state_deque.append(frame)
        state = np.stack(state_deque, axis=0)
        state = np.expand_dims(state, axis=0)
        t = 0

        while not done and t < max_t:

            t += 1
            action = agent.act(state, eps)
            frame, reward, done = env.step(action)
            state_deque.append(frame)
            next_state = np.stack(state_deque, axis=0)
            next_state = np.expand_dims(next_state, axis=0)
            agent.memory.append((state, action, reward, next_state, done))

            if t % 5 == 0 and len(agent.memory) >= agent.bs:
                agent.learn()
                agent.soft_update(agent.tau)

            state = next_state.copy()
            episodic_reward += reward
        
        val_asset = validation(env_test, agent)

        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))
        print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}, Asset {:.2f}, Validation Asset {:.2f}'.format(i, episodic_reward, average_log[-1], env.asset, val_asset), end='')
        if i % 100 == 0:
            print()

        eps = max(eps * eps_decay, eps_min)

    return rewards_log

In [157]:
train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

Episode 100, Reward -0.738, Average Reward -0.732, Asset 477627.32, Validation Asset 496455.72
Episode 200, Reward -0.657, Average Reward -0.698, Asset 518099.84, Validation Asset 489554.80
Episode 300, Reward -0.451, Average Reward -0.578, Asset 636289.79, Validation Asset 475047.40
Episode 400, Reward -0.335, Average Reward -0.430, Asset 714805.95, Validation Asset 492694.65
Episode 500, Reward -0.272, Average Reward -0.300, Asset 761073.75, Validation Asset 484755.40
Episode 600, Reward -0.173, Average Reward -0.197, Asset 840976.69, Validation Asset 485016.78
Episode 700, Reward -0.093, Average Reward -0.110, Asset 910809.53, Validation Asset 491231.16
Episode 800, Reward -0.025, Average Reward -0.036, Asset 975146.44, Validation Asset 481760.98
Episode 900, Reward 0.027, Average Reward 0.014, Asset 1026784.62, Validation Asset 495051.703
Episode 1000, Reward 0.060, Average Reward 0.051, Asset 1061728.28, Validation Asset 486620.53
Episode 1100, Reward 0.081, Average Reward 0.081, 

KeyboardInterrupt: 

In [None]:
# eps_init = eps
# constant = C
# num_frame =1

# rewards_log = []
# average_log = []
# eps = eps_init

# for i in range(1, 1 + num_episode):
#     episodic_reward = 0
#     done = False
#     frame = env.reset()
#     state_deque = deque(maxlen=num_frame)
#     for _ in range(num_frame):
#         state_deque.append(frame)
#     state = np.stack(state_deque, axis=0)
#     state = np.expand_dims(state, axis=0)
#     t = 0

#     while not done and t < max_t:

#         t += 1
#         action = agent.act(state, eps)
#         frame, reward, done = env.step(action)
#         state_deque.append(frame)
#         next_state = np.stack(state_deque, axis=0)
#         next_state = np.expand_dims(next_state, axis=0)
#         agent.memory.append((state, action, reward, next_state, done))

#         if t % 5 == 0 and len(agent.memory) >= agent.bs:
#             agent.learn()
#             agent.soft_update(agent.tau)

#         state = next_state.copy()
#         episodic_reward += reward

#     rewards_log.append(episodic_reward)
#     average_log.append(np.mean(rewards_log[-100:]))
#     print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}'.format(i, episodic_reward, average_log[-1]), end='')
#     if i % 100 == 0:
#         print()

#     eps = max(eps * eps_decay, eps_min)