# Algorithm Implement

In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F
time_period = 15
class Q_Network(nn.Module):
    '''
    The input of this network should have shape (num_frame, 80, 80)
    '''

    def __init__(self, num_frame, num_action):
        super(Q_Network, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=num_frame, out_channels=32, kernel_size=(2,1), stride=1, padding=2)  # 16, 20, 20
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv5 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,2), stride=1)  # 32, 9, 9
        self.pool = nn.AvgPool2d(kernel_size=(2,1))
        self.fc1 = nn.Linear(672, 256)
        self.fc2 = nn.Linear(256, num_action)
        self.sf = nn.Softmax()

    def forward(self, image):
        x = F.relu(self.pool(self.conv1(image)))
        x = F.relu(self.pool(self.conv2(x)))
        x = F.relu(self.pool(self.conv3(x)))
        x = x.view(-1, 672)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.sf(x)
        return x

In [29]:
from torchsummary import summary

# Data Loading

In [None]:
pwd

In [4]:
import pandas as pd
import pickle
file = open('../../FinBert/stock_data_full.bin', 'rb')
data = pickle.load(file)
file.close()

In [5]:
codes = ['AAPL','AMZN','C','GOOG','JPM','NFLX','PLTR']
for i in range(len(codes)):
    data[i]['symbol'] = codes[i]

In [6]:
df = pd.read_csv('../../FinRL/concat_data.csv')
df=df[['date', 'open', 'high', 'low', 'close', 'volume',
       'positive', 'neutral', 'negative', 'tic']]
df['date'] = [x[:10] for x in df['date']]
df = df[(df['date']>='2022-01-01') & (df['date']<'2023-09-30')]

In [7]:
data = pd.read_csv('../../min_data_adjust.csv')

In [8]:
stock_data = data[data['symbol']=='AAPL']

In [9]:
stock_data

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,AAPL,2022-01-03 09:00:00+00:00,176.23,176.23,176.1800,176.1800,1118.0,65.0,176.210000
1,AAPL,2022-01-03 09:02:00+00:00,176.30,176.31,176.2800,176.2800,1218.0,26.0,176.300000
2,AAPL,2022-01-03 09:03:00+00:00,176.25,176.27,176.2500,176.2700,814.0,30.0,176.260000
3,AAPL,2022-01-03 09:04:00+00:00,176.20,176.20,176.1200,176.1200,3744.0,114.0,176.180000
4,AAPL,2022-01-03 09:05:00+00:00,176.17,176.17,176.1700,176.1700,464.0,33.0,176.150000
...,...,...,...,...,...,...,...,...,...
343433,AAPL,2023-09-29 23:53:00+00:00,171.30,171.30,171.3000,171.3000,209.0,8.0,171.305766
343434,AAPL,2023-09-29 23:54:00+00:00,171.30,171.30,171.3000,171.3000,810.0,16.0,171.305889
343435,AAPL,2023-09-29 23:57:00+00:00,171.32,171.32,171.3200,171.3200,439.0,20.0,171.330957
343436,AAPL,2023-09-29 23:58:00+00:00,171.30,171.30,171.2699,171.2699,532.0,11.0,171.282998


In [44]:
stock_df = df[df['tic']=='AAPL']

In [45]:
stock_df

Unnamed: 0,date,open,high,low,close,volume,positive,neutral,negative,tic
17,2022-01-03,177.830002,182.880005,177.710007,182.009995,104487900,-2.525743,3.722111,-3.922445,AAPL
21,2022-01-04,182.630005,182.940002,179.119995,179.699997,99310400,-2.752612,3.370780,-3.351379,AAPL
26,2022-01-05,179.610001,180.169998,174.639999,174.919998,94537600,-2.561095,3.561730,-3.588621,AAPL
35,2022-01-06,172.699997,175.300003,171.639999,172.000000,96904000,-2.294448,3.207229,-3.612424,AAPL
42,2022-01-07,172.889999,174.139999,171.029999,172.169998,86709100,-2.325235,3.084295,-3.352122,AAPL
...,...,...,...,...,...,...,...,...,...,...
3028,2023-09-25,174.199997,176.970001,174.149994,176.080002,46172700,-2.361765,3.181928,-3.037302,AAPL
3034,2023-09-26,174.820007,175.199997,171.660004,171.960007,64588900,-1.893191,2.688069,-3.369864,AAPL
3045,2023-09-27,172.619995,173.039993,169.050003,170.429993,66921800,-3.139558,3.359877,-2.654129,AAPL
3049,2023-09-28,169.339996,172.029999,167.619995,170.690002,56294400,-2.045589,2.791628,-3.063628,AAPL


# Technical Indicators

In [46]:
from finta import TA

In [47]:
stock_df['SMA42'] = TA.SMA(stock_df, 42)
stock_df['SMA5'] = TA.SMA(stock_df, 5)
stock_df['SMA15'] = TA.SMA(stock_df, 15)
stock_df['AO'] = TA.AO(stock_df)
stock_df['OVB'] = TA.OBV(stock_df)
stock_df[['VW_MACD','MACD_SIGNAL']] = TA.VW_MACD(stock_df)
stock_df['RSI'] = TA.RSI(stock_df)
stock_df['CMO'] = TA.CMO(stock_df)

In [48]:
stock_df = stock_df.dropna()

In [49]:
stock_df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'positive', 'neutral',
       'negative', 'tic', 'SMA42', 'SMA5', 'SMA15', 'AO', 'OVB', 'VW_MACD',
       'MACD_SIGNAL', 'RSI', 'CMO'],
      dtype='object')

# Replay Buffer

In [50]:
from collections import deque

test = deque(maxlen=5)
for i in range(10):
    test.append(i)
    print(test)

deque([0], maxlen=5)
deque([0, 1], maxlen=5)
deque([0, 1, 2], maxlen=5)
deque([0, 1, 2, 3], maxlen=5)
deque([0, 1, 2, 3, 4], maxlen=5)
deque([1, 2, 3, 4, 5], maxlen=5)
deque([2, 3, 4, 5, 6], maxlen=5)
deque([3, 4, 5, 6, 7], maxlen=5)
deque([4, 5, 6, 7, 8], maxlen=5)
deque([5, 6, 7, 8, 9], maxlen=5)


In [59]:
tst = None
import random
from collections import deque
import torch
import torch.optim as optim
import numpy as np

# from networks import *

class Agent:

    def __init__(self, state_size, action_size, bs, lr, tau, gamma, device, visual=False):
        '''
        When dealing with visual inputs, state_size should work as num_of_frame
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.bs = bs
        self.lr = lr
        self.tau = tau
        self.gamma = gamma
        self.device = device
        self.Q_local = Q_Network(self.state_size, self.action_size).to(device)
        self.Q_target = Q_Network(self.state_size, self.action_size).to(device)
        self.soft_update(1)
        self.optimizer = optim.Adam(self.Q_local.parameters(), self.lr)
        self.memory = deque(maxlen=100000)
        self.tst = None

    def act(self, state, eps=0):
        if random.random() > eps:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            with torch.no_grad():
                action_values = self.Q_local(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        experiences = random.sample(self.memory, self.bs)
        states = torch.from_numpy(np.vstack([e[0] for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e[1] for e in experiences])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e[2] for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e[3] for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e[4] for e in experiences]).astype(np.uint8)).float().to(self.device)
        self.tst = states
        Q_values = self.Q_local(states)
        Q_values = torch.gather(input=Q_values, dim=-1, index=actions)
        with torch.no_grad():
            Q_targets = self.Q_target(next_states)
            Q_targets, _ = torch.max(input=Q_targets, dim=-1, keepdim=True)
            Q_targets = rewards + self.gamma * (1 - dones) * Q_targets

        loss = (Q_values - Q_targets).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_update(self, tau):
        for target_param, local_param in zip(self.Q_target.parameters(), self.Q_local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

In [60]:
indicators = ['open', 'high', 'low', 'close', 'volume', 'positive', 'neutral', 'negative','SMA42', 'SMA5', 'SMA15', 'AO', 'OVB','VW_MACD',
       'MACD_SIGNAL', 'RSI', 'CMO']

In [61]:
class Stock_Env:
    def __init__(self, initial_asset, data, cost):
        self.asset = initial_asset
        self.cash = initial_asset
        self.stock = 0
        self.data = data
        self.time = data.iloc[time_period]['date']
        self.cost = cost
        self.history=[]
        self.total_cost = 0
        self.initial_asset = initial_asset
        self.rowid = time_period
        self.action_space = np.array(list(range(11)))
    
    def reset(self):
        self.asset = self.initial_asset
        self.cash = self.initial_asset
        self.stock = 0
        self.time = self.data.iloc[100]['date']
        self.history=[]
        self.total_cost = 0    
        self.rowid = time_period
        return self.data[:time_period][indicators].values
    
    def step(self, action):
        done = False
        states = self.data.iloc[self.rowid]        
        self.rowid +=1
        if self.rowid == len(self.data)-1:
            done = True
        next_state = self.data.iloc[self.rowid]
        last_asset = self.asset
        price = next_state['open']
        old_asset = self.cash + self.stock*price
        self.asset = old_asset
        target_value = action*0.1*self.asset
        distance = target_value - self.stock*price
        stock_distance = int(distance/(price*(1+self.cost)))
        self.stock += stock_distance
        self.cash = self.cash - distance - np.abs(stock_distance*self.cost*price)
        self.asset = self.cash+self.stock*price
        market_value = self.stock * next_state['close']
        self.asset = market_value + self.cash
        reward = self.asset - last_asset
        self.time = next_state['date']
        # self.stock = stock
        return (self.data[self.rowid-time_period:self.rowid][indicators].values, reward, done)

In [62]:
#env = gym.make()
env = Stock_Env(1000000, stock_df, 0.002)
num_episode = 5
max_t = 1000
reward_log = []

for _ in range(num_episode):
    
    # initialize
    env.reset()
    t = 0
    episodic_reward = 0
    
    for t in range(max_t):
        
        #env.render()
        action = np.random.randint(11) # random action
        _, reward, done = env.step(action)
        episodic_reward += reward
        if done:
            break
    
    reward_log.append(episodic_reward)

In [63]:
agent = Agent(1, len(env.action_space), 64, 0.001, 0.001, 0.99, 'cuda',True)

In [68]:
import warnings
warnings.filterwarnings('ignore')
#env = gym.make()
num_episode = 20000
max_t = 1000
reward_log = []
average_log = [] # monitor training process
eps = 1
eps_decay = 0.997
eps_min = 0.01
C = 4 # update weights every C steps

def train(env, agent, num_episode, eps_init, eps_decay, eps_min, max_t, num_frame=1, constant=0):
    rewards_log = []
    average_log = []
    eps = eps_init

    for i in range(1, 1 + num_episode):

        episodic_reward = 0
        done = False
        frame = env.reset()
        state_deque = deque(maxlen=num_frame)
        for _ in range(num_frame):
            state_deque.append(frame)
        state = np.stack(state_deque, axis=0)
        state = np.expand_dims(state, axis=0)
        t = 0

        while not done and t < max_t:

            t += 1
            action = agent.act(state, eps)
            frame, reward, done = env.step(action)
            state_deque.append(frame)
            next_state = np.stack(state_deque, axis=0)
            next_state = np.expand_dims(next_state, axis=0)
            agent.memory.append((state, action, reward, next_state, done))

            if t % 5 == 0 and len(agent.memory) >= agent.bs:
                agent.learn()
                agent.soft_update(agent.tau)

            state = next_state.copy()
            episodic_reward += reward

        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))
        print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}'.format(i, episodic_reward, average_log[-1]), end='')
        if i % 100 == 0:
            print()

        eps = max(eps * eps_decay, eps_min)

    return rewards_log

In [None]:
train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

Episode 100, Reward -158293.017, Average Reward -214143.315
Episode 200, Reward -219081.606, Average Reward -194950.177
Episode 300, Reward -171535.242, Average Reward -153084.563
Episode 400, Reward -7701.033, Average Reward -122730.15938
Episode 500, Reward -63621.570, Average Reward -100781.8433
Episode 600, Reward -43233.783, Average Reward -72651.46751
Episode 700, Reward -31046.109, Average Reward -57316.6849
Episode 800, Reward 56621.490, Average Reward -42087.85283
Episode 900, Reward -79004.737, Average Reward -31984.3725
Episode 1000, Reward -22175.508, Average Reward -21836.425
Episode 1100, Reward 20945.377, Average Reward -17013.1324
Episode 1200, Reward -20187.514, Average Reward -12304.824
Episode 1300, Reward -5267.051, Average Reward -11603.9994
Episode 1400, Reward -6705.901, Average Reward -6337.27919
Episode 1500, Reward 1859.653, Average Reward -4635.49832
Episode 1600, Reward 3272.370, Average Reward -2534.43855
Episode 1700, Reward 22179.980, Average Reward -1342

In [None]:
# eps_init = eps
# constant = C
# num_frame =1

# rewards_log = []
# average_log = []
# eps = eps_init

# for i in range(1, 1 + num_episode):
#     episodic_reward = 0
#     done = False
#     frame = env.reset()
#     state_deque = deque(maxlen=num_frame)
#     for _ in range(num_frame):
#         state_deque.append(frame)
#     state = np.stack(state_deque, axis=0)
#     state = np.expand_dims(state, axis=0)
#     t = 0

#     while not done and t < max_t:

#         t += 1
#         action = agent.act(state, eps)
#         frame, reward, done = env.step(action)
#         state_deque.append(frame)
#         next_state = np.stack(state_deque, axis=0)
#         next_state = np.expand_dims(next_state, axis=0)
#         agent.memory.append((state, action, reward, next_state, done))

#         if t % 5 == 0 and len(agent.memory) >= agent.bs:
#             agent.learn()
#             agent.soft_update(agent.tau)

#         state = next_state.copy()
#         episodic_reward += reward

#     rewards_log.append(episodic_reward)
#     average_log.append(np.mean(rewards_log[-100:]))
#     print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}'.format(i, episodic_reward, average_log[-1]), end='')
#     if i % 100 == 0:
#         print()

#     eps = max(eps * eps_decay, eps_min)