# Algorithm Implement

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
time_period = 15
class Actor_Critic(nn.Module):

    def __init__(self, state_size, action_size, hidden=[128, 16]):
        super(Actor_Critic, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=state_size, out_channels=32, kernel_size=(2,1), stride=1, padding=2)  # 16, 20, 20
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv5 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,2), stride=1)  # 32, 9, 9
        self.pool = nn.AvgPool2d(kernel_size=(2,1))
        self.fc1 = nn.Linear(576, 256)
        self.fc2 = nn.Linear(256, action_size)
        self.fc3 = nn.Linear(256,1)

    def forward(self, state):
        x = F.relu(self.pool(self.conv1(state)))
        x = F.relu(self.pool(self.conv2(x)))
        x = F.relu(self.pool(self.conv3(x)))
        x = x.view(-1, 576)
        x = F.relu(self.fc1(x))
        log_probs = F.log_softmax(self.fc2(x), dim=1)
        values = self.fc3(x)
        return log_probs, values

In [None]:
from torchsummary import summary

# Data Loading

In [None]:
pwd

'E:\\lab\\CP\\Reinforcement\\Basic Actor-Critic'

In [2]:
import pandas as pd
import pickle
file = open('../../FinBert/stock_data_full.bin', 'rb')
data = pickle.load(file)
file.close()

In [3]:
codes = ['AAPL','AMZN','C','GOOG','JPM','NFLX','PLTR']
for i in range(len(codes)):
    data[i]['symbol'] = codes[i]

In [4]:
df = pd.read_csv('../../FinRL/concat_data.csv')
df=df[['date', 'open', 'high', 'low', 'close', 'volume',
       'positive', 'neutral', 'negative', 'tic']]
df['date'] = [x[:10] for x in df['date']]
df = df[(df['date']>='2022-01-01') & (df['date']<'2023-09-30')]

In [5]:
data = pd.read_csv('../../min_data_adjust.csv')

In [6]:
stock_data = data[data['symbol']=='AAPL']

In [7]:
stock_data

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,AAPL,2022-01-03 09:00:00+00:00,176.23,176.23,176.1800,176.1800,1118.0,65.0,176.210000
1,AAPL,2022-01-03 09:02:00+00:00,176.30,176.31,176.2800,176.2800,1218.0,26.0,176.300000
2,AAPL,2022-01-03 09:03:00+00:00,176.25,176.27,176.2500,176.2700,814.0,30.0,176.260000
3,AAPL,2022-01-03 09:04:00+00:00,176.20,176.20,176.1200,176.1200,3744.0,114.0,176.180000
4,AAPL,2022-01-03 09:05:00+00:00,176.17,176.17,176.1700,176.1700,464.0,33.0,176.150000
...,...,...,...,...,...,...,...,...,...
343433,AAPL,2023-09-29 23:53:00+00:00,171.30,171.30,171.3000,171.3000,209.0,8.0,171.305766
343434,AAPL,2023-09-29 23:54:00+00:00,171.30,171.30,171.3000,171.3000,810.0,16.0,171.305889
343435,AAPL,2023-09-29 23:57:00+00:00,171.32,171.32,171.3200,171.3200,439.0,20.0,171.330957
343436,AAPL,2023-09-29 23:58:00+00:00,171.30,171.30,171.2699,171.2699,532.0,11.0,171.282998


In [8]:
stock_df = df[df['tic']=='AAPL']

In [9]:
stock_df

Unnamed: 0,date,open,high,low,close,volume,positive,neutral,negative,tic
17,2022-01-03,177.830002,182.880005,177.710007,182.009995,104487900,-2.525743,3.722111,-3.922445,AAPL
21,2022-01-04,182.630005,182.940002,179.119995,179.699997,99310400,-2.752612,3.370780,-3.351379,AAPL
26,2022-01-05,179.610001,180.169998,174.639999,174.919998,94537600,-2.561095,3.561730,-3.588621,AAPL
35,2022-01-06,172.699997,175.300003,171.639999,172.000000,96904000,-2.294448,3.207229,-3.612424,AAPL
42,2022-01-07,172.889999,174.139999,171.029999,172.169998,86709100,-2.325235,3.084295,-3.352122,AAPL
...,...,...,...,...,...,...,...,...,...,...
3028,2023-09-25,174.199997,176.970001,174.149994,176.080002,46172700,-2.361765,3.181928,-3.037302,AAPL
3034,2023-09-26,174.820007,175.199997,171.660004,171.960007,64588900,-1.893191,2.688069,-3.369864,AAPL
3045,2023-09-27,172.619995,173.039993,169.050003,170.429993,66921800,-3.139558,3.359877,-2.654129,AAPL
3049,2023-09-28,169.339996,172.029999,167.619995,170.690002,56294400,-2.045589,2.791628,-3.063628,AAPL


In [10]:
stock_df['pctchange'] = (stock_df['close'] - stock_df['open'])/stock_df['open']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_df['pctchange'] = (stock_df['close'] - stock_df['open'])/stock_df['open']


# Technical Indicators

In [11]:
from finta import TA

In [12]:
stock_df['SMA42'] = TA.SMA(stock_df, 42)
stock_df['SMA5'] = TA.SMA(stock_df, 5)
stock_df['SMA15'] = TA.SMA(stock_df, 15)
stock_df['AO'] = TA.AO(stock_df)
stock_df['OVB'] = TA.OBV(stock_df)
stock_df[['VW_MACD','MACD_SIGNAL']] = TA.VW_MACD(stock_df)
stock_df['RSI'] = TA.RSI(stock_df)
stock_df['CMO'] = TA.CMO(stock_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_df['SMA42'] = TA.SMA(stock_df, 42)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_df['SMA5'] = TA.SMA(stock_df, 5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_df['SMA15'] = TA.SMA(stock_df, 15)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[ro

In [13]:
stock_df = stock_df.dropna()

In [14]:
stock_df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'positive', 'neutral',
       'negative', 'tic', 'pctchange', 'SMA42', 'SMA5', 'SMA15', 'AO', 'OVB',
       'VW_MACD', 'MACD_SIGNAL', 'RSI', 'CMO'],
      dtype='object')

In [15]:
stock_df_train = stock_df[stock_df['date']<='2023-03-31']
stock_df_test = stock_df[stock_df['date']>'2023-03-31']

# RL

In [17]:
import torch
import torch.optim as optim
import numpy as np

import math

class Agent:
    
    def __init__(self, state_size, action_size, lr, gamma, device, mode='MC', use_critic=False, normalize=False):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr
        self.gamma = gamma
        self.device = device
        self.mode = mode
        self.use_critic = use_critic
        self.normalize = normalize
        

        self.Actor_Critic = Actor_Critic(self.state_size, self.action_size).to(self.device)
        self.optimizer = optim.Adam(self.Actor_Critic.parameters(), lr)
            
    def act(self, states):
        with torch.no_grad():
            states = torch.tensor(states.astype(np.float32)).to(self.device)

            log_probs, _ = self.Actor_Critic(states)
            probs = log_probs.exp().view(-1).cpu().numpy()
            action = np.random.choice(a=self.action_size, size=1, replace=False, p=probs)[0]
        return action
    
    def process_data(self, states, actions, rewards, dones, batch_size):
        
        states = torch.tensor(states, dtype=torch.float).to(self.device)
        states = states.reshape([states.shape[0]]+list(states.shape[2:]))
        actions = torch.tensor(actions, dtype=torch.long).to(self.device).view(-1, 1)
        dones = torch.tensor(dones, dtype=torch.float).to(self.device).view(-1,1)
        #calculate log probabilities and state values
        N = states.size(0) # N-1 is the length of actions, rewards and dones
        log_probs = torch.zeros((N, self.action_size)).to(self.device)
        state_values = torch.zeros((N, 1)).to(self.device)
        step = math.ceil(N/batch_size)
        
        for ind in range(step):
            output1, output2 = self.Actor_Critic(states[ind*batch_size:(ind+1)*batch_size, :])

            log_probs[ind*batch_size:(ind+1)*batch_size, :] = output1
            state_values[ind*batch_size:(ind+1)*batch_size, :] = output2 
        
        log_probs = log_probs[:-1, :]# remove the last one, which corresponds to no actions
        log_probs = torch.gather(log_probs, dim=1, index=actions)
        
        #calculate discounted rewards, gamma^t r_t
        L = len(rewards)
        rewards = np.array(rewards) #r_t
        discounts = self.gamma ** np.arange(L)
        discounted_rewards = rewards * discounts # this is gamma^t r_t
        
        return state_values, log_probs, rewards, discounted_rewards, dones
    
    def learn(self, state_values, log_probs, rewards, discounted_rewards, dones):

        # Update Critic use MSE
        # Update Actor by maximizing A_t * log(a_t|s_t)

        L = len(discounted_rewards)
        with torch.no_grad():
            G = []
            return_value = 0
            if self.mode == 'MC':
                for i in range(L-1, -1, -1):
                    return_value = rewards[i] + self.gamma * (1-dones[i].cpu().detach().numpy()) * return_value
                    G.append(return_value)
                G = G[::-1]
                G = torch.tensor(G, dtype=torch.float).view(-1, 1).to(self.device)
            else:
                rewards = torch.tensor(rewards, dtype=torch.float).view(-1, 1).to(self.device)
                G = rewards + self.gamma * (1-dones) * state_values[1:, :]
            
        Critic_Loss = 0.5*(state_values[:-1, :] - G).pow(2).mean()
        
        with torch.no_grad():
            if self.use_critic:
                G = G - state_values[:-1, :] # advantage
            if self.normalize:
                G = (G - G.mean()) / (G.std() + 0.00001) # normalized advantage
                
        Actor_Loss = -log_probs * G
        Actor_Loss = Actor_Loss.mean()
        

        Loss = Actor_Loss + Critic_Loss
        self.optimizer.zero_grad()
        Loss.backward()
        self.optimizer.step()


In [18]:
# indicators = ['open', 'high', 'low', 'close', 'volume', 'positive', 'neutral', 'negative','SMA42', 'SMA5', 'SMA15', 'AO', 'OVB','VW_MACD',
#        'MACD_SIGNAL', 'RSI', 'CMO']

indicators = ['pctchange', 'volume', 'positive', 'neutral', 'negative','SMA42', 'SMA5', 'SMA15', 'AO', 'OVB','VW_MACD',
       'MACD_SIGNAL', 'RSI', 'CMO']

In [63]:
class Stock_Env:
    def __init__(self, initial_asset, data, cost):
        self.asset = initial_asset
        self.cash = initial_asset
        self.stock = 0
        self.data = data
        self.time = data.iloc[time_period]['date']
        self.cost = cost
        self.history=[]
        self.total_cost = 0
        self.initial_asset = initial_asset
        self.rowid = time_period
        self.action_space = np.array(list(range(11)))
    
    def reset(self):
        self.asset = self.initial_asset
        self.cash = self.initial_asset
        self.stock = 0
        self.time = self.data.iloc[100]['date']
        self.history=[]
        self.total_cost = 0    
        temp_time = np.random.randint(15,100)
        self.rowid = temp_time
        return self.data[temp_time-time_period:temp_time][indicators].values
    
    def step(self, action):
        done = False
        states = self.data.iloc[self.rowid]        
        self.rowid +=1
        if self.rowid == len(self.data)-1:
            done = True
        next_state = self.data.iloc[self.rowid]
        last_asset = self.asset
        price = next_state['open']
        old_asset = self.cash + self.stock*price
        self.asset = old_asset
        target_value = action*0.1*self.asset
        distance = target_value - self.stock*price
        stock_distance = int(distance/(price*(1+self.cost)))
        self.stock += stock_distance
        self.cash = self.cash - distance - np.abs(stock_distance*self.cost*price)
        self.asset = self.cash+self.stock*price
        market_value = self.stock * next_state['close']
        self.asset = market_value + self.cash
        reward = (self.asset - last_asset)/last_asset
        self.time = next_state['date']
        # self.stock = stock
        return (self.data[self.rowid-time_period:self.rowid][indicators].values, reward, done)

In [64]:
#env = gym.make()
env = Stock_Env(1000000, stock_df_train, 0.002)
env_test = Stock_Env(1000000, stock_df_test, 0.002)
num_episode = 5
max_t = 1000
reward_log = []

for _ in range(num_episode):
    
    # initialize
    env.reset()
    t = 0
    episodic_reward = 0
    
    for t in range(max_t):
        
        #env.render()
        action = np.random.randint(11) # random action
        _, reward, done = env.step(action)
        episodic_reward += reward
        if done:
            break
    
    reward_log.append(episodic_reward)

In [68]:
agent = Agent(1, len(env.action_space), 0.0005, 0.99, 'cuda', 'TD', True, False)

In [69]:
import warnings
warnings.filterwarnings('ignore')
#env = gym.make()
num_episode = 20000
max_t = 1000
reward_log = []
average_log = [] # monitor training process
eps = 1
eps_decay = 0.997
eps_min = 0.01
num_frame = 1
C = 4 # update weights every C steps
state_deque = deque(maxlen=num_frame)

def validation(env, agent):
    rewards_log = []
    average_log = []
    episodic_reward = 0
    done = False
    t = 0
    state = env.reset()
    while not done and t < max_t:
        t += 1
        action = agent.act(state.reshape([1]+list(state.shape)))
        frame, reward, done = env.step(action)
        next_state = frame
        state = next_state.copy()
        episodic_reward += reward
    return env.asset

def train(agent, env, n_episode, max_t, scale=1):
    rewards_log = []
    average_log = []

    for i in range(1, 1 + num_episode):

        episodic_reward = 0
        done = False
        frame = env.reset()
        # state_deque = deque(maxlen=num_frame)
        for _ in range(num_frame):
            state_deque.append(frame)
        state = np.stack(state_deque, axis=0)
        state = np.expand_dims(state, axis=0)
        
        done = False
        t = 0
        state_history = [list(state)]
        action_history = []
        done_history = []
        reward_history = []
        episodic_reward = 0

        while not done and t < max_t:

            
            frame, reward, done = env.step(action)
            state_deque.append(frame)
            next_state = np.stack(state_deque, axis=0)
            next_state = np.expand_dims(next_state, axis=0)
           
            episodic_reward += reward
            action_history.append(action)
            done_history.append(done)
            reward_history.append(reward * scale)
            state = next_state.copy()
            state_history.append(state)
            t += 1

        state_values, log_probs, rewards, discounted_rewards, dones = agent.process_data(state_history, action_history, reward_history, done_history, 64)
        agent.learn(state_values, log_probs, rewards, discounted_rewards, dones)
        
        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))
        
        val_asset = validation(env_test, agent)

        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))
        print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}, Asset {:.2f}, Validation Asset {:.2f}'.format(i, episodic_reward, average_log[-1], env.asset, val_asset), end='')
        if i % 100 == 0:
            print()
            
    return rewards, average_log

In [70]:
rewards_log, _ = train(agent, env, 2000, max_t, 10)

Episode 100, Reward 0.033, Average Reward 0.025, Asset 1032096.49, Validation Asset 918107.65
Episode 200, Reward 0.040, Average Reward 0.020, Asset 1038519.14, Validation Asset 1004900.58
Episode 300, Reward 0.035, Average Reward 0.024, Asset 1033699.81, Validation Asset 895081.850
Episode 400, Reward 0.022, Average Reward 0.023, Asset 1020592.28, Validation Asset 881570.747
Episode 500, Reward 0.043, Average Reward 0.024, Asset 1041622.11, Validation Asset 1029258.18
Episode 600, Reward 0.025, Average Reward 0.022, Asset 1023800.26, Validation Asset 990699.000
Episode 700, Reward 0.001, Average Reward 0.022, Asset 998219.82, Validation Asset 993288.3942
Episode 800, Reward 0.037, Average Reward 0.021, Asset 1036133.66, Validation Asset 1008594.27
Episode 900, Reward 0.036, Average Reward 0.026, Asset 1034510.83, Validation Asset 978153.640
Episode 1000, Reward 0.043, Average Reward 0.029, Asset 1041622.11, Validation Asset 998751.80
Episode 1100, Reward 0.016, Average Reward 0.024, A

In [None]:
# eps_init = eps
# constant = C
# num_frame =1

# rewards_log = []
# average_log = []
# eps = eps_init

# for i in range(1, 1 + num_episode):
#     episodic_reward = 0
#     done = False
#     frame = env.reset()
#     state_deque = deque(maxlen=num_frame)
#     for _ in range(num_frame):
#         state_deque.append(frame)
#     state = np.stack(state_deque, axis=0)
#     state = np.expand_dims(state, axis=0)
#     t = 0

#     while not done and t < max_t:

#         t += 1
#         action = agent.act(state, eps)
#         frame, reward, done = env.step(action)
#         state_deque.append(frame)
#         next_state = np.stack(state_deque, axis=0)
#         next_state = np.expand_dims(next_state, axis=0)
#         agent.memory.append((state, action, reward, next_state, done))

#         if t % 5 == 0 and len(agent.memory) >= agent.bs:
#             agent.learn()
#             agent.soft_update(agent.tau)

#         state = next_state.copy()
#         episodic_reward += reward

#     rewards_log.append(episodic_reward)
#     average_log.append(np.mean(rewards_log[-100:]))
#     print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}'.format(i, episodic_reward, average_log[-1]), end='')
#     if i % 100 == 0:
#         print()

#     eps = max(eps * eps_decay, eps_min)