# Algorithm Implement

In [117]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import quantstats as qs
time_period = 2
# class Q_Network(nn.Module):
#     '''
#     The input of this network should have shape (num_frame, 80, 80)
#     '''

#     def __init__(self, num_frame, num_action):
#         super(Q_Network, self).__init__()
#         self.conv1 = nn.Conv2d(in_channels=num_frame, out_channels=32, kernel_size=(2,1), stride=1, padding=2)  # 16, 20, 20
#         self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
#         self.conv3 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,1), stride=1)  # 32, 9, 9
#         self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
#         self.conv5 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,2), stride=1)  # 32, 9, 9
#         self.pool = nn.AvgPool2d(kernel_size=(2,1))
#         self.fc1 = nn.Linear(544, 256)
#         self.fc2 = nn.Linear(256, num_action)
#         self.fc3 = nn.Linear(256, 1)

#     def forward(self, image):
#         x = F.relu(self.pool(self.conv1(image)))
#         x = F.relu(self.pool(self.conv2(x)))
#         x = F.relu(self.pool(self.conv3(x)))
#         x = x.view(-1, 544)
#         x = F.relu(self.fc1(x))
#         x1 = self.fc2(x)
#         x1 = x1 - torch.max(x1, dim=1, keepdim=True)[0]
#         x2 = self.fc3(x)
#         return x1 + x2

class Q_Network(nn.Module):

    def __init__(self, state_size, action_size, hidden=[64, 64], duel=False):
        super(Q_Network, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden[0])
        self.fc2 = nn.Linear(hidden[0], hidden[1])
        self.fc3 = nn.Linear(hidden[1], action_size)
        self.duel = duel
        if self.duel:
            self.fc4 = nn.Linear(hidden[1], 1)

    def forward(self, state):
        x = state
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        if self.duel:
            x1 = self.fc3(x)
            x1 = x1 - torch.max(x1, dim=1, keepdim=True)[0] # set the max to be 0
            x2 = self.fc4(x)
            # print(x1.shape, x2.shape)
            return x1 + x2
        else:
            x = self.fc3(x)
            # print(x.shape)
            return x

In [49]:
from torchsummary import summary

# Data Loading

In [50]:
import pandas as pd
import numpy as np
data = pd.read_csv('../../test_data3.csv')
codes = data['symbol'].unique()
stock_df = data
stock_df = stock_df[['Date','symbol','Open','High','Low','Close','Volume','Dividends','Stock Splits','Pctchange', 'Neg','Neu','Pos']]
stock_df.columns = ['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'dividends','stock splits', 'pctchange','Neg', 'Neu', 'Pos']
stock_df['pctchange'] = (stock_df['close'] - stock_df['open'])/stock_df['open']

# Technical Indicators

In [51]:
from finta import TA

In [52]:
stock_df['SMA42'] = TA.SMA(stock_df, 42)
stock_df['SMA5'] = TA.SMA(stock_df, 5)
stock_df['SMA15'] = TA.SMA(stock_df, 15)
stock_df['AO'] = TA.AO(stock_df)
stock_df['OVB'] = TA.OBV(stock_df)

In [53]:
stock_df = stock_df.dropna()

In [54]:
stock_df_train = stock_df[stock_df['date']<='2019-01-01'].groupby(['date','symbol']).agg('mean')
# stock_df_train = stock_df_train[stock_df_train['date']>='2023-01-01']
stock_df_test = stock_df[stock_df['date']>'2019-01-01']
stock_df_test = stock_df_test[stock_df_test['date']<='2019-12-31'].groupby(['date','symbol']).agg('mean')

train_date = sorted([x[0] for x in stock_df_train.index])
test_date = sorted([x[0] for x in stock_df_test.index])

# indicators = ['open', 'high', 'low', 'close', 'volume', 'positive', 'neutral', 'negative','SMA42', 'SMA5', 'SMA15', 'AO', 'OVB','VW_MACD',
#        'MACD_SIGNAL', 'RSI', 'CMO']

indicators = ['Neg','Neu','Pos']

# indicators = ['pctchange', 'volume', 'positive', 'neutral', 'negative']
# indicators = ['positive', 'neutral', 'negative']
# indicators = ['sentiment']

from tqdm import tqdm
def get_full_data(x, date):
        full_df = pd.DataFrame(0, index = codes, columns = x.columns)
        full_df.loc[set(full_df.index).intersection(set(x.index))] = x.loc[set(full_df.index).intersection(set(x.index))]
        v = full_df.values.reshape(1,-1)
        # full_df['date']=date
        return [date]+list(v[0])
    
dates = np.unique([x[0] for x in stock_df_train.index])
res = []
for date in tqdm(dates):
    x = stock_df_train[indicators].loc[date]
    res.append(get_full_data(x, date))

# res = pd.concat(res).reset_index()
# res.columns = ['tic', 'open', 'high', 'low', 'close', 'volume', 'positive',
#        'neutral', 'negative', 'pctchange', 'date']
stock_df_train_ = pd.DataFrame(res).set_index(0)

dates = np.unique([x[0] for x in stock_df_test.index])
res = []
for date in tqdm(dates):
    x = stock_df_test[indicators].loc[date]
    res.append(get_full_data(x, date))
    
# res = pd.concat(res).reset_index()
# res.columns = ['tic', 'open', 'high', 'low', 'close', 'volume', 'positive',
#        'neutral', 'negative', 'pctchange', 'date']
stock_df_test_ = pd.DataFrame(res).set_index(0)


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

stock_df_train_1 = scaler.fit_transform(stock_df_train_)
stock_df_test_1 = scaler.transform(stock_df_test_)

stock_df_train_ = pd.DataFrame(stock_df_train_1, index = stock_df_train_.index, columns = stock_df_train_.columns)
stock_df_test_ = pd.DataFrame(stock_df_test_1, index = stock_df_test_.index, columns = stock_df_test_.columns)

100%|███████████████████████████████████████████████████████████████████████████████| 502/502 [00:00<00:00, 762.00it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 780.13it/s]


# Replay Buffer

In [109]:
class SumTree:
    
    def __init__(self, capacity):
        
        self.capacity = capacity
        # the first capacity-1 positions are not leaves
        self.vals = [0 for _ in range(2*capacity - 1)] # think about why if you are not familiar with this
        
    def retrive(self, num):
        '''
        This function find the first index whose cumsum is no smaller than num
        '''
        ind = 0 # search from root
        while ind < self.capacity-1: # not a leaf
            left = 2*ind + 1
            right = left + 1
            if num > self.vals[left]: # the sum of the whole left tree is not large enouth
                num -= self.vals[left] # think about why?
                ind = right
            else: # search in the left tree
                ind = left
        return ind - self.capacity + 1
    
    def update(self, delta, ind):
        '''
        Change the value at ind by delta, and update the tree
        Notice that this ind should be the index in real memory part, instead of the ind in self.vals
        '''
        ind += self.capacity - 1
        while True:
            self.vals[ind] += delta
            if ind == 0:
                break
            ind -= 1
            ind //= 2

In [110]:
# from collections import deque

# test = deque(maxlen=5)
# for i in range(10):
#     test.append(i)
#     print(test)

import numpy as np
import random
import bisect
import torch

ALPHA = 0.5
EPSILON = 0.05
TD_INIT = 1

class Replay_Buffer:
    '''
    Vanilla replay buffer
    '''
    
    def __init__(self, capacity=int(1e6), batch_size=None):
        
        self.capacity = capacity
        self.memory = [None for _ in range(capacity)] # save tuples (state, action, reward, next_state, done)
        self.ind_max = 0 # how many transitions have been stored
        
    def remember(self, state, action, reward, next_state, done):
        
        ind = self.ind_max % self.capacity
        self.memory[ind] = (state, action, reward, next_state, done)
        self.ind_max += 1
        
    def sample(self, k):
        '''
        return sampled transitions. Make sure that there are at least k transitions stored before calling this method 
        '''
        index_set = random.sample(list(range(len(self))), k)
        states = torch.from_numpy(np.vstack([self.memory[ind][0] for ind in index_set])).float()
        actions = torch.from_numpy(np.vstack([self.memory[ind][1] for ind in index_set])).long()
        rewards = torch.from_numpy(np.vstack([self.memory[ind][2] for ind in index_set])).float()
        next_states = torch.from_numpy(np.vstack([self.memory[ind][3] for ind in index_set])).float()
        dones = torch.from_numpy(np.vstack([self.memory[ind][4] for ind in index_set]).astype(np.uint8)).float()
        
        return states, actions, rewards, next_states, dones
    
    def __len__(self):
        return min(self.ind_max, self.capacity)
        
class Rank_Replay_Buffer:
    '''
    Rank-based replay buffer
    '''
    
    def __init__(self, capacity=int(1e6), batch_size=64):
        self.capacity = capacity
        self.batch_size = batch_size
        self.alpha = ALPHA
        self.memory = [None for _ in range(capacity)]
        self.segments = [-1] + [None for _ in range(batch_size)] # the ith index will be in [segments[i-1]+1, segments[i]]
        
        self.errors = [] # saves (-TD_error, index of transition), sorted
        self.memory_to_rank = [None for _ in range(capacity)]
        
        self.ind_max = 0 # how many transitions have been stored
        self.total_weights = 0 # sum of p_i
        self.cumulated_weights = []
        
    def remember(self, state, action, reward, next_state, done):
        index = self.ind_max % self.capacity
        if self.ind_max >= self.capacity: # memory is full, need to pop
            self.pop(index)
        else: # memory is not full, need to adjust weights and find segment points
            self.total_weights += (1/(1+self.ind_max))**self.alpha # memory is not full, calculate new weights
            self.cumulated_weights.append(self.total_weights)
            self.update_segments()
        
        max_error = -self.errors[0][0] if self.errors else 0
        self.insert(max_error, index)
        self.memory[index] = (state, action, reward, next_state, done)
        self.ind_max += 1
        
    def sample(self, batch_size=None): # notive that batch_size is not used. It's just to unify the calling form
        index_set = [random.randint(self.segments[i]+1, self.segments[i+1]) for i in range(self.batch_size)]
        probs = torch.from_numpy(np.vstack([(1/(1+ind))**self.alpha/self.total_weights for ind in index_set])).float()
        
        index_set = [self.errors[ind][1] for ind in index_set]
        states = torch.from_numpy(np.vstack([self.memory[ind][0] for ind in index_set])).float()
        actions = torch.from_numpy(np.vstack([self.memory[ind][1] for ind in index_set])).long()
        rewards = torch.from_numpy(np.vstack([self.memory[ind][2] for ind in index_set])).float()
        next_states = torch.from_numpy(np.vstack([self.memory[ind][3] for ind in index_set])).float()
        dones = torch.from_numpy(np.vstack([self.memory[ind][4] for ind in index_set]).astype(np.uint8)).float()
        for ind in index_set:
            self.pop(ind)
        
        return index_set, states, actions, rewards, next_states, dones, probs
    
    def insert(self, error, index):
        '''
        Input : 
            error : the TD-error of this transition
            index : the location of this transition
        insert error into self.errors, update self.memory_to_rank and self.rank_to_memory accordingly
        '''
        ind = bisect.bisect(self.errors, (-error, index))
        self.memory_to_rank[index] = ind
        self.errors.insert(ind, (-error, index))
        for i in range(ind+1, len(self.errors)):
            self.memory_to_rank[self.errors[i][1]] += 1
        
    def pop(self, index):
        '''
        Input :
            index : the location of a transition
        remove this transition, update self.memory_to_rank and self.rank_to_memory accordingly
        '''
        ind = self.memory_to_rank[index]
        self.memory_to_rank[index] = None
        self.errors.pop(ind)
        for i in range(ind, len(self.errors)):
            self.memory_to_rank[self.errors[i][1]] -= 1
        
    def update_segments(self):
        '''
        Update the segment points.
        '''
        if self.ind_max+1 < self.batch_size: # if there is no enough transitions
            return None
        for i in range(self.batch_size):
            ind = bisect.bisect_left(self.cumulated_weights, self.total_weights*((i+1)/self.batch_size))
            self.segments[i+1] = max(ind, self.segments[i]+1)
            
    def __len__(self):
        return min(self.capacity, self.ind_max)
    

class Proportion_Replay_Buffer:
    '''
    Proportion-based replay buffer
    '''
    
    def __init__(self, capacity=int(1e6), batch_size=None):
        self.capacity = capacity
        self.alpha = ALPHA
        self.memory = [None for _ in range(capacity)]
        self.weights = SumTree(self.capacity)
        self.default = TD_INIT
        self.ind_max = 0
        
    def remember(self, state, action, reward, next_state, done):
        index = self.ind_max % self.capacity
        self.memory[index] = (state, action, reward, next_state, done)
        delta = self.default+EPSILON - self.weights.vals[index+self.capacity-1]
        self.weights.update(delta, index)
        self.ind_max += 1
        
    def sample(self, batch_size):
        index_set = [self.weights.retrive(self.weights.vals[0]*random.random()) for _ in range(batch_size)]
        #print(index_set)
        probs = torch.from_numpy(np.vstack([self.weights.vals[ind+self.capacity-1]/self.weights.vals[0] for ind in index_set])).float()                     
        
        states = torch.from_numpy(np.vstack([self.memory[ind][0] for ind in index_set])).float()
        actions = torch.from_numpy(np.vstack([self.memory[ind][1] for ind in index_set])).long()
        rewards = torch.from_numpy(np.vstack([self.memory[ind][2] for ind in index_set])).float()
        next_states = torch.from_numpy(np.vstack([self.memory[ind][3] for ind in index_set])).float()
        dones = torch.from_numpy(np.vstack([self.memory[ind][4] for ind in index_set]).astype(np.uint8)).float()

        return index_set, states, actions, rewards, next_states, dones, probs
                                 
    def insert(self, error, index):
        delta = error+EPSILON - self.weights.vals[index+self.capacity-1]
        self.weights.update(delta, index)
            
    def __len__(self):
        return min(self.capacity, self.ind_max)

In [111]:
tst = None
import random
from collections import deque
import torch
import torch.optim as optim
import numpy as np

# from networks import *

class Agent:

    def __init__(self, state_size, action_size, bs, lr, tau, gamma, device, visual=False):
        '''
        When dealing with visual inputs, state_size should work as num_of_frame
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.bs = bs
        self.lr = lr
        self.tau = tau
        self.gamma = gamma
        self.device = device
        self.Q_local = Q_Network(self.state_size, self.action_size,duel=False).to(device)
        self.Q_target = Q_Network(self.state_size, self.action_size,duel=False).to(device)
        self.soft_update(1)
        self.optimizer = optim.Adam(self.Q_local.parameters(), self.lr)
        self.memory = Proportion_Replay_Buffer(int(1e5), bs)
        self.tst = None

    def act(self, state, eps=0):
        if random.random() > eps:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            with torch.no_grad():
                action_values = self.Q_local(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        index_set, states, actions, rewards, next_states, dones, probs = self.memory.sample(self.bs)
        w = 1/len(self.memory)/probs
        w = w/torch.max(w)
        w = w.to(self.device)
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)
        # print(states.shape)
        Q_values = self.Q_local(states)
        # print(actions.shape)
        Q_values = torch.gather(input=Q_values, dim=-1, index=actions)
        with torch.no_grad():
            Q_targets = self.Q_target(next_states)
            Q_targets, _ = torch.max(input=Q_targets, dim=-1, keepdim=True)
            Q_targets = rewards + self.gamma * (1 - dones) * Q_targets
    
        deltas = Q_values - Q_targets
        loss = (w*deltas.pow(2)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        deltas = np.abs(deltas.detach().cpu().numpy().reshape(-1))
        for i in range(self.bs):
            self.memory.insert(deltas[i], index_set[i])
    def soft_update(self, tau):
        for target_param, local_param in zip(self.Q_target.parameters(), self.Q_local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

In [113]:
class Stock_Env:
    def __init__(self, initial_asset, data, cost, time, record,train=True,market=False, code='AAPL'):
        self.asset = initial_asset
        self.cash = initial_asset
        self.stock = 0
        self.stockvalue = 0
        self.data = data
        self.time = np.unique(time)
        self.cost = cost
        self.totalday = 0
        self.history=[]
        self.total_cost = 0
        self.initial_asset = initial_asset
        self.timeid = time_period
        self.rowid = self.time[time_period]
        self.action_space = 11
        self.codeid = pd.DataFrame(range(len(codes)), index=codes)
        self.record = record
        self.train=train
        self.market=market
        self.code = code
    
    def reset(self):
        self.asset = self.initial_asset
        self.cash = self.initial_asset
        self.stock = 0
        self.stockvalue = 0
        self.history=[]
        self.total_cost = 0
        if self.train:
            temp_time = np.random.randint(time_period, len(self.time)-252)
            self.rowid = self.time[temp_time]
            while (self.rowid, self.code) not in self.data.index:
                temp_time = np.random.randint(time_period, len(self.time)-252)
                self.rowid = self.time[temp_time]
            self.timeid = temp_time
            self.totalday = temp_time
        else:
            temp_time = time_period
            self.rowid = self.time[temp_time]
            self.timeid = temp_time
            self.totalday = temp_time
        self.totalday = temp_time
        temp = self.record.loc[self.time[self.timeid+1-time_period:self.timeid+1],codes_dict[self.code]*3+1:codes_dict[self.code]*3+3].values.reshape(1,-1)
        # print(temp.shape, self.stockvalue.shape)
        return temp
        # for i in range(time_period):
        #     temp.append(list(self.get_full_data(self.data.loc[self.time[temp_time-time_period+i+1]]).values.reshape(-1)))       
        # return np.array(temp)
    
    def get_full_data(self,x):
        full_df = pd.DataFrame(0, index = self.codes, columns = x.columns)
        full_df.loc[set(full_df.index).intersection(set(x.index))] = x.loc[set(full_df.index).intersection(set(x.index))]
        return full_df
    
    
    def step(self, action):
        done = False
        # print(self.timeid, self.totalday)
        states = self.data.loc[self.rowid, self.code]   
        self.timeid +=1
        self.rowid = self.time[self.timeid]
        self.totalday += 1
        while (self.rowid, self.code) not in self.data.index:
            self.timeid +=1
            if (self.timeid != len(self.time)-1):
                self.rowid = self.time[self.timeid]
                self.totalday += 1
            else:
                return np.zeros(time_period*3), 0, True
        if (self.timeid == len(self.time)-1):
            done = True
        if (self.train==True) and (self.totalday>=251) :
            done = True
        next_state = self.data.loc[self.rowid, self.code]
        last_asset = self.asset
        price = next_state['open']
        old_asset = self.cash + self.stock*price
        self.asset = old_asset
        target_value = action*0.1*self.asset
        distance = target_value - self.stock*price
        stock_distance = int(distance/(price*(1+self.cost)))
        self.stock += stock_distance
        self.cash = self.cash - distance - np.abs(stock_distance*self.cost*price)
        self.asset = self.cash+self.stock*price
        market_value = self.stock * next_state['close']
        self.asset = market_value + self.cash
        reward = self.asset - last_asset
        reward = reward/last_asset
        # self.stock = stock
        # print(self.record.loc[self.time[self.timeid+1-time_period:self.timeid+1]])
        return (self.record.loc[self.time[self.timeid+1-time_period:self.timeid+1], codes_dict[self.code]*3+1:codes_dict[self.code]*3+3].values.reshape(1,-1), reward, done)

#     def step(self, action):
#         done = False
#         states = self.data.loc[self.rowid]        
#         self.timeid +=1
#         self.rowid = self.time[self.timeid]
#         if (self.timeid == len(self.time)-1):
#             done = True
#         if (self.train==True) and (self.totalday>=252) :
#             dont = True
#         self.totalday+=1
#         next_state = self.data.loc[self.rowid]
#         last_asset = self.asset
#         idx = self.codeid.loc[next_state.index].values.reshape(1,-1)
#         # Calculate the total assets at the beginning of the next day
#         self.stockvalue[idx] = self.stock[idx].reshape(1,-1)*next_state['open'].values.reshape(1,-1)
#         old_asset = self.cash + self.stockvalue.sum()
        
#         self.asset = old_asset
#         # Calculate the position for each stock and cash
#         action = np.exp(action)/np.exp(action).sum()
#         # Get the stock asset value, where the last value of action is the position of cash.
#         stockvalue_ = old_asset * (1-action[-1])
        
#         # Adjust the postion
#         target_value = action*old_asset
#         distance = target_value[:-1] - self.stockvalue
#         stock_distance = (distance[idx].reshape(-1))/((next_state['open'].values*(1+self.cost)).astype(int).reshape(-1))
#         # stock_distance /= 5
#         self.stock[idx] += stock_distance
#         self.cash = self.cash - distance[idx].sum() - np.abs(stock_distance*self.cost*next_state['open'].values).sum()
#         self.stockvalue[idx] = self.stock[idx] * next_state['close'].values
            
#         # Calculate new asset
#         self.asset = self.stockvalue.sum() + self.cash
        
#         reward = (self.asset - last_asset)/self.initial_asset
#         if self.market:
#             reward -= next_state['market'].values.mean()
        
#         # Generate new states
#         temp = self.record.loc[self.time[self.timeid+1-time_period:self.timeid+1]].values
#         temp = np.concatenate((temp, self.stockvalue/(self.asset)),axis=None)
#         # print(temp.shape)
#         # for i in range(time_period):
#         #     temp.append(list(self.get_full_data(self.data.loc[self.time[self.timeid-time_period+i+1]]).values.reshape(-1)))
#         return (temp, reward, done)
#         # return (self.data[self.rowid-time_period:self.rowid][indicators].values, reward, done)

In [114]:
env = Stock_Env(1000000, stock_df_train, 0.001, time = [x[0] for x in stock_df_train.index], record = stock_df_train_, train=True, code='META')
env_test = Stock_Env(1000000, stock_df_test, 0.001, time = [x[0] for x in stock_df_test.index], record = stock_df_test_, train=False, code='META')
agent = Agent(2*3, 11, 64, 0.001, 0.001, 0.99, 'cuda', True)

In [122]:
import warnings
warnings.filterwarnings('ignore')
#env = gym.make()
num_episode = 500
max_t = 1000
reward_log = []
average_log = [] # monitor training process
eps = 1
eps_decay = 0.995
eps_min = 0.01
C = 4 # update weights every C steps

def validation(env, agent):
    # agent.mu=0
    env.mu=[0]
    rewards_log = []
    average_log = []
    episodic_reward = 0
    done = False
    frame = env.reset()
    state = frame.reshape(-1)
    t = 0
    while not done and t < max_t:
        t += 1
        action = agent.act(state, eps)
        frame, reward, done = env.step(action)
        rewards_log.append(reward)
        episodic_reward += reward
    sharpe = qs.stats.sharpe(pd.DataFrame(rewards_log))
    return env.asset, episodic_reward, sharpe


def train(env, agent, num_episode, eps_init, eps_decay, eps_min, max_t, num_frame=1, constant=0):
    rewards_log = []
    average_log = []
    state_history = []
    action_history = []
    done_history = []
    reward_history = []
    validation_log = []
    validation_average_log = []
    sharpe_log = []
    average_sharpe = []
    eps = eps_init

    for i in range(1, 1 + num_episode):

        episodic_reward = 0
        done = False
        frame = env.reset().reshape(-1)
        state_deque = deque(maxlen=num_frame)
        for _ in range(num_frame):
            state_deque.append(frame)
        state = np.stack(state_deque, axis=0)
        # state = np.expand_dims(state, axis=0)
        t = 0

        while not done and t < max_t:

            t += 1
            action = agent.act(state, eps)
            frame, reward, done = env.step(action)
            frame = frame.reshape(-1)
            state_deque.append(frame)
            next_state = np.stack(state_deque, axis=0)
            # next_state = np.expand_dims(next_state, axis=0)
            agent.memory.remember(state, action, reward, next_state, done)

            if t % 5 == 0 and len(agent.memory) >= agent.bs:
                agent.learn()
                agent.soft_update(agent.tau)

            state = next_state.copy()
            episodic_reward += reward

        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))
        val_asset, val_reward, val_sharpe = validation(env_test, agent)

        validation_log.append(val_reward)
        validation_average_log.append(np.mean(validation_log[-100:]))
        sharpe_log.append(val_sharpe.values[0])
        average_sharpe.append(np.mean(sharpe_log[-100:]))
        print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}, valReward {:.3f}, val Average Reward {:.3f}, Asset {:.2f}, Validation Asset {:.2f}, Average Validation Sharpe {:.2f}'.format(i, episodic_reward, average_log[-1], val_reward, validation_average_log[-1], env.asset, val_asset, average_sharpe[-1]), end='')
        if i % 100 == 0:
            print()

        eps = max(eps * eps_decay, eps_min)

    return rewards_log

In [121]:
codes_dict = dict(zip(codes, range(len(codes))))
train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

Episode 100, Reward 0.082, Average Reward 0.101, valReward 0.030, val Average Reward 0.137, Asset 1084160.56, Validation Asset 1019392.06, Average Validation Sharpe 0.98
Episode 200, Reward 0.122, Average Reward 0.145, valReward 0.196, val Average Reward 0.126, Asset 1127154.06, Validation Asset 1207427.74, Average Validation Sharpe 0.90
Episode 300, Reward 0.129, Average Reward 0.179, valReward 0.283, val Average Reward 0.140, Asset 1136195.73, Validation Asset 1310643.77, Average Validation Sharpe 1.00
Episode 400, Reward 0.242, Average Reward 0.199, valReward 0.181, val Average Reward 0.150, Asset 1268522.25, Validation Asset 1182523.00, Average Validation Sharpe 1.06
Episode 435, Reward 0.246, Average Reward 0.199, valReward 0.191, val Average Reward 0.138, Asset 1275521.92, Validation Asset 1197626.99, Average Validation Sharpe 0.97

KeyboardInterrupt: 

In [123]:
for code in codes:
    print(code, ' Begins')
    print('---------------------------------------------')
    env = Stock_Env(1000000, stock_df_train, 0.001, time = [x[0] for x in stock_df_train.index], record = stock_df_train_, train=True, code=code)
    env_test = Stock_Env(1000000, stock_df_test, 0.001, time = [x[0] for x in stock_df_test.index], record = stock_df_test_, train=False, code=code)
    agent = Agent(2*3, 11, 64, 0.001, 0.001, 0.99, 'cuda', True)
    train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

AAPL  Begins
---------------------------------------------
Episode 100, Reward 0.129, Average Reward 0.032, valReward 0.297, val Average Reward 0.287, Asset 1132380.56, Validation Asset 1327628.71, Average Validation Sharpe 2.08
Episode 200, Reward 0.177, Average Reward 0.108, valReward 0.355, val Average Reward 0.280, Asset 1188433.05, Validation Asset 1411277.41, Average Validation Sharpe 2.00
Episode 300, Reward 0.159, Average Reward 0.154, valReward 0.376, val Average Reward 0.280, Asset 1167362.36, Validation Asset 1438848.95, Average Validation Sharpe 2.02
Episode 400, Reward 0.313, Average Reward 0.215, valReward 0.329, val Average Reward 0.276, Asset 1361017.75, Validation Asset 1375814.26, Average Validation Sharpe 2.01
Episode 500, Reward 0.342, Average Reward 0.220, valReward 0.324, val Average Reward 0.283, Asset 1399216.84, Validation Asset 1370050.60, Average Validation Sharpe 2.04
NFLX  Begins
---------------------------------------------
Episode 100, Reward 0.247, Avera