# Algorithm Implement

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import quantstats as qs
time_period = 2
import warnings
warnings.filterwarnings('ignore')
from collections import deque

class Q_Network(nn.Module):

    def __init__(self, state_size, action_size, hidden=[64, 64]):
        super(Q_Network, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden[0])
        self.fc2 = nn.Linear(hidden[0], hidden[1])
        self.fc3 = nn.Linear(hidden[1], action_size)

    def forward(self, state):
        x = state
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# class Q_Network(nn.Module):
#     '''
#     The input of this network should have shape (num_frame, 80, 80)
#     '''

#     def __init__(self, num_frame, num_action):
#         super(Q_Network, self).__init__()
#         self.conv1 = nn.Conv2d(in_channels=num_frame, out_channels=32, kernel_size=(2,1), stride=1, padding=2)  # 16, 20, 20
#         self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
#         self.conv3 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,1), stride=1)  # 32, 9, 9
#         self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
#         self.conv5 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,2), stride=1)  # 32, 9, 9
#         self.pool = nn.AvgPool2d(kernel_size=(2,1))
#         self.fc1 = nn.Linear(672, 256)
#         self.fc2 = nn.Linear(256, num_action)
#         self.sf = nn.Softmax()

#     def forward(self, image):
#         x = F.relu(self.pool(self.conv1(image)))
#         x = F.relu(self.pool(self.conv2(x)))
#         x = F.relu(self.pool(self.conv3(x)))
#         x = x.view(-1, 672)
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
#         x = self.sf(x)
#         return x

In [2]:
from torchsummary import summary

# Data Loading

In [3]:
import pandas as pd
import numpy as np
data = pd.read_csv('../../test_data3.csv')
codes = data['symbol'].unique()
stock_df = data
stock_df = stock_df[['Date','symbol','Open','High','Low','Close','Volume','Dividends','Stock Splits','Pctchange', 'Neg','Neu','Pos']]
stock_df.columns = ['date', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'dividends','stock splits', 'pctchange','Neg', 'Neu', 'Pos']
stock_df['pctchange'] = (stock_df['close'] - stock_df['open'])/stock_df['open']

# Technical Indicators

In [4]:
from finta import TA

In [5]:
stock_df['SMA42'] = TA.SMA(stock_df, 42)
stock_df['SMA5'] = TA.SMA(stock_df, 5)
stock_df['SMA15'] = TA.SMA(stock_df, 15)
stock_df['AO'] = TA.AO(stock_df)
stock_df['OVB'] = TA.OBV(stock_df)
stock_df[['VW_MACD','MACD_SIGNAL']] = TA.VW_MACD(stock_df)
stock_df['RSI'] = TA.RSI(stock_df)
stock_df['CMO'] = TA.CMO(stock_df)

In [6]:
stock_df = stock_df.dropna()

In [7]:
stock_df_train = stock_df[stock_df['date']<='2019-01-01'].groupby(['date','symbol']).agg('mean')
# stock_df_train = stock_df_train[stock_df_train['date']>='2023-01-01']
stock_df_test = stock_df[stock_df['date']>'2019-01-01']
stock_df_test = stock_df_test[stock_df_test['date']<='2019-12-31'].groupby(['date','symbol']).agg('mean')

train_date = sorted([x[0] for x in stock_df_train.index])
test_date = sorted([x[0] for x in stock_df_test.index])

In [8]:
# indicators = ['open', 'high', 'low', 'close', 'volume', 'positive', 'neutral', 'negative','SMA42', 'SMA5', 'SMA15', 'AO', 'OVB','VW_MACD',
#        'MACD_SIGNAL', 'RSI', 'CMO']

indicators = ['Neg','Neu','Pos']

# indicators = ['pctchange', 'volume', 'positive', 'neutral', 'negative']
# indicators = ['positive', 'neutral', 'negative']
# indicators = ['sentiment']

from tqdm import tqdm
def get_full_data(x, date):
        full_df = pd.DataFrame(0, index = codes, columns = x.columns)
        full_df.loc[set(full_df.index).intersection(set(x.index))] = x.loc[set(full_df.index).intersection(set(x.index))]
        v = full_df.values.reshape(1,-1)
        # full_df['date']=date
        return [date]+list(v[0])
    
dates = np.unique([x[0] for x in stock_df_train.index])
res = []
for date in tqdm(dates):
    x = stock_df_train[indicators].loc[date]
    res.append(get_full_data(x, date))

# res = pd.concat(res).reset_index()
# res.columns = ['tic', 'open', 'high', 'low', 'close', 'volume', 'positive',
#        'neutral', 'negative', 'pctchange', 'date']
stock_df_train_ = pd.DataFrame(res).set_index(0)

dates = np.unique([x[0] for x in stock_df_test.index])
res = []
for date in tqdm(dates):
    x = stock_df_test[indicators].loc[date]
    res.append(get_full_data(x, date))
    
# res = pd.concat(res).reset_index()
# res.columns = ['tic', 'open', 'high', 'low', 'close', 'volume', 'positive',
#        'neutral', 'negative', 'pctchange', 'date']
stock_df_test_ = pd.DataFrame(res).set_index(0)


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

stock_df_train_1 = scaler.fit_transform(stock_df_train_)
stock_df_test_1 = scaler.transform(stock_df_test_)

stock_df_train_ = pd.DataFrame(stock_df_train_1, index = stock_df_train_.index, columns = stock_df_train_.columns)
stock_df_test_ = pd.DataFrame(stock_df_test_1, index = stock_df_test_.index, columns = stock_df_test_.columns)

100%|███████████████████████████████████████████████████████████████████████████████| 502/502 [00:00<00:00, 719.14it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 726.58it/s]


# RL

In [16]:
codes_dict = dict(zip(codes, range(len(codes))))

In [9]:
tst = None
import random
from collections import deque
import torch
import torch.optim as optim
import numpy as np

# from networks import *

class Agent:

    def __init__(self, state_size, action_size, bs, lr, tau, gamma, device):
        '''
        When dealing with visual inputs, state_size should work as num_of_frame
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.bs = bs
        self.lr = lr
        self.tau = tau
        self.gamma = gamma
        self.device = device
        self.Q_local = Q_Network(self.state_size, self.action_size).to(device)
        self.Q_target = Q_Network(self.state_size, self.action_size).to(device)
        self.soft_update(1)
        self.optimizer = optim.Adam(self.Q_local.parameters(), self.lr)
        self.memory = deque(maxlen=100000)
        self.tst = None
        self.mu = [0]
        self.last_action = 0

    def act(self, state, eps=0):
        if random.random() > eps:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            with torch.no_grad():
                action_values = self.Q_local(state).reshape(-1)
            if (action_values).max() > np.max(self.mu):
                # self.mu = 0.95*self.mu + 0.05*action_values.max()

                self.mu.append(action_values.max().cpu().data.numpy())                
                if len(self.mu) > 10:
                    self.mu = self.mu[-10:]
                self.last_action = np.argmax(action_values.cpu().data.numpy())
                return self.last_action
            else:
                return self.last_action
        else:
            action = random.choice(np.arange(self.action_size))
            self.last_action = action
            return action

    def learn(self):
        experiences = random.sample(self.memory, self.bs)
        states = torch.from_numpy(np.vstack([e[0] for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e[1] for e in experiences])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e[2] for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e[3] for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e[4] for e in experiences]).astype(np.uint8)).float().to(self.device)
        self.tst = states
        Q_values = self.Q_local(states).reshape(-1,11)
        Q_values = torch.gather(input=Q_values, dim=-1, index=actions)
        with torch.no_grad():
            Q_targets = self.Q_target(next_states)
            Q_targets, _ = torch.max(input=Q_targets, dim=-1, keepdim=True)
            Q_targets = rewards + self.gamma * (1 - dones) * Q_targets

        loss = (Q_values - Q_targets).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_update(self, tau):
        for target_param, local_param in zip(self.Q_target.parameters(), self.Q_local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

In [10]:
# class Stock_Env:
#     def __init__(self, initial_asset, data, cost):
#         self.asset = initial_asset
#         self.cash = initial_asset
#         self.stock = 0
#         self.data = data
#         self.time = data.iloc[time_period]['date']
#         self.cost = cost
#         self.history=[]
#         self.total_cost = 0
#         self.initial_asset = initial_asset
#         self.rowid = time_period
#         self.action_space = np.array(list(range(11)))
    
#     def reset(self):
#         self.asset = self.initial_asset
#         self.cash = self.initial_asset
#         self.stock = 0
#         self.time = self.data.iloc[100]['date']
#         self.history=[]
#         self.total_cost = 0    
#         self.rowid = time_period
#         return self.data[:time_period][indicators].values
    
#     def step(self, action):
#         done = False
#         states = self.data.iloc[self.rowid]        
#         self.rowid +=1
#         if self.rowid == len(self.data)-1:
#             done = True
#         next_state = self.data.iloc[self.rowid]
#         last_asset = self.asset
#         price = next_state['open']
#         old_asset = self.cash + self.stock*price
#         self.asset = old_asset
#         target_value = action*0.1*self.asset
#         distance = target_value - self.stock*price
#         stock_distance = int(distance/(price*(1+self.cost)))
#         self.stock += stock_distance
#         self.cash = self.cash - distance - np.abs(stock_distance*self.cost*price)
#         self.asset = self.cash+self.stock*price
#         market_value = self.stock * next_state['close']
#         self.asset = market_value + self.cash
#         reward = self.asset - last_asset
#         self.time = next_state['date']
#         # self.stock = stock
#         return (self.data[self.rowid-time_period:self.rowid][indicators].values, reward, done)

In [11]:
class Stock_Env:
    def __init__(self, initial_asset, data, cost, time, record,train=True,market=False, code='AAPL'):
        self.asset = initial_asset
        self.cash = initial_asset
        self.stock = 0
        self.stockvalue = 0
        self.data = data
        self.time = np.unique(time)
        self.cost = cost
        self.totalday = 0
        self.history=[]
        self.total_cost = 0
        self.initial_asset = initial_asset
        self.timeid = time_period
        self.rowid = self.time[time_period]
        self.action_space = 11
        self.codeid = pd.DataFrame(range(len(codes)), index=codes)
        self.record = record
        self.train=train
        self.market=market
        self.code = code
    
    def reset(self):
        self.asset = self.initial_asset
        self.cash = self.initial_asset
        self.stock = 0
        self.stockvalue = 0
        self.history=[]
        self.total_cost = 0
        if self.train:
            temp_time = np.random.randint(time_period, len(self.time)-252)
            self.rowid = self.time[temp_time]
            while (self.rowid, self.code) not in self.data.index:
                temp_time = np.random.randint(time_period, len(self.time)-252)
                self.rowid = self.time[temp_time]
            self.timeid = temp_time
            self.totalday = temp_time
        else:
            temp_time = time_period
            self.rowid = self.time[temp_time]
            self.timeid = temp_time
            self.totalday = temp_time
        self.totalday = temp_time
        temp = self.record.loc[self.time[self.timeid+1-time_period:self.timeid+1],codes_dict[self.code]*3+1:codes_dict[self.code]*3+3].values.reshape(1,-1)
        # print(temp.shape, self.stockvalue.shape)
        return temp
        # for i in range(time_period):
        #     temp.append(list(self.get_full_data(self.data.loc[self.time[temp_time-time_period+i+1]]).values.reshape(-1)))       
        # return np.array(temp)
    
    def get_full_data(self,x):
        full_df = pd.DataFrame(0, index = self.codes, columns = x.columns)
        full_df.loc[set(full_df.index).intersection(set(x.index))] = x.loc[set(full_df.index).intersection(set(x.index))]
        return full_df
    
    
    def step(self, action):
        done = False
        # print(self.timeid, self.totalday)
        states = self.data.loc[self.rowid, self.code]   
        self.timeid +=1
        self.rowid = self.time[self.timeid]
        self.totalday += 1
        while (self.rowid, self.code) not in self.data.index:
            self.timeid +=1
            if (self.timeid != len(self.time)-1):
                self.rowid = self.time[self.timeid]
                self.totalday += 1
            else:
                return np.zeros(time_period*3), 0, True
        if (self.timeid == len(self.time)-1):
            done = True
        if (self.train==True) and (self.totalday>=251) :
            done = True
        next_state = self.data.loc[self.rowid, self.code]
        last_asset = self.asset
        price = next_state['open']
        old_asset = self.cash + self.stock*price
        self.asset = old_asset
        target_value = action*0.1*self.asset
        distance = target_value - self.stock*price
        stock_distance = int(distance/(price*(1+self.cost)))
        self.stock += stock_distance
        self.cash = self.cash - distance - np.abs(stock_distance*self.cost*price)
        self.asset = self.cash+self.stock*price
        market_value = self.stock * next_state['close']
        self.asset = market_value + self.cash
        reward = self.asset - last_asset
        reward = reward/last_asset
        # self.stock = stock
        # print(self.record.loc[self.time[self.timeid+1-time_period:self.timeid+1]])
        return (self.record.loc[self.time[self.timeid+1-time_period:self.timeid+1], codes_dict[self.code]*3+1:codes_dict[self.code]*3+3].values.reshape(1,-1), reward, done)

#     def step(self, action):
#         done = False
#         states = self.data.loc[self.rowid]        
#         self.timeid +=1
#         self.rowid = self.time[self.timeid]
#         if (self.timeid == len(self.time)-1):
#             done = True
#         if (self.train==True) and (self.totalday>=252) :
#             dont = True
#         self.totalday+=1
#         next_state = self.data.loc[self.rowid]
#         last_asset = self.asset
#         idx = self.codeid.loc[next_state.index].values.reshape(1,-1)
#         # Calculate the total assets at the beginning of the next day
#         self.stockvalue[idx] = self.stock[idx].reshape(1,-1)*next_state['open'].values.reshape(1,-1)
#         old_asset = self.cash + self.stockvalue.sum()
        
#         self.asset = old_asset
#         # Calculate the position for each stock and cash
#         action = np.exp(action)/np.exp(action).sum()
#         # Get the stock asset value, where the last value of action is the position of cash.
#         stockvalue_ = old_asset * (1-action[-1])
        
#         # Adjust the postion
#         target_value = action*old_asset
#         distance = target_value[:-1] - self.stockvalue
#         stock_distance = (distance[idx].reshape(-1))/((next_state['open'].values*(1+self.cost)).astype(int).reshape(-1))
#         # stock_distance /= 5
#         self.stock[idx] += stock_distance
#         self.cash = self.cash - distance[idx].sum() - np.abs(stock_distance*self.cost*next_state['open'].values).sum()
#         self.stockvalue[idx] = self.stock[idx] * next_state['close'].values
            
#         # Calculate new asset
#         self.asset = self.stockvalue.sum() + self.cash
        
#         reward = (self.asset - last_asset)/self.initial_asset
#         if self.market:
#             reward -= next_state['market'].values.mean()
        
#         # Generate new states
#         temp = self.record.loc[self.time[self.timeid+1-time_period:self.timeid+1]].values
#         temp = np.concatenate((temp, self.stockvalue/(self.asset)),axis=None)
#         # print(temp.shape)
#         # for i in range(time_period):
#         #     temp.append(list(self.get_full_data(self.data.loc[self.time[self.timeid-time_period+i+1]]).values.reshape(-1)))
#         return (temp, reward, done)
#         # return (self.data[self.rowid-time_period:self.rowid][indicators].values, reward, done)

In [12]:
len(env_test.time)

NameError: name 'env_test' is not defined

In [None]:
# #env = gym.make()
# env = Stock_Env(1000000, stock_df_train, 0.002, time = [x[0] for x in stock_df_train.index], record = stock_df_train_, code='AAPL')
# num_episode = 5
# max_t = 1000
# reward_log = []

# for _ in range(num_episode):
    
#     # initialize
#     env.reset()
#     t = 0
#     episodic_reward = 0
    
#     for t in range(max_t):
        
#         #env.render()
#         action = np.random.randint(11) # random action
#         _, reward, done = env.step(action)
#         episodic_reward += reward
#         if done:
#             break
    
#     reward_log.append(episodic_reward)

In [13]:
env = Stock_Env(1000000, stock_df_train, 0.001, time = [x[0] for x in stock_df_train.index], record = stock_df_train_, train=True, code='META')
env_test = Stock_Env(1000000, stock_df_test, 0.001, time = [x[0] for x in stock_df_test.index], record = stock_df_test_, train=False, code='META')
agent = Agent(2*3, 11, 64, 0.001, 0.001, 0.99, 'cuda')

In [14]:
import warnings
warnings.filterwarnings('ignore')
#env = gym.make()
num_episode = 1000
max_t = 1000
reward_log = []
average_log = [] # monitor training process
eps = 1
eps_decay = 0.997
eps_min = 0.01
C = 4 # update weights every C steps

def validation(env, agent):
    # agent.mu=0
    env.mu=[0]
    rewards_log = []
    average_log = []
    episodic_reward = 0
    done = False
    frame = env.reset()
    state = frame
    t = 0
    while not done and t < max_t:
        t += 1
        action = agent.act(state, eps)
        frame, reward, done = env.step(action)
        rewards_log.append(reward)
        episodic_reward += reward
    sharpe = qs.stats.sharpe(pd.DataFrame(rewards_log))
    return env.asset, episodic_reward, sharpe


def train(env, agent, num_episode, eps_init, eps_decay, eps_min, max_t, num_frame=1, constant=0):
    # global rewards_log, average_log, state_history, action_history, done_history, reward_history
    rewards_log = []
    average_log = []
    state_history = []
    action_history = []
    done_history = []
    reward_history = []
    validation_log = []
    validation_average_log = []
    sharpe_log = []
    average_sharpe = []
    eps = eps_init
    for i in range(1, 1 + num_episode):
        env.mu=[0]
        episodic_reward = 0
        done = False
        frame = env.reset()
        state_deque = deque(maxlen=num_frame)
        for _ in range(num_frame):
            state_deque.append(frame)
        state = np.stack(state_deque, axis=0)
        state = np.expand_dims(state, axis=0)
        t = 0

        while not done and t < max_t:

            t += 1
            action = agent.act(state, eps)
            frame, reward, done = env.step(action)
            state_deque.append(frame)
            next_state = np.stack(state_deque, axis=0)
            next_state = np.expand_dims(next_state, axis=0)
            agent.memory.append((state, action, reward, next_state, done))

            if t % 5 == 0 and len(agent.memory) >= agent.bs:
                agent.learn()
                agent.soft_update(agent.tau)

            state = next_state.copy()
            episodic_reward += reward

        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))
        val_asset, val_reward, val_sharpe = validation(env_test, agent)

        validation_log.append(val_reward)
        validation_average_log.append(np.mean(validation_log[-100:]))
        sharpe_log.append(val_sharpe.values[0])
        average_sharpe.append(np.mean(sharpe_log[-100:]))
        print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}, valReward {:.3f}, val Average Reward {:.3f}, Asset {:.2f}, Validation Asset {:.2f}, Average Validation Sharpe {:.2f}'.format(i, episodic_reward, average_log[-1], val_reward, validation_average_log[-1], env.asset, val_asset, average_sharpe[-1]), end='')
        # print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}'.format(i, episodic_reward, average_log[-1]), end='')
        if i % 100 == 0:
            print()

        eps = max(eps * eps_decay, eps_min)

    return rewards_log

In [17]:
train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

Episode 100, Reward 0.149, Average Reward 0.067, valReward 0.020, val Average Reward 0.136, Asset 1159046.67, Validation Asset 1007133.83, Average Validation Sharpe 0.95
Episode 200, Reward 0.146, Average Reward 0.079, valReward 0.228, val Average Reward 0.152, Asset 1155355.00, Validation Asset 1239291.81, Average Validation Sharpe 1.06
Episode 300, Reward 0.162, Average Reward 0.074, valReward 0.069, val Average Reward 0.143, Asset 1172819.31, Validation Asset 1061096.15, Average Validation Sharpe 1.00
Episode 400, Reward 0.017, Average Reward 0.082, valReward -0.012, val Average Reward 0.160, Asset 1016732.79, Validation Asset 978388.28, Average Validation Sharpe 1.14
Episode 500, Reward 0.112, Average Reward 0.083, valReward 0.113, val Average Reward 0.140, Asset 1116405.10, Validation Asset 1106083.33, Average Validation Sharpe 1.00
Episode 600, Reward 0.053, Average Reward 0.082, valReward 0.075, val Average Reward 0.129, Asset 1054144.96, Validation Asset 1062301.37, Average Val

[0.06257656551460147,
 0.0582885995087275,
 0.0712353202545123,
 -0.001973518635573842,
 0.08786027652134903,
 0.05297677193611557,
 0.08620213518201852,
 0.06801009416020443,
 0.06877563576635294,
 0.1059738025315718,
 0.10031878924713047,
 -0.008551638852485183,
 0.04836795896808232,
 0.01323562596069566,
 -0.015578154378565603,
 0.13756432222805262,
 0.10572857452491076,
 0.040799524443656275,
 0.09971896683980616,
 0.0842040385710471,
 0.09169840334034533,
 0.0871222459631878,
 0.08413791929558533,
 0.07771636384057774,
 0.049213723703907704,
 0.008507699428157794,
 0.09129160716054831,
 0.13521602098317329,
 0.10973551318263998,
 0.07591058588049646,
 0.10722738562745496,
 0.04585893905649825,
 0.04200912122631058,
 0.07047641451991835,
 0.028199530088499437,
 0.09637984606391933,
 0.02449857505227989,
 0.04591047390438034,
 0.10093567663862477,
 0.07854220089823004,
 -0.006900440964379466,
 0.10269572321211011,
 0.09665563213722544,
 0.039087245510347625,
 0.11752935307814247,
 0

In [None]:
env_test.timeid

In [858]:
for code in codes:
    print(code, ' Begins')
    print('---------------------------------------------')
    env = Stock_Env(1000000, stock_df_train, 0.001, time = [x[0] for x in stock_df_train.index], record = stock_df_train_, train=True, code=code)
    env_test = Stock_Env(1000000, stock_df_test, 0.001, time = [x[0] for x in stock_df_test.index], record = stock_df_test_, train=False, code=code)
    agent = Agent(2*3, 11, 64, 0.001, 0.001, 0.99, 'cuda')
    train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

AAPL  Begins
---------------------------------------------
Episode 100, Reward 0.108, Average Reward 0.027, valReward 0.339, val Average Reward 0.273, Asset 1106570.61, Validation Asset 1392007.80, Average Validation Sharpe 1.97
Episode 200, Reward 0.117, Average Reward 0.041, valReward 0.354, val Average Reward 0.266, Asset 1118960.24, Validation Asset 1412565.14, Average Validation Sharpe 1.94
Episode 300, Reward 0.014, Average Reward 0.049, valReward 0.328, val Average Reward 0.279, Asset 1010468.78, Validation Asset 1374042.76, Average Validation Sharpe 2.04
Episode 400, Reward 0.149, Average Reward 0.055, valReward 0.302, val Average Reward 0.285, Asset 1154856.22, Validation Asset 1339423.05, Average Validation Sharpe 2.07
Episode 500, Reward -0.006, Average Reward 0.073, valReward 0.311, val Average Reward 0.290, Asset 994032.45, Validation Asset 1355198.59, Average Validation Sharpe 2.12
Episode 600, Reward 0.077, Average Reward 0.078, valReward 0.340, val Average Reward 0.274,

In [None]:
# eps_init = eps
# constant = C
# num_frame =1

# rewards_log = []
# average_log = []
# eps = eps_init

# for i in range(1, 1 + num_episode):
#     episodic_reward = 0
#     done = False
#     frame = env.reset()
#     state_deque = deque(maxlen=num_frame)
#     for _ in range(num_frame):
#         state_deque.append(frame)
#     state = np.stack(state_deque, axis=0)
#     state = np.expand_dims(state, axis=0)
#     t = 0

#     while not done and t < max_t:

#         t += 1
#         action = agent.act(state, eps)
#         frame, reward, done = env.step(action)
#         state_deque.append(frame)
#         next_state = np.stack(state_deque, axis=0)
#         next_state = np.expand_dims(next_state, axis=0)
#         agent.memory.append((state, action, reward, next_state, done))

#         if t % 5 == 0 and len(agent.memory) >= agent.bs:
#             agent.learn()
#             agent.soft_update(agent.tau)

#         state = next_state.copy()
#         episodic_reward += reward

#     rewards_log.append(episodic_reward)
#     average_log.append(np.mean(rewards_log[-100:]))
#     print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}'.format(i, episodic_reward, average_log[-1]), end='')
#     if i % 100 == 0:
#         print()

#     eps = max(eps * eps_decay, eps_min)