# Algorithm Implementation

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import quantstats as qs
import sys
time_period = 2
sys.path.append('./')
from utlis import get_data, Stock_Env
class Q_Network(nn.Module):
    """Categorical (distributional) Q-network.

    For every action the network outputs a probability distribution over
    ``N`` fixed return values ("atoms") spaced evenly between ``Vmin`` and
    ``Vmax``. The Q-value of an action is the expectation of that
    distribution.

    Args:
        state_size: Number of input features per state.
        action_size: Number of discrete actions.
        N: Number of atoms in the value support.
        Vmin: Smallest atom value.
        Vmax: Largest atom value.
        hidden: Sizes of the two hidden layers (any indexable pair).
    """

    def __init__(self, state_size, action_size, N, Vmin, Vmax, hidden=(64, 64)):
        super(Q_Network, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden[0])
        self.fc2 = nn.Linear(hidden[0], hidden[1])
        self.fc3 = nn.Linear(hidden[1], action_size * N)

        self.action_size = action_size
        self.N = N
        # Registered as a buffer so the support follows the module through
        # ``.to(device)`` instead of being pinned to CUDA. Non-persistent so
        # the state_dict keys stay exactly what they were before.
        self.register_buffer(
            'values',
            torch.linspace(Vmin, Vmax, N).view(1, 1, -1),  # (1, 1, N)
            persistent=False,
        )

    def forward(self, state):
        """Return ``(log_probs, Q_values)`` for a batch of states.

        Args:
            state: Tensor whose last dimension is ``state_size``.

        Returns:
            log_probs: ``(batch_size, action_size, N)`` log-probabilities
                over the atoms for each action.
            Q_values: ``(batch_size, action_size)`` expected value per action.
        """
        x = state  # (batch_size, state_size)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # (batch_size, action_size*N)
        x = x.view(-1, self.action_size, self.N)  # (batch_size, action_size, N)
        log_probs = F.log_softmax(x, dim=2)  # (batch_size, action_size, N)
        # Expectation over the support: sum_i p_i * z_i.
        Q_values = (log_probs.exp() * self.values).sum(dim=2)  # (batch_size, action_size)

        return log_probs, Q_values

# Data Loading

In [2]:
# Load the train/test frames, their companion "record" frames (trailing underscore)
# and the list of stock codes through the project helper imported from utlis.
stock_df_train, stock_df_test, stock_df_train_, stock_df_test_, codes = get_data()

100%|███████████████████████████████████████████████████████████████████████████████| 502/502 [00:00<00:00, 666.38it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 735.94it/s]


# Replay Buffer

In [3]:
from collections import deque

In [4]:
# from networks import *

import random
from collections import deque
import torch
import torch.optim as optim
import numpy as np


class Agent:
    """Categorical (distributional) Q-learning agent with a replay memory.

    The agent keeps a local and a target ``Q_Network``. Each network predicts,
    per action, a distribution over ``N`` atoms spaced evenly between ``Vmin``
    and ``Vmax``. Training minimises the cross-entropy between the projected
    target distribution and the local prediction.
    """

    def __init__(self, state_size, action_size, bs, lr, tau, gamma, N, Vmin, Vmax, device, visual=False, personality=1):
        '''
        Args:
            state_size: Input feature count passed to ``Q_Network``. When
                dealing with visual inputs, state_size should work as
                num_of_frame.
            action_size: Number of discrete actions.
            bs: Batch size sampled from the replay memory in ``learn``.
            lr: Adam learning rate.
            tau: Soft-update factor (stored; the caller passes it to
                ``soft_update``).
            gamma: Discount factor.
            N: Number of atoms in the value support.
            Vmin: Smallest atom value.
            Vmax: Largest atom value.
            device: Torch device (or device string) for networks and tensors.
            visual: Accepted for interface compatibility; not read here.
            personality: Multiplier applied to negative rewards in ``learn``.
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.bs = bs
        self.lr = lr
        self.tau = tau
        self.gamma = gamma
        self.device = device
        self.N = N
        self.Vmin = Vmin
        self.Vmax = Vmax
        self.vals = torch.linspace(Vmin, Vmax, N).to(device)  # atom values, shape (N,)
        self.unit = (Vmax - Vmin) / (N - 1)  # spacing between neighbouring atoms
        self.personality = personality

        self.Q_local = Q_Network(self.state_size, self.action_size, N, Vmin, Vmax).to(self.device)
        self.Q_target = Q_Network(self.state_size, self.action_size, N, Vmin, Vmax).to(self.device)

        # tau=1 copies the local weights into the target network verbatim.
        self.soft_update(1)
        self.optimizer = optim.Adam(self.Q_local.parameters(), self.lr)
        # Replay memory of (state, action, reward, next_state, done) tuples.
        self.memory = deque(maxlen=100000)

    def act(self, state, eps=0):
        """Pick an action epsilon-greedily.

        Args:
            state: Array-like state convertible to a float32 tensor.
            eps: Probability of taking a uniformly random action.

        Returns:
            Index of the chosen action.
        """
        if random.random() > eps:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            with torch.no_grad():
                _, action_values = self.Q_local(state)
            return np.argmax(action_values.cpu().numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Run one optimisation step on a batch sampled from the replay memory.

        Raises:
            ValueError: From ``random.sample`` when the memory holds fewer
                than ``bs`` transitions.
        """
        batch = random.sample(self.memory, self.bs)

        def column(index):
            # Stack field `index` of every sampled transition along axis 0.
            return np.vstack([transition[index] for transition in batch])

        states = torch.from_numpy(column(0)).float().to(self.device)
        actions = torch.from_numpy(column(1)).long().to(self.device)
        rewards = torch.from_numpy(column(2)).float().to(self.device)
        next_states = torch.from_numpy(column(3)).float().to(self.device)
        dones = torch.from_numpy(column(4).astype(np.uint8)).float().to(self.device)

        # Scale losses (negative rewards) by the agent's "personality" factor.
        rewards = torch.where(rewards < 0, rewards * self.personality, rewards)

        log_probs, _ = self.Q_local(states)  # (batch_size, action_size, N)
        # Keep only the distribution of the action that was actually taken.
        taken = actions.unsqueeze(1).expand(-1, -1, self.N)
        log_probs = torch.gather(input=log_probs, dim=1, index=taken)  # (batch_size, 1, N)

        with torch.no_grad():
            log_probs_targets, Q_targets = self.Q_target(next_states)
            # Greedy next action according to the target network.
            _, actions_target = torch.max(input=Q_targets, dim=1, keepdim=True)  # (batch_size, 1)
            greedy = actions_target.unsqueeze(1).expand(-1, -1, self.N)
            log_probs_targets = torch.gather(input=log_probs_targets, dim=1, index=greedy)
            target_distribution = self.update_distribution(log_probs_targets.exp(), rewards, dones)  # (batch_size, 1, N)

        # Cross-entropy between target and local distributions; this equals
        # D_KL(target || local) up to a term that does not depend on Q_local.
        loss = -(target_distribution * log_probs).sum(dim=2).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_distribution(self, old_distribution, reward, dones):
        """Project the Bellman-updated distribution back onto the fixed atoms.

        Every atom value ``z`` is moved to ``reward + gamma * (1 - done) * z``,
        clamped to ``[Vmin, Vmax]``, and its probability mass is split linearly
        between the two neighbouring atoms.

        Args:
            old_distribution: Probabilities over the atoms, shape
                ``(batch_size, N)`` or ``(batch_size, 1, N)``.
            reward: Rewards; reshaped to ``(batch_size, 1)``.
            dones: Terminal flags broadcastable against ``(batch_size, N)``
                (``learn`` passes ``(batch_size, 1)``).

        Returns:
            Projected probabilities, shape ``(batch_size, 1, N)``.
        """
        with torch.no_grad():
            reward = reward.view(-1, 1)
            batch_size = reward.size(0)
            assert old_distribution.size(0) == batch_size
            new_vals = self.vals.view(1, -1) * self.gamma * (1 - dones) + reward
            new_vals = torch.clamp(new_vals, self.Vmin, self.Vmax)  # (batch_size, N)

            lower = torch.floor((new_vals - self.Vmin) / self.unit).long()
            upper = torch.clamp(lower + 1, max=self.N - 1)
            # Fraction of the way from the lower atom to the next one. Clamp
            # on both sides: float32 rounding can make it slightly negative,
            # which would otherwise put a negative mass on the upper atom.
            fraction = ((new_vals - self.vals[lower]) / self.unit).clamp(0, 1)
            lower_probs = 1 - fraction

            # transit[b, j, k] = share of atom j's mass that lands on atom k.
            transit = torch.zeros((batch_size, self.N, self.N), device=new_vals.device)
            transit.scatter_add_(2, lower.unsqueeze(2), lower_probs.unsqueeze(2))
            transit.scatter_add_(2, upper.unsqueeze(2), fraction.unsqueeze(2))

            if old_distribution.dim() == 2:
                old_distribution = old_distribution.unsqueeze(1)
            return torch.bmm(old_distribution, transit)

    def soft_update(self, tau):
        """Blend local weights into the target: ``target = tau*local + (1-tau)*target``."""
        with torch.no_grad():
            for target_param, local_param in zip(self.Q_target.parameters(), self.Q_local.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

In [5]:
# Build a training and a test environment for the single code 'META', plus one agent.
# Stock_Env comes from utlis; the meaning of its positional arguments (1000000, 0.001)
# is not visible here — presumably starting capital and a fee rate, TODO confirm.
# `time` is the first element of every index entry of the frame.
env = Stock_Env(1000000, stock_df_train, 0.001, time = [x[0] for x in stock_df_train.index], record = stock_df_train_, train=True, code='META', time_period = time_period, codes=codes)
env_test = Stock_Env(1000000, stock_df_test, 0.001, time = [x[0] for x in stock_df_test.index], record = stock_df_test_, train=False, code='META', time_period = time_period, codes=codes)
# Agent positional args, in order: state_size=2*3, action_size, bs=64, lr=0.001, tau=0.001,
# gamma=0.99, N=51, Vmin=-0.1, Vmax=0.1, device='cuda', visual=True.
agent = Agent(2*3, env.action_space, 64, 0.001, 0.001, 0.99, 51, -0.1, 0.1, 'cuda', True,personality = 2)

In [6]:
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
#env = gym.make()
num_episode = 500  # number of training episodes handed to train()
max_t = 1000  # step cap per episode
reward_log = []
average_log = [] # monitor training process
eps = 1  # initial exploration rate, handed to train() as eps_init
eps_decay = 0.997  # multiplicative decay applied to eps after every episode
eps_min = 0.01  # floor for the decayed eps
# Passed to train() as `constant`; train() does not read it and learns every 5 steps.
C = 4 # update weights every C steps

def validation(env, agent, eps=0.0, max_steps=None):
    """Run one evaluation episode and summarise it.

    The policy is greedy by default. Earlier this function read the
    notebook-level ``eps`` (which stays at its initial value of 1), so every
    validation action was random.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning
            ``(frame, reward, done)``, and an ``asset`` attribute.
        agent: Object exposing ``act(state, eps)``.
        eps: Exploration rate used during evaluation; 0 means fully greedy.
        max_steps: Step cap for the episode. ``None`` falls back to the
            notebook-level ``max_t``.

    Returns:
        Tuple ``(env.asset, episodic_reward, sharpe)`` where ``sharpe`` is
        whatever ``qs.stats.sharpe`` returns for the per-step rewards wrapped
        in a one-column DataFrame.
    """
    # pandas is not imported in the visible notebook cells, so bring it in here.
    import pandas as pd

    if max_steps is None:
        max_steps = max_t

    step_rewards = []
    episodic_reward = 0
    done = False
    t = 0
    state = env.reset()
    while not done and t < max_steps:
        t += 1
        action = agent.act(state, eps)
        frame, reward, done = env.step(action)
        # Copy so later changes the environment makes to `frame` cannot
        # alter the state the agent sees next.
        state = frame.copy()
        episodic_reward += reward
        step_rewards.append(reward)
    sharpe = qs.stats.sharpe(pd.DataFrame(step_rewards))
    return env.asset, episodic_reward, sharpe

def train(env, agent, num_episode, eps_init, eps_decay, eps_min, max_t, num_frame=1, constant=0):
    """Train ``agent`` on ``env`` and validate after every episode.

    Each episode runs for at most ``max_t`` steps with epsilon-greedy actions.
    Transitions go into ``agent.memory``; every fifth step (once the memory
    holds at least ``agent.bs`` transitions) the agent learns and soft-updates
    its target network. After each episode one validation run is made on the
    notebook-level ``env_test`` and a progress line is printed.

    Args:
        env: Training environment (``reset()``, ``step(action)``, ``asset``).
        agent: Agent exposing ``act``, ``learn``, ``soft_update``, ``memory``,
            ``bs`` and ``tau``.
        num_episode: Number of episodes to run.
        eps_init: Starting exploration rate.
        eps_decay: Factor applied to the exploration rate after each episode.
        eps_min: Lower bound for the exploration rate.
        max_t: Step cap per episode.
        num_frame: Number of most recent frames stacked into one state.
        constant: Accepted but not read by this function.

    Returns:
        List with the total reward of every training episode.
    """

    def stacked(frames):
        # (num_frame, ...) stack with a leading batch axis of size 1.
        return np.expand_dims(np.stack(frames, axis=0), axis=0)

    episode_rewards = []
    running_reward = []
    val_rewards = []
    running_val_reward = []
    val_sharpes = []
    running_sharpe = []
    epsilon = eps_init

    for episode in range(1, num_episode + 1):
        first_frame = env.reset()
        # Start with the window filled by copies of the first observation.
        window = deque([first_frame] * num_frame, maxlen=num_frame)
        state = stacked(window)
        total_reward = 0
        finished = False
        step = 0

        while not finished and step < max_t:
            step += 1
            action = agent.act(state, epsilon)
            frame, reward, finished = env.step(action)
            window.append(frame)
            following = stacked(window)
            agent.memory.append((state, action, reward, following, finished))

            enough_samples = len(agent.memory) >= agent.bs
            if step % 5 == 0 and enough_samples:
                agent.learn()
                agent.soft_update(agent.tau)

            state = following.copy()
            total_reward += reward

        episode_rewards.append(total_reward)
        running_reward.append(np.mean(episode_rewards[-100:]))

        val_asset, val_reward, val_sharpe = validation(env_test, agent)
        val_rewards.append(val_reward)
        running_val_reward.append(np.mean(val_rewards[-100:]))
        val_sharpes.append(val_sharpe.values[0])
        running_sharpe.append(np.mean(val_sharpes[-100:]))

        progress = '\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}, valReward {:.3f}, val Average Reward {:.3f}, Asset {:.2f}, Validation Asset {:.2f}, Average Validation Sharpe {:.2f}'.format(
            episode,
            total_reward,
            running_reward[-1],
            val_reward,
            running_val_reward[-1],
            env.asset,
            val_asset,
            running_sharpe[-1],
        )
        print(progress, end='')
        # Break the carriage-return line every 100 episodes to keep a history.
        if episode % 100 == 0:
            print()

        epsilon = max(epsilon * eps_decay, eps_min)

    return episode_rewards

In [7]:
# codes_dict = dict(zip(codes, range(len(codes))))
# train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

In [8]:
# Map every stock code to its position in `codes`.
codes_dict = dict(zip(codes, range(len(codes))))
# Train one fresh agent per stock code. `time_period` is set in the first cell and must be
# defined before this cell runs.
for code in codes:
    print(code, ' Begins')
    print('---------------------------------------------')
    env = Stock_Env(1000000, stock_df_train, 0.001, time = [x[0] for x in stock_df_train.index], record = stock_df_train_, codes_dict=codes_dict, train=True, code=code, time_period = time_period, codes=codes)
    # Rebinds the notebook-level `env_test`, which train() reads for its validation runs.
    env_test = Stock_Env(1000000, stock_df_test, 0.001, time = [x[0] for x in stock_df_test.index], record = stock_df_test_, codes_dict=codes_dict, train=False, code=code, time_period = time_period,  codes=codes)
    agent = Agent(2*3, env.action_space, 64, 0.001, 0.001, 0.99, 51, -0.1, 0.1, 'cuda', True,personality = 1)
    # The list of episode rewards returned by train() is discarded here.
    train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

AAPL  Begins
---------------------------------------------


NameError: name 'time_period' is not defined

In [None]:
# Display the current value of time_period (assigned in the first cell).
time_period