In [92]:
#prepare env
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from custom_trading_env import TradingEnv
from utils import device
import DQNTradingAgent.dqn_agent as dqn_agent
from custom_hyperparameters import hyperparams
from arguments import argparser
from collections import deque, namedtuple

In [3]:
class Args:
    """Plain container for the run settings used by this notebook.

    The same object is handed to `TradingEnv` as `custom_args`, so attribute
    names must stay as they are.
    """

    def __init__(self):
        self.agent_num = 1
        self.device_num = 0       # CUDA device index used to build `device` below
        self.save_num = 1         # selects the `saves/<save_num>` output directory
        self.risk_aversion = 1.   # feeds `risk_aversion_multiplier` below
        self.n_episodes = 1000
        self.i = 1.
        self.fee = .0001
        self.render = False


# The class has its own name so the instance no longer shadows it
# (the original `args = args()` made the class unreachable after this cell).
args = Args()
# device_num, save_num, risk_aversion, n_episodes, fee

# Use the requested GPU when CUDA is available; otherwise fall back to the CPU
# instead of failing later when the first tensor is moved.
device = torch.device("cuda:{}".format(args.device_num) if torch.cuda.is_available() else "cpu")
dqn_agent.set_device(device)

save_location = 'saves/{}'.format(args.save_num)

# `exist_ok=True` avoids the check-then-create race of exists() + makedirs().
os.makedirs(save_location, exist_ok=True)

save_interval  = 200
print_interval = 1

n_episodes   = args.n_episodes
sample_len   = 480
obs_data_len = 192
step_len     = 1
fee          = args.fee
sell_at_end  = False

# Negative rewards are multiplied by this factor in the training loop.
risk_aversion_multiplier = 0.5 + args.risk_aversion / 2

n_action_intervals = 5

init_budget = 1

# Keep a copy of the hyperparameters next to the run's outputs.
torch.save(hyperparams, os.path.join(save_location, "hyperparams.pth"))

# Forward-fill gaps in the training data. `DataFrame.ffill()` replaces the
# deprecated `fillna(method='ffill', inplace=True)` and avoids in-place mutation.
df = pd.read_hdf('dataset/binance_data_train.h5', 'STW').ffill()

In [5]:
# Build the trading environment on the training frame. The starting budget comes
# from `init_budget` (configuration cell) rather than a second hard-coded literal.
env = TradingEnv(
    custom_args=args,
    env_id='custom_trading_env',
    obs_data_len=obs_data_len,
    step_len=step_len,
    sample_len=sample_len,
    df=df,
    fee=fee,
    initial_budget=init_budget,
    n_action_intervals=n_action_intervals,
    deal_col_name='c',
    sell_at_end=sell_at_end,
    feature_names=['o', 'h', 'l', 'c', 'v',
                   'num_trades', 'taker_base_vol'],
)

[2019-07-24 13:54:57,722] Making new env: custom_trading_env


## LSTM practice

In [31]:
# Toy layer and batch used to inspect recurrent-layer output shapes.
# `lstm` must be defined here: a later cell calls it, and with this line commented
# out the notebook only ran thanks to leftover kernel state.
lstm = nn.LSTM(3, 6, batch_first=True)  # input_size=3, hidden_size=6
inputs = torch.randn(5, 2, 3)           # batch_first layout: (batch=5, seq_len=2, input_size=3)

In [40]:
# Single-layer GRU mapping 3 input features to a 6-dimensional hidden state, batch-first.
gru = nn.GRU(input_size=3, hidden_size=6, batch_first=True)

In [43]:
# Run the GRU over the toy batch and inspect the per-step outputs.
out, hidden = gru(inputs)
out.shape  # (batch, seq_len, hidden_size)

torch.Size([5, 2, 6])

In [44]:
hidden.shape  # final hidden state: (num_layers * num_directions, batch, hidden_size)

torch.Size([1, 5, 6])

In [45]:
# hidden = (torch.zeros(1,5,6),torch.zeros(1,5,6)) # (h_0, c_0), each (num_layers, batch_size, hidden_size)
# lstm(inputs, hidden )
# Without an explicit initial state the LSTM returns (output, (h_n, c_n)).
out, hidden = lstm(inputs)

In [46]:
out.shape # (batch, timestep, feature), matching the batch-first layout of `inputs`

torch.Size([5, 2, 6])

In [47]:
hidden[1].shape # cell state c_n: (num_layers, batch, feature) -- the middle axis is the batch (5), not the timestep

torch.Size([1, 5, 6])

## Attention

In [48]:
class MultiHeadedAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Most part of the implementation from http://nlp.seas.harvard.edu/2018/04/03/attention
    """

    def __init__(self, h, d_model):
        """Take in the number of heads `h` and the model size `d_model` (must be divisible by `h`)."""
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h  # per-head width; d_v is assumed equal to d_k
        self.h = h
        self.attn = None         # attention weights of the most recent forward pass

        self.query_linear  = nn.Linear(d_model, d_model)
        self.key_linear    = nn.Linear(d_model, d_model)
        self.value_linear  = nn.Linear(d_model, d_model)
        self.output_linear = nn.Linear(d_model, d_model)

    def forward(self, query, key, value):
        """Project the inputs, attend per head, and merge the heads back to `d_model`."""
        batch = query.size(0)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, h, seq, d_k)
            return projection(tensor).view(batch, -1, self.h, self.d_k).transpose(1, 2)

        per_head_q = split_heads(self.query_linear, query)
        per_head_k = split_heads(self.key_linear, key)
        per_head_v = split_heads(self.value_linear, value)

        context, self.attn = self.attention(per_head_q, per_head_k, per_head_v)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k), then the final linear map.
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)
        return self.output_linear(merged)

    @staticmethod
    def attention(query, key, value):
        """Compute scaled dot-product attention; returns (weighted values, attention weights)."""
        scale = math.sqrt(query.size(-1))
        weights = F.softmax(torch.matmul(query, key.transpose(-2, -1)) / scale, dim=-1)
        return torch.matmul(weights, value), weights

In [68]:
#test attention
class AttentionFFLayer(nn.Module):
    """Self-attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer is applied to a layer-normalised copy of its input and the
    result is added back to that input (residual connection).
    """

    def __init__(self, in_out_size, h=8):
        assert in_out_size % h == 0, "The input size must be divisible by the number of attention heads `h`."
        super().__init__()
        self.in_out_size = in_out_size
        self.h = h

        expanded_size = in_out_size * 4  # width of the feed-forward hidden layer
        self.norm0 = nn.LayerNorm(in_out_size)
        self.attn  = MultiHeadedAttention(h, in_out_size)
        self.norm1 = nn.LayerNorm(in_out_size)
        self.fc    = nn.Sequential(
            nn.Linear(in_out_size, expanded_size),
            nn.ReLU(),
            nn.Linear(expanded_size, in_out_size)
        )

    def forward(self, x):
        # The same normalised tensor serves as `query`, `key`, and `value`.
        normed = self.norm0(x)
        attended = x + self.attn(normed, normed, normed)
        return attended + self.fc(self.norm1(attended))

class FinalFFAttentionLayer(nn.Module):
    """Collapse a sequence to one vector by letting its last step attend over the whole sequence."""

    def __init__(self, in_out_size, h=8):
        assert in_out_size % h == 0, "The input size must be divisible by the number of attention heads `h`."
        super().__init__()
        self.in_out_size = in_out_size
        self.h = h

        self.norm0 = nn.LayerNorm(in_out_size)
        self.fc    = nn.Linear(in_out_size, in_out_size)
        self.norm1 = nn.LayerNorm(in_out_size)
        self.attn  = MultiHeadedAttention(h, in_out_size)
        self.norm2 = nn.LayerNorm(in_out_size)

    def forward(self, memory):
        """Return one vector per batch element from a 3-dimensional `memory` tensor."""
        assert memory.dim() == 3
        # Keep the sequence axis (length 1) so the last step can act as the attention query.
        last_step = memory[:, -1:]
        last_step = last_step + self.fc(self.norm0(last_step))
        last_step = last_step + self.attn(self.norm1(last_step), memory, memory)
        # Drop the length-1 sequence axis after the final normalisation.
        return self.norm2(last_step).squeeze(1)


In [69]:
from DQNTradingAgent.default_hyperparameters import SEED, N_ATOMS, INIT_SIGMA, LINEAR, FACTORIZED
from DQNTradingAgent.default_hyperparameters import SEED, BUFFER_SIZE, BATCH_SIZE, START_SINCE,\
                                    GAMMA, T_UPDATE, TAU, LR, WEIGHT_DECAY, UPDATE_EVERY,\
                                    A, INIT_BETA, P_EPS, N_STEPS, V_MIN, V_MAX,\
                                    CLIP, N_ATOMS, INIT_SIGMA, LINEAR, FACTORIZED

In [79]:
class NoisyLinear(nn.Module):
    """Linear layer whose weight and bias are perturbed by Gaussian noise with learnable scale.

    `weight`/`bias` hold the mean values and `noisy_weight`/`noisy_bias` the noise
    scales. Fresh noise is drawn on every forward call while `self.noise` is true.
    """

    def __init__(self, in_features, out_features, bias=True, initial_sigma=INIT_SIGMA, factorized=FACTORIZED):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.initial_sigma = initial_sigma
        self.factorized = factorized

        weight_shape = (out_features, in_features)
        self.weight = nn.Parameter(torch.Tensor(*weight_shape))
        self.noisy_weight = nn.Parameter(torch.Tensor(*weight_shape))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_features))
            self.noisy_bias = nn.Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
            self.register_parameter('noisy_bias', None)
        self.reset_parameters()

        # Noise is enabled by default.
        self.noise = True

    def reset_parameters(self):
        """Initialise the mean parameters uniformly and the noise scales to a constant."""
        fan_in = self.weight.size(1)
        if self.factorized:
            root_fan_in = math.sqrt(fan_in)
            bound = 1 / root_fan_in
            sigma = self.initial_sigma / root_fan_in
        else:
            bound = math.sqrt(3 / fan_in)
            sigma = self.initial_sigma

        nn.init.constant_(self.noisy_weight, sigma)
        nn.init.uniform_(self.weight, -bound, bound)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -bound, bound)
            nn.init.constant_(self.noisy_bias, sigma)

    @staticmethod
    def _signed_sqrt(noise):
        """Map x -> sign(x) * sqrt(|x|), as used for factorized noise."""
        return noise.sign().mul(noise.abs().sqrt())

    def forward(self, input):
        # Deterministic path: behave like an ordinary linear layer.
        if not self.noise:
            return F.linear(input, self.weight, self.bias)

        if self.factorized:
            # One noise vector per input and per output unit; their outer product perturbs the weight.
            noise_device = self.noisy_weight.device
            input_noise = self._signed_sqrt(torch.randn(1, self.noisy_weight.size(1), device=noise_device))
            output_noise = self._signed_sqrt(torch.randn(self.noisy_weight.size(0), device=noise_device))
            weight_noise = input_noise.mul(output_noise.unsqueeze(1))
            bias_noise = output_noise
        else:
            # Independent noise for every weight (and bias) entry.
            weight_noise = torch.randn_like(self.noisy_weight)
            bias_noise = torch.randn_like(self.noisy_bias) if self.bias is not None else None

        perturbed_weight = self.weight.add(self.noisy_weight.mul(weight_noise))
        if self.bias is None:
            perturbed_bias = None
        else:
            perturbed_bias = self.bias.add(self.noisy_bias.mul(bias_noise))
        return F.linear(input, perturbed_weight, perturbed_bias)

    def extra_repr(self):
        has_bias = self.bias is not None
        return 'in_features={}, out_features={}, bias={}, initial_sigma={}, factorized={}'.format(
            self.in_features, self.out_features, has_bias, self.initial_sigma, self.factorized
        )


In [80]:
class QNetwork(nn.Module):
    """Dueling, distributional Q-network over a window of observations.

    Pipeline: per-step feature embedding -> bidirectional LSTM -> attention
    stack -> separate state-value and advantage heads, combined into one set of
    distribution logits per action.
    """

    def __init__(self, action_size, obs_len, num_features=16, n_atoms=N_ATOMS, linear_type=LINEAR, initial_sigma=INIT_SIGMA, factorized=FACTORIZED):
        """Initialize parameters and build model.
        Params
        ======
            action_size (int): Dimension of each action
            obs_len (int): Length of the observation window (stored only)
            num_features (int): Number of features in the state
            n_atoms (int): number of support atoms
            linear_type (str): type of linear layers ('linear', 'noisy');
                surrounding whitespace and case are ignored
            initial_sigma (float): initial weight value for noise parameters
                when using noisy linear layers
            factorized (bool): whether noisy layers use factorized noise
        """
        super(QNetwork, self).__init__()
        self.action_size = action_size
        self.obs_len = obs_len
        self.num_features = num_features
        self.n_atoms = n_atoms
        # Normalise the same way `Agent` validates the value (`strip().lower()`),
        # so a padded string such as ' noisy ' does not raise KeyError below.
        self.linear_type = linear_type.strip().lower()
        self.factorized = bool(factorized)

        def noisy_layer(in_features, out_features):
            return NoisyLinear(in_features, out_features, True, initial_sigma, factorized)
        linear = {'linear': nn.Linear, 'noisy': noisy_layer}[self.linear_type]

        # Learnable initial LSTM states; first axis is 2 for the two directions.
        self.init_hidden = nn.Parameter(data=torch.randn(2, 1, 64), requires_grad=True)
        self.init_cell   = nn.Parameter(data=torch.randn(2, 1, 64), requires_grad=True)

        # Input: N * obs_len * num_features, flattened to (N * obs_len) * num_features
        self.fc0 = nn.Sequential(
            nn.BatchNorm1d(num_features),
            nn.Linear(num_features, 64)
            )
        # Reshaped back to N * obs_len * 64
        self.lstm  = nn.LSTM(64, 64, batch_first=True, bidirectional=True)
        # N * obs_len * 128
        self.inner_layers = nn.Sequential(
            AttentionFFLayer(128),
            AttentionFFLayer(128),
            AttentionFFLayer(128),
            FinalFFAttentionLayer(128)
        )
        # N * 128
        self.fc_s = nn.Sequential(
            linear(128, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            linear(64, n_atoms)
        )
        self.fc_a = nn.Sequential(
            linear(128, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            linear(64, action_size * n_atoms)
        )

    def forward(self, x):
        """Build a network that maps state -> action values.

        `x` is N * obs_len * num_features; the result is
        N * action_size * n_atoms of unnormalised distribution weights.
        """
        # Embed every time step independently, then restore the sequence axis.
        x = self.fc0(x.view(-1, x.size(-1))).view(x.size(0), x.size(1), -1)
        # Broadcast the learned initial states across the batch.
        x, _ = self.lstm(x, (self.init_hidden.repeat(1, x.size(0), 1), self.init_cell.repeat(1, x.size(0), 1)))

        x = self.inner_layers(x)

        state_value = self.fc_s(x)

        advantage_values = self.fc_a(x)
        advantage_values = advantage_values.view(advantage_values.size()[:-1] + (self.action_size, self.n_atoms))

        # Dueling combination: value plus mean-centred advantage, per atom.
        dist_weights = state_value.unsqueeze(dim=-2) + advantage_values - advantage_values.mean(dim=-2, keepdim=True)

        return dist_weights

    def noise(self, enable):
        """Enable or disable noise in every `NoisyLinear` layer of the network."""
        enable = bool(enable)
        # `modules()` walks the whole tree. The noisy layers live inside the
        # `fc_s` / `fc_a` Sequential containers, so `children()` (direct
        # children only) never reached them and the toggle was a no-op.
        for m in self.modules():
            if isinstance(m, NoisyLinear):
                m.noise = enable

In [83]:
## Agent
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, action_size, obs_len, num_features=16, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE,
                 tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, priority_eps=P_EPS,
                 a=A, initial_beta=INIT_BETA, n_multisteps=N_STEPS,
                 v_min=V_MIN, v_max=V_MAX, clip=CLIP, n_atoms=N_ATOMS,
                 initial_sigma=INIT_SIGMA, linear_type=LINEAR, factorized=FACTORIZED, **kwds):
        """Initialize an Agent object.

        Params
        ======
            action_size (int): dimension of each action
            obs_len (int): length of the observation window
            num_features (int): number of features in the state
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            n_multisteps (int): number of steps to consider for each experience
            v_min (float): minimum reward support value
            v_max (float): maximum reward support value
            clip (float): gradient norm clipping (`None` to disable)
            n_atoms (int): number of atoms in the discrete support distribution
            initial_sigma (float): initial noise parameter weights
            linear_type (str): one of ('linear', 'noisy'); type of linear layer to use
            factorized (bool): whether to use factorized gaussian noise in noisy layers
            **kwds: any other keyword arguments are reported and ignored
        """
        if kwds:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(action_size, int)
        assert isinstance(obs_len, int)
        assert isinstance(num_features, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        assert isinstance(n_multisteps, int) and n_multisteps > 0
        assert isinstance(v_min, (int, float)) and isinstance(v_max, (int, float)) and v_min < v_max
        if clip: assert isinstance(clip, (int, float)) and clip >= 0
        assert isinstance(n_atoms, int) and n_atoms > 0
        assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0
        assert isinstance(linear_type, str) and linear_type.strip().lower() in ('linear', 'noisy')
        assert isinstance(factorized, bool)

        # Seed every random source used by the agent and its networks.
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.action_size         = action_size
        self.obs_len             = obs_len
        self.num_features        = num_features
        self.seed                = seed
        self.batch_size          = batch_size
        self.buffer_size         = buffer_size
        self.start_since         = start_since
        self.gamma               = gamma
        self.target_update_every = target_update_every
        self.tau                 = tau
        self.lr                  = lr
        self.weight_decay        = weight_decay
        self.update_every        = update_every
        self.priority_eps        = priority_eps
        self.a                   = a
        self.beta                = initial_beta
        self.n_multisteps        = n_multisteps
        self.v_min               = v_min
        self.v_max               = v_max
        self.clip                = clip
        self.n_atoms             = n_atoms
        self.initial_sigma       = initial_sigma
        self.linear_type         = linear_type.strip().lower()
        self.factorized          = factorized

        # Distribution: evenly spaced support atoms between v_min and v_max.
        self.supports = torch.linspace(v_min, v_max, n_atoms, device=device)
        self.delta_z  = (v_max - v_min) / (n_atoms - 1)

        # Q-Network: the target network starts as an exact copy of the local one.
        self.qnetwork_local  = QNetwork(action_size, obs_len, num_features, n_atoms, linear_type, initial_sigma, factorized).to(device)
        self.qnetwork_target = QNetwork(action_size, obs_len, num_features, n_atoms, linear_type, initial_sigma, factorized).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(buffer_size, batch_size, n_multisteps, gamma, a, False)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, on schedule, learn and update the target network."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, target_discount, is_weights, indices = self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, target_discount)
                self.memory.update_priorities(indices, new_priorities)

        # update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate in eval mode, then restore train mode if it was active.
        if self.qnetwork_local.training:
            switched = True
            self.qnetwork_local.eval()
        else:
            switched = False
        with torch.no_grad():
            z_probs       = F.softmax(self.qnetwork_local(state), dim=-1)
            # Expected value of each action's distribution over the support atoms.
            action_values = self.supports.mul(z_probs).sum(dim=-1, keepdim=False)
        if switched:
            self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor for the target max-Q value

        Returns
        =======
            new_priorities (np.ndarray): new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            rows         = tuple(range(next_states.size(0)))
            # Double-DQN style: the local network picks the next action...
            a_argmax     = F.softmax(self.qnetwork_local(next_states), dim=2)\
                               .mul(self.supports)\
                               .sum(dim=2, keepdim=False)\
                               .argmax(dim=1, keepdim=False)
            # ...and the target network supplies that action's distribution.
            p            = F.softmax(self.qnetwork_target(next_states)[rows, a_argmax], dim=1)
            tz_projected = torch.clamp(rewards + (1 - dones) * gamma * self.supports, min=self.v_min, max=self.v_max)
            # Project the shifted atoms back onto the fixed support by splitting each
            # atom's probability between its two neighbouring support positions.
            b            = (tz_projected - self.v_min) / self.delta_z
            u            = b.ceil()
            l            = b.floor()
            u_updates    = b - l + u.eq(l).type(u.dtype) # fixes the problem when having b == u == l
            l_updates    = u - b
            indices_flat = torch.cat((u.long(), l.long()), dim=1)
            # Offset each row's atom indices so they address a flattened (batch * n_atoms) vector.
            indices_flat = indices_flat.add(
                               torch.arange(start=0,
                                            end=b.size(0) * b.size(1),
                                            step=b.size(1),
                                            dtype=indices_flat.dtype,
                                            layout=indices_flat.layout,
                                            device=indices_flat.device).unsqueeze(1)
                           ).view(-1)
            updates_flat = torch.cat((u_updates.mul(p), l_updates.mul(p)), dim=1).view(-1)
            target_distributions = torch.zeros_like(p)
            target_distributions.view(-1).index_add_(0, indices_flat, updates_flat)

        # Logits of the actions that were actually taken.
        pred_distributions = self.qnetwork_local(states)
        pred_distributions = pred_distributions.gather(dim=1, index=actions.view(-1, 1, 1).expand(-1, -1, pred_distributions.size(2))).squeeze(1)

        # Alternative loss kept for reference (cross-entropy instead of KL divergence):
        #   cross_entropy = target_distributions.mul(pred_distributions.exp().sum(dim=-1, keepdim=True).log() - pred_distributions).sum(dim=-1, keepdim=False)
        #   new_priorities = cross_entropy.detach().add(self.priority_eps).cpu().numpy()
        #   loss = cross_entropy.mul(is_weights.view(-1)).mean()
        # `reduction='none'` replaces the deprecated `reduce=False` argument.
        kl_divergence = F.kl_div(F.log_softmax(pred_distributions, dim=-1), target_distributions, reduction='none').sum(dim=-1, keepdim=False)
        new_priorities = kl_divergence.detach().add(self.priority_eps).cpu().numpy()
        loss = kl_divergence.mul(is_weights.view(-1)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def to(self, device):
        """Move both networks and the support atoms to `device`; returns the agent.

        `act` and the replay buffer still build their tensors on the module-level
        `device`, so that global must point at the same device.
        """
        self.qnetwork_local  = self.qnetwork_local.to(device)
        self.qnetwork_target = self.qnetwork_target.to(device)
        # The supports are combined with network outputs in `act` and `learn`,
        # so they must follow the networks to the new device.
        self.supports        = self.supports.to(device)
        return self

In [88]:
class ReplayBuffer:
    """Fixed-size prioritized buffer of multi-step experience tuples.

    Priorities (already raised to the power `a`) are kept in a binary sum tree
    stored in `self.tree`: index 1 is the root, and the leaf for memory slot `i`
    is at `self._memory_start_idx + i`.
    """

    def __init__(self, buffer_size, batch_size, n_multisteps, gamma, a, separate_experiences):
        """Initialize a ReplayBuffer object.

        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            n_multisteps (int): number of time steps to consider for each experience
            gamma (float): discount factor
            a (float): priority exponent parameter
            separate_experiences (bool): whether to store experiences with no overlap
        """
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.n_multisteps = n_multisteps
        self.gamma = gamma
        self.a = a
        self.separate_experiences = bool(separate_experiences)

        self.memory_write_idx = 0
        self._non_leaf_depth = math.ceil(math.log2(buffer_size))
        self._memory_start_idx = 2 ** self._non_leaf_depth
        self._buffer_is_full = False
        self.memory = [None for _ in range(buffer_size)]
        self.priorities_a = np.zeros(buffer_size)
        self.tree = np.zeros(self._memory_start_idx + buffer_size) # starts from index 1, not 0; makes implementation easier and reduces many small computations

        self.multistep_collector = deque(maxlen=n_multisteps)
        self.max_priority_a = 1.
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

        # Powers of two used to find all ancestors of a leaf in one vectorised step.
        self._divisors = np.power(2, np.arange(1, self._non_leaf_depth + 1)).reshape((-1, 1))
        self._discounts = np.power(self.gamma, np.arange(self.n_multisteps + 1))
        self._target_discount = float(self._discounts[-1])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory.

        A slot is written only once `n_multisteps` consecutive steps have been
        collected; new entries receive the current maximum priority.
        """
        e = self.experience(state, action, reward, next_state, done)
        self.multistep_collector.append(e)
        if len(self.multistep_collector) == self.n_multisteps:
            self.memory[self.memory_write_idx] = tuple(self.multistep_collector)

            delta_priority_a = self.max_priority_a - self.priorities_a[self.memory_write_idx]
            tree_idx = self._memory_start_idx + self.memory_write_idx
            self.priorities_a[self.memory_write_idx] = self.max_priority_a
            self.tree[tree_idx] = self.max_priority_a
            # Propagate the priority change to every ancestor up to the root.
            for _ in range(self._non_leaf_depth):
                tree_idx = tree_idx // 2
                self.tree[tree_idx] += delta_priority_a

            self.memory_write_idx += 1
            if self.memory_write_idx >= self.buffer_size:
                self._buffer_is_full = True
                self.memory_write_idx = 0

            if self.separate_experiences:
                self.multistep_collector.clear()
        if done:
            # Never let a multi-step window span an episode boundary.
            self.multistep_collector.clear()

    def sample(self, beta):
        """Randomly sample a batch of experiences from memory.

        Params
        ======
            beta (int or float): parameter used for calculating importance-priority weights

        Returns
        =======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            target_discount (float): discount factor for target max-Q value
            is_weights (torch.Tensor): tensor of importance-sampling weights
            indices (np.ndarray): sample indices

        Raises
        ======
            IndexError: if the buffer holds no experience yet"""
        n_stored = len(self)
        if n_stored == 0:
            raise IndexError("Cannot sample from an empty replay buffer.")

        # Stratified sampling: split the total priority mass into `batch_size`
        # equal segments and draw one value from each. The width is computed
        # directly (valid for batch_size == 1) and in float64 like the tree.
        segment_width = self.tree[1] / self.batch_size
        segment_starts = np.arange(self.batch_size, dtype=np.float64) * segment_width

        while True:
            sample_values = segment_starts + np.random.rand(self.batch_size) * segment_width
            tree_indices = np.ones(self.batch_size, dtype=np.int64)
            try:
                # Walk from the root to a leaf, going right whenever the value
                # exceeds the priority mass of the left subtree.
                for _ in range(self._non_leaf_depth):
                    left_child_indices = tree_indices * 2
                    left_sums = self.tree[left_child_indices]
                    go_right = sample_values > left_sums
                    sample_values = np.where(go_right, sample_values - left_sums, sample_values)
                    tree_indices = np.where(go_right, left_child_indices + 1, left_child_indices)
            except IndexError:
                # Floating-point round-off can push a value past the total mass and
                # into tree positions that do not exist; redraw in that case.
                continue
            indices = tree_indices - self._memory_start_idx
            # Also redraw if round-off selected a slot that holds no experience, so
            # the batch always lines up with `indices` and `is_weights`.
            if np.all(indices < n_stored):
                break

        # experiences[k] holds the k-th step of every sampled multi-step window.
        experiences = tuple(zip(*[self.memory[i] for i in indices]))

        first_states = torch.tensor([e[0] for e in experiences[0]], dtype=torch.float, device=device)
        actions      = torch.tensor([e[1] for e in experiences[0]], dtype=torch.long, device=device)
        # Discounted sum of the rewards inside each window.
        rewards      = torch.tensor(
                           np.sum(
                               np.multiply(
                                   np.array([[e[2] for e in experiences_step] for experiences_step in experiences]).transpose(), self._discounts[:-1]
                               ), axis=1, keepdims=True
                           ), dtype=torch.float, device=device)
        last_states  = torch.tensor([e[3] for e in experiences[-1]], dtype=torch.float, device=device)
        dones        = torch.tensor([e[4] for e in experiences[-1]], dtype=torch.float, device=device).view(-1, 1)

        # Importance-sampling weights (N * P(i)) ** -beta, normalised by their maximum.
        is_weights = np.divide(self.priorities_a[indices], self.tree[1])
        is_weights = np.power(np.multiply(is_weights, n_stored), -beta)
        is_weights = torch.tensor(np.divide(is_weights, np.max(is_weights)).reshape((-1, 1)), dtype=torch.float, device=device)

        return (first_states, actions, rewards, last_states, dones), self._target_discount, is_weights, indices

    def update_priorities(self, indices, new_priorities):
        """Update the priorities for the experiences of given indices to the given new values.

        Params
        ======
            indices (array_like): indices of experience priorities to update
            new_priorities (array-like): new priority values for given indices"""
        # Remove Duplicate Samples (discard except the first occurence in the array)
        indices, idx_indices = np.unique(indices, return_index=True)
        # `np.asarray` lets plain lists be indexed with an index array as well.
        new_priorities_a = np.power(np.asarray(new_priorities)[idx_indices], self.a)

        delta_priority_a = np.subtract(new_priorities_a, self.priorities_a[indices])
        tree_indices = np.add(indices, self._memory_start_idx)
        self.priorities_a[indices] = new_priorities_a
        self.tree[tree_indices] = new_priorities_a
        # Ancestors of every updated leaf, level by level; `np.add.at` accumulates
        # correctly when several leaves share an ancestor.
        tree_indices = np.floor_divide(tree_indices, self._divisors).reshape((-1,))
        np.add.at(self.tree, tree_indices, np.tile(delta_priority_a, self._non_leaf_depth))

        self.max_priority_a = np.max(self.priorities_a)

    def reset_multisteps(self):
        """Discard any partially collected multi-step window."""
        self.multistep_collector.clear()

    def __len__(self):
        """Return the current size of internal memory."""
        return self.buffer_size if self._buffer_is_full else self.memory_write_idx

In [93]:
#define model

# One action per interval on either side plus one more (2 * n + 1 in total); the
# feature count is read from the last axis of a freshly reset observation.
agent = Agent(
    action_size=2 * n_action_intervals + 1,
    obs_len=obs_data_len,
    num_features=env.reset().shape[-1],
    **hyperparams
)



  self.price = self.df_sample[self.price_name].as_matrix()
  self.obs_features = self.df_sample[self.using_feature].as_matrix()


In [105]:
# Importance-sampling exponent: starts at 0.4 and is raised by `beta_inc` after
# every episode, capped at 1 by the training loop.
beta = 0.4
beta_inc = (1 - beta) / 1000
agent.beta = beta

# Bookkeeping for the training run.
scores_list, loss_list = [], []
n_epi = 0

In [106]:
# Train: each pass of the outer loop plays one full episode.
for i_episode in range(n_episodes):
    n_epi += 1

    # Every 500 episodes, lengthen the sampled window by another 480 steps.
    if (i_episode + 1) % 500 == 0:
        sample_len += 480
        env.sample_len = sample_len

    state = env.reset()
    score = 0.
    actions = []
    rewards = []

    done = False
    while not done:
        action = int(agent.act(state, eps=0.))
        next_state, reward, done, _ = env.step(action)

        # The score uses the raw reward; only the reward the agent learns from
        # is scaled by the risk-aversion multiplier when negative.
        rewards.append(reward)
        score += reward
        if reward < 0:
            reward *= risk_aversion_multiplier
        if sell_at_end and done:
            action = 2 * n_action_intervals
        actions.append(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state

    # Start the next episode with an empty multi-step window. The original placed
    # this call in the `else` of a `while True` loop that is only left through
    # `break`, so it could never execute.
    agent.memory.reset_multisteps()

    beta = min(1, beta + beta_inc)
    agent.beta = beta

    scores_list.append(score)

    if n_epi % print_interval == 0:
        print_str = "# of episode: {:d}, avg score: {:.4f}\n  Actions: {}".format(n_epi, sum(scores_list[-print_interval:]) / print_interval, np.array(actions))
        print(print_str)
#         with open(os.path.join(save_location, "output_log.txt"), mode='a') as f:
#             f.write(print_str + '\n')

#     if n_epi % save_interval == 0:
#         torch.save(agent.qnetwork_local.state_dict(), os.path.join(save_location, 'TradingGym_Rainbow_{:d}.pth'.format(n_epi)))
#         torch.save(scores_list, os.path.join(save_location, 'scores.pth'))

# NOTE(review): deleting `env` means this cell cannot be re-run without first
# re-running the cell that constructs the environment.
del env

  self.price = self.df_sample[self.price_name].as_matrix()
  self.obs_features = self.df_sample[self.using_feature].as_matrix()


# of episode: 1, avg score: -0.1267
  Actions: [2 2 4 ... 2 2 2]
# of episode: 2, avg score: -0.1435
  Actions: [2 6 2 ... 2 4 4]
# of episode: 3, avg score: -0.1267
  Actions: [2 2 2 ... 7 5 2]




# of episode: 4, avg score: -0.2716
  Actions: [ 2  4  2 ...  2 10 10]
# of episode: 5, avg score: 0.0439
  Actions: [ 3  3  6 ... 10  5  7]
# of episode: 6, avg score: -0.0513
  Actions: [0 3 0 ... 0 0 0]
# of episode: 7, avg score: 0.2081
  Actions: [0 0 0 ... 0 0 0]


KeyboardInterrupt: 