# Recurrent Deterministic Policy Gradient

RDPG is a natural extension of DDPG in which recurrence is added to the model, so that it can make use of past events and adjust to them.

It is especially beneficial in partially observed environments.

The changes from the DDPG architecture seem to be limited to the memory buffer and the actor-critic architecture.

## Action Noise

In [1]:
import numpy as np

class ActionNoise:
    """Ornstein-Uhlenbeck style exploration noise.

    Each call advances the process by one step of length ``dt``:

        x = x_prev + theta * (mu - x_prev) * dt + sigma * sqrt(dt) * N(0, 1)

    and remembers ``x`` as the starting point of the next step, so
    consecutive samples are correlated.

    Args:
        mu: Mean the process reverts to (array-like; its shape is the shape
            of every sample).
        theta: Strength of the pull towards ``mu``.
        sigma: Scale of the Gaussian perturbation.
        x0: Optional initial state; zeros shaped like ``mu`` when ``None``.
        dt: Step length.
    """

    def __init__(self, mu, theta=0.15, sigma=0.2, x0=None, dt=0.05):
        self.theta = theta
        self.sigma = sigma
        # Coerce to an array so that plain lists work too (``.shape`` is needed below).
        self.mu = np.asarray(mu, dtype=float)
        self.x0 = x0
        self.dt = dt

        self.reset()  # sets x_prev

    def __call__(self):
        """Advance the process one step and return the new sample."""
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        # Without storing the sample, every draw would restart from the
        # initial state and the noise would not be temporally correlated.
        self.x_prev = x
        return x

    def reset(self):
        """Return the process to its initial state (``x0`` or zeros)."""
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

## Replay Buffer

In [2]:
import random
from collections import namedtuple

# One stored environment step; the field order is the positional order expected by ReplayBuffer.add.
Transition = namedtuple('Transition', ('state', 'action', 'done', 'next_state', 'reward'))


class ReplayBuffer(object):
    """Fixed-capacity ring buffer of ``Transition`` tuples.

    Args:
        size: Maximum number of transitions kept. Coerced to ``int`` so that
            values written in float notation (e.g. ``1e6``) are accepted.
        mini_batch_size: Number of transitions returned by ``sample`` and
            ``sample_trajectory``.

    Raises:
        ValueError: If ``size`` is not positive.
    """

    def __init__(self, size, mini_batch_size):
        self.size = int(size)
        if self.size <= 0:
            raise ValueError("ReplayBuffer size must be a positive integer")
        self.mini_batch_size = mini_batch_size
        self.memory = []
        self.position = 0

    def add(self, *args):
        """Add transition to buffer, overwriting the oldest one once full."""
        # Grow the list until capacity is reached; afterwards ``position``
        # wraps around and the write below replaces the oldest entry.
        if len(self.memory) < self.size:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.size

    def sample(self):
        """Get a minibatch of randomly chosen transitions from the buffer."""
        return random.sample(self.memory, self.mini_batch_size)

    def sample_trajectory(self):
        """Get a random batch of ``mini_batch_size`` adjacent buffer entries.

        The start index is drawn from the entries actually stored, not from
        the capacity, so the returned slice always has the full length.

        Raises:
            ValueError: If fewer than ``mini_batch_size`` transitions are stored.
        """
        stored = len(self.memory)
        if stored < self.mini_batch_size:
            raise ValueError("Not enough transitions stored to sample a trajectory")
        start = random.randint(0, stored - self.mini_batch_size)
        # NOTE: once the buffer has wrapped around, a slice that spans the
        # current write position mixes the newest and the oldest entries.
        return self.memory[start:start + self.mini_batch_size]

    def __len__(self):
        return len(self.memory)

    def __repr__(self):
        return "Memory buffer used for learning, takes in a tuple: ('state', 'action', 'done', 'next_state', 'reward')"



## Fan-In Initialization

In [3]:
import torch
import torch.nn as nn


def fan_in_init(tensor, fan_in=None):
    """Fill ``tensor`` in place with samples from U(-1/sqrt(f), 1/sqrt(f)).

    ``f`` is ``fan_in`` when it is given, otherwise the size of the tensor's
    last dimension. The (modified) tensor is returned.
    """
    width = tensor.size(-1) if fan_in is None else fan_in
    bound = 1. / np.sqrt(width)
    return nn.init.uniform_(tensor, -bound, bound)

## Environment Normalization

In [4]:
import gym


class NormalizedEnv(gym.ActionWrapper):
    """Action wrapper that maps actions between [-1, 1] and the env's own bounds.

    ``gym.ActionWrapper`` calls ``action`` on every action before handing it
    to the wrapped environment's ``step``. The rescaling therefore lives in
    ``action``; defining it as ``step`` would replace the environment step
    itself and return the scaled action instead of a transition.
    """

    def __init__(self, env):
        super().__init__(env)

    def action(self, action):
        """Scale an action from [-1, 1] to [action_space.low, action_space.high]."""
        act_k = (self.action_space.high - self.action_space.low) / 2
        act_b = (self.action_space.high + self.action_space.low) / 2

        return act_k * action + act_b

    def reverse_action(self, action):
        """Inverse of ``action``: map an environment-scale action back to [-1, 1]."""
        act_k_inv = 2. / (self.action_space.high - self.action_space.low)
        act_b = (self.action_space.high + self.action_space.low) / 2
        return act_k_inv * (action - act_b)

## Actor Network

In [5]:
class Actor(nn.Module):
    """Recurrent policy network.

    Layer order: fc1 -> LayerNorm -> ReLU -> LSTMCell -> fc2 -> LayerNorm ->
    ReLU -> fc3 -> tanh. The LSTM state used when no explicit hidden state is
    passed to ``forward`` is kept on the module as ``hx`` / ``cx``.

    Args:
        num_inputs: Size of the input (state) vector.
        action_space: Object whose ``shape[0]`` gives the number of action outputs.
        init_w: Half-width of the uniform init range of the output layer weights.
        init_b: Half-width of the uniform init range of the output layer biases.
        hidden1: Width of the first hidden layer and of the LSTM cell.
        hidden2: Width of the second hidden layer.
    """

    def __init__(self, num_inputs, action_space, init_w=3e-3, init_b=3e-4, hidden1=400, hidden2=300):
        super(Actor, self).__init__()

        self.action_space = action_space
        self.num_outputs = action_space.shape[0]
        # Misspelled alias kept so code reading the old attribute name keeps working.
        self.num_ouptuts = self.num_outputs

        # original model has lstm after second layer, but i think it might work better between hidden1 and hidden2

        self.fc1 = nn.Linear(num_inputs, hidden1)
        self.fcn1 = nn.LayerNorm(hidden1)

        self.lstm = nn.LSTMCell(hidden1, hidden1)

        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fcn2 = nn.LayerNorm(hidden2)

        self.fc3 = nn.Linear(hidden2, self.num_outputs)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        # initialize weights
        # NOTE(review): for the 1-D bias tensors fan_in_init falls back to the
        # tensor's own length (the layer's output width) - confirm that is intended.
        self.fc1.weight.data = fan_in_init(self.fc1.weight)
        self.fc1.bias.data = fan_in_init(self.fc1.bias)

        self.fc2.weight.data = fan_in_init(self.fc2.weight)
        self.fc2.bias.data = fan_in_init(self.fc2.bias)

        # The final layer weights and biases are initialized from narrow uniform ranges
        self.fc3.weight.data.uniform_(-init_w, init_w)
        self.fc3.bias.data.uniform_(-init_b, init_b)

        # initialize the lstm state with zeros (empty memory)
        self.cx = torch.zeros(1, hidden1)
        self.hx = torch.zeros(1, hidden1)

    def reset_lstm(self, done=True):
        """Reset or detach the LSTM state held on the module.

        Args:
            done: When true, replace ``hx`` / ``cx`` with zeros of shape
                ``(1, hidden)`` (empty memory). Otherwise keep their values
                but cut them off from the autograd graph.
        """
        if done:
            # The hidden width is read from the current state, so no extra
            # bookkeeping attribute is required.
            hidden = self.hx.shape[-1]
            self.cx = self.hx.new_zeros(1, hidden)
            self.hx = self.hx.new_zeros(1, hidden)
        else:
            self.cx = self.cx.detach()
            self.hx = self.hx.detach()

    def forward(self, x, hidden_states=None):
        """Compute the action for ``x`` and advance the LSTM state.

        Args:
            x: Input state tensor.
            hidden_states: Optional ``(hx, cx)`` pair for the LSTM cell. When
                ``None`` the state stored on the module is used and the
                features are reshaped to that state's shape.

        Returns:
            ``(action, (hx, cx))`` where ``action`` is tanh-squashed and
            ``(hx, cx)`` is the new LSTM state (also stored on the module).
        """
        x = self.fc1(x)
        x = self.fcn1(x)
        x = self.relu(x)

        if hidden_states is None:
            hx, cx = self.lstm(x.reshape(self.hx.shape), (self.hx, self.cx))
        else:
            hx, cx = self.lstm(x, hidden_states)

        self.hx = hx
        self.cx = cx
        x = hx

        # add relu layer here?

        x = self.fc2(x)
        x = self.fcn2(x)
        x = self.relu(x)

        x = self.fc3(x)
        x = self.tanh(x)

        return x, (self.hx, self.cx)

## Critic Network

Should the LSTM layer be added before or after the action has been concatenated?

Here I'm going with after, as it makes more sense.

In [6]:
class Critic(nn.Module):
    """Recurrent Q-network.

    Layer order: fc1 -> LayerNorm -> ReLU -> concat(action) -> fc2 ->
    LayerNorm -> ReLU -> LSTMCell -> fc3. The LSTM state used when no explicit
    hidden state is passed to ``forward`` is kept on the module as ``hx`` / ``cx``.

    Args:
        num_inputs: Size of the input (state) vector.
        action_space: Object whose ``shape[0]`` gives the action size.
        init_w: Half-width of the uniform init range of the output layer weights.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer and of the LSTM cell.
        init_b: Half-width of the uniform init range of the output layer biases.
    """

    def __init__(self, num_inputs, action_space, init_w=3e-3, hidden1=400, hidden2=300, init_b=3e-4):
        super(Critic, self).__init__()
        self.num_inputs = num_inputs
        self.action_space = action_space
        self.num_outputs = action_space.shape[0]

        # Build the architecture of the critic
        self.fc1 = nn.Linear(num_inputs, hidden1)
        self.fcn1 = nn.LayerNorm(hidden1)

        # The action is concatenated to the first layer's features
        self.fc2 = nn.Linear(hidden1 + self.num_outputs, hidden2)
        self.fcn2 = nn.LayerNorm(hidden2)

        self.lstm = nn.LSTMCell(hidden2, hidden2)

        self.fc3 = nn.Linear(hidden2, 1)
        self.relu = nn.ReLU()

        self.fc1.weight.data = fan_in_init(self.fc1.weight)
        self.fc1.bias.data = fan_in_init(self.fc1.bias)

        self.fc2.weight.data = fan_in_init(self.fc2.weight)
        self.fc2.bias.data = fan_in_init(self.fc2.bias)

        self.fc3.weight.data.uniform_(-init_w, init_w)
        self.fc3.bias.data.uniform_(-init_b, init_b)

        # initialize the lstm state with zeros (empty memory)
        self.cx = torch.zeros(1, hidden2)
        self.hx = torch.zeros(1, hidden2)

    def forward(self, state, action, hidden_states=None):
        """Estimate the value of ``action`` in ``state`` and advance the LSTM state.

        Args:
            state: State tensor.
            action: Action tensor with the same leading dimensions as ``state``.
            hidden_states: Optional ``(hx, cx)`` pair for the LSTM cell. When
                ``None`` the state stored on the module is used and the
                features are reshaped to that state's shape.

        Returns:
            ``(q_value, (hx, cx))`` where ``(hx, cx)`` is the new LSTM state
            (also stored on the module).
        """
        x = state
        x = self.fc1(x)
        x = self.fcn1(x)
        x = self.relu(x)

        # Concatenate along the feature (last) dimension: identical to axis 0
        # for un-batched 1-D inputs and also valid for (batch, features) inputs.
        x = self.fc2(torch.cat([x, action], dim=-1))
        x = self.fcn2(x)
        x = self.relu(x)

        if hidden_states is None:
            hx, cx = self.lstm(x.reshape(self.hx.shape), (self.hx, self.cx))
        else:
            hx, cx = self.lstm(x, hidden_states)

        self.hx = hx
        self.cx = cx
        x = hx

        out = self.fc3(x)
        return out, (self.hx, self.cx)

    def reset_lstm(self, done=True):
        """Reset or detach the LSTM state held on the module.

        Args:
            done: When true, replace ``hx`` / ``cx`` with zeros of shape
                ``(1, hidden)`` (empty memory). Otherwise keep their values
                but cut them off from the autograd graph.
        """
        if done:
            # The hidden width is read from the current state, so no extra
            # bookkeeping attribute is required.
            hidden = self.hx.shape[-1]
            self.cx = self.hx.new_zeros(1, hidden)
            self.hx = self.hx.new_zeros(1, hidden)
        else:
            self.cx = self.cx.detach()
            self.hx = self.hx.detach()

In [7]:
def hard_update(target, source):
    """Overwrite every parameter of ``target`` with the matching one of ``source``.

    Parameters are paired in the order given by ``parameters()`` and copied
    in place.
    """
    pairs = zip(target.parameters(), source.parameters())
    for dst, src in pairs:
        dst.data.copy_(src.data)

In [8]:
def soft_update(target, source, tau):
    """Move ``target``'s parameters a fraction ``tau`` towards ``source``'s.

    Each target parameter becomes ``target * (1 - tau) + source * tau``,
    written in place; ``source`` is left untouched.
    """
    keep = 1.0 - tau
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = dst.data * keep + src.data * tau
        dst.data.copy_(blended)

In [9]:
import torch.nn.functional as F
import gc
import os

In [26]:
class RDPG(object):
    """Actor-critic agent with recurrent (LSTM) actor and critic networks.

    Holds the online and target networks, their optimizers, the replay
    buffer, the exploration noise and checkpoint helpers.

    Args:
        num_inputs: Size of the state vector fed to the networks.
        action_space: Object exposing ``shape``, ``low`` and ``high``.
        checkpoint_dir: Directory for checkpoints; ``"./rdpg_saves/"`` when ``None``.
    """

    def __init__(self, num_inputs, action_space, checkpoint_dir=None):
        self.num_inputs = num_inputs
        self.num_outputs = action_space.shape[0]
        self.action_space = action_space

        # Hyperparameters
        # NOTE(review): 10e-4 == 1e-3 and 10e-3 == 1e-2 - confirm these are the intended rates.
        self.lr_actor = 10e-4
        self.lr_critic = 10e-3
        # Kept as an int: the buffer uses it as a list capacity / index bound.
        self.buffer_size = int(10e6)
        self.batch_size = 64
        self.noise_mean = np.zeros(self.num_outputs)
        self.tau = 0.001
        self.gamma = 0.99
        self.weight_decay = 0.01

        # create actor critic networks
        self.actor = Actor(self.num_inputs, action_space)
        self.critic = Critic(self.num_inputs, action_space)

        # create target networks
        self.target_actor = Actor(self.num_inputs, action_space)
        self.target_critic = Critic(self.num_inputs, action_space)

        # ensure that the weights of the targets are the same as the actor critic
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # set up the optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=self.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.lr_critic,
                                             weight_decay=self.weight_decay)

        # create replay buffer and noise
        self.buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.noise = ActionNoise(self.noise_mean)

        # Set the directory to save the models
        if checkpoint_dir is None:
            self.checkpoint_dir = "./rdpg_saves/"
        else:
            self.checkpoint_dir = checkpoint_dir
        os.makedirs(self.checkpoint_dir, exist_ok=True)

    @staticmethod
    def _fit_hidden(hx, cx, batch_size, hidden_size):
        """Return a detached ``(hx, cx)`` pair shaped ``(batch_size, hidden_size)``.

        A state with a single row is repeated for every batch element; a state
        that is missing or has an incompatible shape is replaced by zeros.
        """
        def fit(state):
            if state is not None and state.dim() == 2 and state.shape[1] == hidden_size:
                if state.shape[0] == batch_size:
                    return state.detach()
                if state.shape[0] == 1:
                    return state.detach().expand(batch_size, -1).contiguous()
            return torch.zeros(batch_size, hidden_size)

        return fit(hx), fit(cx)

    def update(self, hx, cx):
        """Run one critic and one actor optimisation step on a sampled minibatch.

        Args:
            hx: Actor LSTM hidden state to start the batch from.
            cx: Actor LSTM cell state to start the batch from.

        Returns:
            ``(value_loss, policy_loss, (next_hx, next_cx))`` where the last
            item is the target actor's LSTM state after the batch.
        """
        batch = Transition(*zip(*self.buffer.sample()))

        state_batch = torch.cat(batch.state).float()
        action_batch = torch.cat(batch.action).float()
        reward_batch = torch.cat(batch.reward).float().unsqueeze(1)
        done_batch = torch.cat(batch.done).float().unsqueeze(1)
        next_state_batch = torch.cat(batch.next_state).float()
        batch_size = state_batch.shape[0]

        # Every forward pass stores its LSTM state on the module; remember the
        # actor's rollout memory so the batched passes below do not clobber it.
        rollout_memory = (self.actor.hx, self.actor.cx)

        # The actor and the critic have LSTM cells of different widths, so each
        # gets a hidden state of its own size. The sampled transitions are not
        # consecutive, hence the critic starts from an empty memory.
        actor_hidden = self._fit_hidden(hx, cx, batch_size, self.actor.lstm.hidden_size)
        critic_hidden = self._fit_hidden(None, None, batch_size, self.critic.lstm.hidden_size)

        # Bootstrapped target; no gradient flows into the target networks.
        with torch.no_grad():
            next_action, (next_hx, next_cx) = self.target_actor(next_state_batch, actor_hidden)
            next_q, _ = self.target_critic(next_state_batch, next_action, critic_hidden)
            exp_values = reward_batch + (1 - done_batch) * self.gamma * next_q

        # Critic step: regress Q(s, a) onto the target values.
        self.critic_optim.zero_grad()
        q_values, _ = self.critic(state_batch, action_batch, critic_hidden)
        value_loss = F.mse_loss(q_values, exp_values)
        value_loss.backward()
        self.critic_optim.step()

        # Actor step: maximise the critic's value of the actor's own actions.
        self.actor_optim.zero_grad()
        policy_action, _ = self.actor(state_batch, actor_hidden)
        policy_q, _ = self.critic(state_batch, policy_action, critic_hidden)
        policy_loss = -policy_q.mean()
        policy_loss.backward()
        self.actor_optim.step()

        self.actor.hx, self.actor.cx = rollout_memory

        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

        return value_loss.item(), policy_loss.item(), (next_hx, next_cx)

    def get_action(self, state, add_noise=True):
        """Return the actor's action for ``state``, optionally with exploration noise.

        The result is a tensor limited element-wise to the action space bounds.
        """
        self.actor.eval()  # evaluation mode while acting

        with torch.no_grad():
            # no autograd bookkeeping is needed for acting
            mu = self.actor(state)[0]

        self.actor.train()  # return actor to train mode, undoes eval mode

        if add_noise:
            # The noise is a NumPy array; convert it before adding it to the tensor.
            mu = mu + torch.as_tensor(self.noise(), dtype=mu.dtype)

        low = torch.as_tensor(self.action_space.low, dtype=mu.dtype)
        high = torch.as_tensor(self.action_space.high, dtype=mu.dtype)
        return torch.max(torch.min(mu, high), low)

    def random_action(self):
        """Return a uniformly random action in [-1, 1] with one entry per action dimension."""
        action = np.random.uniform(-1, 1, self.num_outputs)
        return action

    def set_eval(self):
        # set all networks to evaluation mode
        self.actor.eval()
        self.critic.eval()
        self.target_actor.eval()
        self.target_critic.eval()

    def set_train(self):
        # set all networks to training mode
        self.actor.train()
        self.critic.train()
        self.target_actor.train()
        self.target_critic.train()

    def save(self, last_time, memory=None):
        """Write a checkpoint named after ``last_time`` into ``checkpoint_dir``.

        Args:
            last_time: Timestep stored in, and used to name, the checkpoint.
            memory: Replay memory to store; the agent's own buffer contents
                when ``None``.
        """
        if memory is None:
            memory = self.buffer.memory
        save_path = os.path.join(self.checkpoint_dir, f'ep{last_time}.pth.tar')
        print('Saving...')
        checkpoint = {
            'last_timestep': last_time,
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'actor_target': self.target_actor.state_dict(),
            'critic_target': self.target_critic.state_dict(),
            'actor_optim': self.actor_optim.state_dict(),
            'critic_optim': self.critic_optim.state_dict(),
            'memory': memory
        }
        torch.save(checkpoint, save_path)
        # Garbage collection, reclaims some memory
        gc.collect()
        print(f"Model saved: {last_time},  {save_path}")

    def load(self, path=None):
        """Restore networks and optimizers from a checkpoint.

        Args:
            path: Checkpoint file; the most recent one in ``checkpoint_dir``
                when ``None``.

        Returns:
            ``(timestep, replay_buffer)``: the stored timestep plus one, and
            the stored replay memory.

        Raises:
            FileNotFoundError: If no checkpoint file can be found.
        """
        if path is None:
            path = self.get_path()

        if not os.path.isfile(path):
            raise FileNotFoundError(f"No checkpoint found at {path}")
        print("Loading checkpoint...")

        # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
        checkpoint = torch.load(path)
        timestep = checkpoint['last_timestep'] + 1

        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.target_actor.load_state_dict(checkpoint['actor_target'])
        self.target_critic.load_state_dict(checkpoint['critic_target'])
        self.actor_optim.load_state_dict(checkpoint['actor_optim'])
        self.critic_optim.load_state_dict(checkpoint['critic_optim'])
        replay_buffer = checkpoint['memory']

        gc.collect()
        print('Model Loaded')
        return timestep, replay_buffer

    def get_path(self):
        """Return the absolute path of the most recently created checkpoint file.

        Raises:
            FileNotFoundError: If ``checkpoint_dir`` holds no checkpoint file.
        """
        candidates = [os.path.join(self.checkpoint_dir, name)
                      for name in os.listdir(self.checkpoint_dir)
                      if name.endswith((".pt", "tar"))]
        if not candidates:
            raise FileNotFoundError(f"No checkpoint files in {self.checkpoint_dir}")
        last_file = max(candidates, key=os.path.getctime)
        return os.path.abspath(last_file)



In [27]:
import gym_Boeing
import matplotlib.pyplot as plt

# Build the environment registered under the id 'boeing-danger-v0'
# (presumably registered by the gym_Boeing import above - confirm).
env = gym.make('boeing-danger-v0')
# env = NormalizedEnv(env)

In [28]:
# Create the agent; 3 is passed as num_inputs (presumably the size of the
# environment's observation vector - TODO confirm against the environment).
agent = RDPG(3, action_space=env.action_space)

In [29]:
# Number of evaluation episodes run in each test phase of the training loop.
n_test_cycles = 10
# NOTE(review): not referenced by the visible code - presumably a planned warm-up step count.
warmup = 1000

In [30]:
import time

# Training script: run noisy episodes until the step budget is used up,
# updating the agent after every environment step once the buffer holds more
# than one minibatch, and periodically evaluating without exploration noise.
agent.set_train()

timestep = 1
rewards, policy_losses, value_losses, mean_test_rewards = [], [], [], []
epoch = 0
t = 0
time_last_checkpoint = time.time()

while timestep <= 100:
    agent.noise.reset()
    epoch_return = 0.
    # Per-episode loss totals: initialised once per episode so that they
    # accumulate over all of its steps.
    epoch_value_loss = 0
    epoch_policy_loss = 0
    state = torch.Tensor([env.reset()])

    # NOTE(review): the actor's LSTM memory is carried over from the previous
    # episode here; nothing in this loop resets it.
    hx = agent.actor.hx
    cx = agent.actor.cx

    while True:
        action = agent.get_action(state)
        # Step the environment exactly once per chosen action.
        next_state, reward, done, info = env.step(action.numpy())

        print(done, reward, info)
        timestep += 1
        epoch_return += reward

        mask = torch.Tensor([done])
        reward = torch.Tensor([reward])
        next_state = torch.Tensor([next_state])

        agent.buffer.add(state, action, mask, next_state, reward)

        state = next_state

        if len(agent.buffer) > agent.buffer.mini_batch_size:
            value_loss, policy_loss, (hx, cx) = agent.update(hx, cx)

            epoch_value_loss += value_loss
            epoch_policy_loss += policy_loss

        if done:
            break

    rewards.append(epoch_return)
    value_losses.append(epoch_value_loss)
    policy_losses.append(epoch_policy_loss)

    if timestep >= 10 * t:
        print('Epoch:', epoch)
        t += 1
        test_rewards = []

        for _ in range(n_test_cycles):
            state = torch.Tensor(env.reset())
            test_reward = 0
            while True:
                # The action comes back with a leading batch dimension, so it
                # is flattened before being handed to the environment.
                action = agent.get_action(state, add_noise=False)
                action = action.numpy()
                action = action.reshape((-1,))
                next_state, reward, done, info = env.step(action)
                print(done, info)
                test_reward += reward
                next_state = torch.Tensor([next_state])
                state = next_state
                if done:
                    break
            test_rewards.append(test_reward)
        # Average over all evaluation episodes, not just the last one.
        mean_test_rewards.append(np.mean(test_rewards))

    epoch += 1

    # save model
    agent.save(timestep)

# Release the environment only after training has finished.
env.close()

TypeError: Parameter ``U``: Wrong element data type: 'object'. Array elements must be numbers.