# DDPG Solution to the BiPedalWalker problem

Thus far we have seen the use of DQN and DDQN networks to solve RL problems. Such algorithms are limited in that they can only chose a single scalar action per timestep, i.e. a CHOICE of what action to take. Such problems are limited in reality as we often need to output a range of values for each action per timestep (i.e. a vector action). 

An example would be, in controlling a robot arm, we do not need a binary choice of which servo to *turn on* but a list of locations for each servo.

Lets look at the actions spaces of two problems.

In [111]:
import gym
envList=['LunarLander-v2', 'BipedalWalker-v3']
for envName in envList:
    env = gym.make(envName)
    observation=env.reset()
    print('\n'+envName)
    print('action space is shaped like ',env.action_space)
    print('state space size is',env.observation_space.shape)


LunarLander-v2
action space is shaped like  Discrete(4)
state space size is (8,)

BipedalWalker-v3
action space is shaped like  Box(4,)
state space size is (24,)




## DDPG Methods

To start with we are going to create two networks, an actor and a critic. The actor is going to determine the action values in the action vector and the critic is going to evaluate the value of these actions.

(Note the code for this problem is heavily borrowed from  here: https://github.com/ghliu/pytorch-ddpg/blob/master/

In [18]:
class RandomProcess(object):
    def reset_states(self):
        pass

class AnnealedGaussianProcess(RandomProcess):
    def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0

        if sigma_min is not None:
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.c = sigma
            self.sigma_min = sigma_min
        else:
            self.m = 0.
            self.c = sigma
            self.sigma_min = sigma

    @property
    def current_sigma(self):
        sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c)
        return sigma

class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing)
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)

In [None]:
class DDPG:
    def __init__(self, config: Config):
        self.config = config
        self.init()

    def init(self):
        self.state_dim = self.config.state_dim
        self.action_dim = self.config.action_dim
        self.batch_size = self.config.batch_size
        self.gamma = self.config.gamma
        self.epsilon = self.config.epsilon
        self.is_training = True
        self.randomer = OUNoise(self.action_dim)
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.actor = Actor(self.state_dim, self.action_dim)
        self.actor_target = Actor(self.state_dim, self.action_dim)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), self.config.learning_rate_actor)

        self.critic = Critic(self.state_dim, self.action_dim)
        self.critic_target = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), self.config.learning_rate)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        if self.config.use_cuda:
            self.cuda()

    def learning(self):
        s1, a1, r1, t1, s2 = self.buffer.sample_batch(self.batch_size)
        # bool -> int
        t1 = (t1 == False) * 1
        s1 = torch.tensor(s1, dtype=torch.float)
        a1 = torch.tensor(a1, dtype=torch.float)
        r1 = torch.tensor(r1, dtype=torch.float)
        t1 = torch.tensor(t1, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)
        if self.config.use_cuda:
            s1 = s1.cuda()
            a1 = a1.cuda()
            r1 = r1.cuda()
            t1 = t1.cuda()
            s2 = s2.cuda()

        a2 = self.actor_target(s2).detach()
        target_q = self.critic_target(s2, a2).detach()
        y_expected = r1[:, None] + t1[:, None] * self.config.gamma * target_q
        y_predicted = self.critic.forward(s1, a1)

        # critic gradient
        critic_loss = nn.MSELoss()
        loss_critic = critic_loss(y_predicted, y_expected)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # actor gradient
        pred_a = self.actor.forward(s1)
        loss_actor = (-self.critic.forward(s1, pred_a)).mean()
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Notice that we only have gradient updates for actor and critic, not target
        # actor_optimizer.step() and critic_optimizer.step()

        soft_update(self.actor_target, self.actor, self.config.tau)
        soft_update(self.critic_target, self.critic, self.config.tau)

        return loss_actor.item(), loss_critic.item()


    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def decay_epsilon(self):
        self.epsilon -= self.config.eps_decay

    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).unsqueeze(0)

        if self.config.use_cuda:
            state = state.cuda()

        action = self.actor(state).detach()
        action = action.squeeze(0).cpu().numpy()
        action += self.is_training * max(self.epsilon, self.config.epsilon_min) * self.randomer.noise()
        action = np.clip(action, -1.0, 1.0)

        self.action = action
        return action

    def reset(self):
        self.randomer.reset()

    def load_weights(self, output):
        if output is None: return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def save_config(self, output, save_obj=False):

        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

        if save_obj:
            file = open(output + '/config.obj', 'wb')
            pickle.dump(self.config, file)
            file.close()

    def save_checkpoint(self, ep, total_step, output):

        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)

        torch.save({
            'episodes': ep,
            'total_step': total_step,
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict()
        }, '%s/checkpoint_ep_%d.tar'% (checkpath, ep))


    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        episode = checkpoint['episodes']
        total_step = checkpoint['total_step']
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])

        return episode, total_step


In [None]:
import random
import numpy as np
import pickle

class ReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=123):
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = []
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        # add an experience to the buffer.
        # Note t is a bool for terminal state.
        experience = (s, a, r, t, s2)
        # IF we are at less than capacity Add to buffer.
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
        # Otherwise do a LIFO style rotation.
            self.buffer.pop(0)
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch])
        t_batch = np.array([_[3] for _ in batch])
        s2_batch = np.array([_[4] for _ in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def clear(self):
        self.buffer = []
        self.count = 0

    def save(self):
        file = open('replay_buffer.obj', 'wb')
        pickle.dump(self.buffer, file)
        file.close()

    def load(self):
        try:
            filehandler = open('replay_buffer.obj', 'rb')
            self.buffer = pickle.load(filehandler)
            self.count = len(self.buffer)
        except:
            print('there was no file to load')
            
    def __len__(self):
        return self.count

In [20]:
max_episodes=100

In [22]:
agent=Actor(),Critic
all_rewards = []
for ep in range(max_episodes):
    s0 = env.reset()
    agent.reset()

    done = False
    step = 0
    actor_loss, critics_loss, reward = 0, 0, 0

    # decay noise
    agent.decay_epsilon()

    while not done:
        action = agent.get_action(s0)

        s1, r1, done, info = env.step(action)
        agent.buffer.add(s0, action, r1, done, s1)
        s0 = s1

        if agent.buffer.size() > config.batch_size:
            loss_a, loss_c = agent.learning()
            actor_loss += loss_a
            critics_loss += loss_c

        reward += r1
        step += 1
        total_step += 1

        if step + 1 > config.max_steps:
            break

    all_rewards.append(reward)
    avg_reward = float(np.mean(all_rewards[-100:]))

AttributeError: 'tuple' object has no attribute 'reset'

In [None]:

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from gym import wrappers, envs
import time

In [232]:

from misc import epsilon_threshold, plot_eps


import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

env = gym.make('BipedalWalker-v3')

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Turn interactive mode on.
plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Using', device, 'for simulation.')

Using cuda for simulation.




In [233]:
class Actor_Network(nn.Module):
    '''Actor network
    Inputs are ints of size'''
    def __init__(self, state, action,hidden1=150, hidden2=120):
        super(Actor_Network, self).__init__()
        self.FCL1 = nn.Linear(state,hidden1)
        self.FCL2 = nn.Linear(hidden1,hidden2)
        self.FCL3 = nn.Linear(hidden2,action)

    def forward(self, x):
        '''input states, return action'''
        x = F.relu(self.FCL1(x))
        x = F.relu(self.FCL2(x))
        x = torch.tanh(self.FCL3(x))
        return x
    
class Critic_Network(nn.Module):
    '''A Deep Q network for predicting actions given states'''
    def __init__(self, state, action, hidden1=150, hidden2=120):
        super(Critic_Network, self).__init__()
        self.FCL1 = nn.Linear(state,hidden1)
        self.FCL2 = nn.Linear(hidden1+action, hidden2)
        self.FCL3 = nn.Linear(hidden2,1)

    def forward(self, state, action):
        '''input stateaction, output value'''
        x = F.relu(self.FCL1(state))
        x = self.FCL2(torch.cat([x,action],dim=1))
        x = F.relu(x)
        x = self.FCL3(x)
        return x

In [234]:
'''A mapping of state-action pairs to next-state reward results'''
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    '''a cyclic buffer of bounded size that holds recently observed transitions.'''

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            # if we are at less than capacity, allocate fresh space for the transition
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        '''Randomly return a batch of batch_size from the memory'''
        return random.sample(self.memory, batch_size)
    
    def clear(self):
        self.memory = []
        self.position = 0

    def __len__(self):
        
        return len(self.memory)


In [235]:
def select_action(state, training=True):
    '''torch.tensor -> torch.tensor
    
    Chooses an epsilon-greedy action given an input state.'''
    action = policy_actor_net(state).detach()
    action = action.squeeze(0).cpu().numpy()
    action += training * max(epsilon, epsilon_min) #* noise()
    action = np.clip(action, -1.0, 1.0)
    return action

def soft_update(target, source, tau=0.001):
    """
    update target by target = tau * source + (1 - tau) * target
    :param target: Target network
    :param source: source network
    :param tau: 0 < tau << 1
    :return:
    """
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + param.data * tau
        )    
    
def optimize_model():
    '''
    None -> float,float
    
    Update the actor and critic networks via SGD.
    returns actor loss and critic loss.
    '''
    if len(memory) < BATCH_SIZE:
        #print("Warning: We do not have enough history in memory to optimize our network")
        return
    
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch 
    batch = Transition(*zip(*transitions))
    
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    next_state_batch = torch.cat(batch.next_state)

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    
    # Compute A(s_{t+1}) for all next states.
    next_action_batch = torch.zeros((BATCH_SIZE,n_actions), device=device)
    next_action_batch[non_final_mask] = target_actor_net(non_final_next_states).detach()

    target_q = target_critic_net(next_state_batch, next_action_batch).detach()
    
    y_expected = reward_batch + gamma * target_q
    y_predicted = policy_critic_net.forward(state_batch, action_batch)

    # critic gradient
    critic_loss = nn.MSELoss()
    loss_critic = critic_loss(y_predicted, y_expected)
    critic_optimizer.zero_grad()
    loss_critic.backward()
    critic_optimizer.step()

    # actor gradient
    pred_a = policy_actor_net.forward(state_batch)
    loss_actor = (-policy_critic_net.forward(state_batch, pred_a)).mean()
    actor_optimizer.zero_grad()
    loss_actor.backward()
    actor_optimizer.step()
    
    soft_update(target_actor_net, policy_actor_net, tau=0.001)
    soft_update(target_critic_net, policy_critic_net, tau=0.001)
    #target_actor_net.parameters().data.copy_(target_actor_net.parameters().data*(1.0 -tau) + tau*policy_actor_net.parameters())
    #target_critic_net.parameters().data.copy_(target_critic_net.parameters().data*(1.0 -tau) + tau*policy_critic_net.parameters())
    
    return loss_actor.item(), loss_critic.item()
    
    

In [236]:
'''
ToDo:

implement noise.


'''

# Get number of actions from gym action space
n_actions = 4
# Get length of state space from gym observation space
n_states = 24

# Create two actor networks
policy_actor_net = Actor_Network(n_states, n_actions).to(device)
target_actor_net = Actor_Network(n_states, n_actions).to(device)
# Duplicate the weights and biases of the policy net into the target net.
target_actor_net.load_state_dict(policy_actor_net.state_dict())

# Create two critic networks
policy_critic_net = Critic_Network(n_states, n_actions).to(device)
target_critic_net = Critic_Network(n_states, n_actions).to(device)
# Duplicate the weights and biases of the policy net into the target net.
target_critic_net.load_state_dict(policy_critic_net.state_dict())

target_actor_net.eval()
target_critic_net.eval()

actor_optimizer = optim.Adam(policy_actor_net.parameters(),lr=0.001)
critic_optimizer = optim.Adam(policy_critic_net.parameters(),lr=0.001)
# Initialise the memory object.
memory = ReplayMemory(200000)
memory.clear()

steps_done = 0
episode_durations = []
reward_values = []
action_values=[]
frame_values=[]
env.close()

In [240]:
BATCH_SIZE = 512
# Discount Factor
GAMMA = 0.99
epsilon=0.2
epsilon_min=0.05

gamma=0.99


# How often do we update our policy network parameters (in steps)
TARGET_UPDATE = 10

In [241]:
num_episodes = 10
for i_episode in range(num_episodes):
    state = torch.from_numpy(np.cast['float32'](env.reset())).unsqueeze(0).to(device)
    TotalReward=0
    for t in count():
        action = select_action(state)
        
        next_state, reward, done, _ = env.step(action)
        
        env.render()
        TotalReward+= reward
        action = torch.from_numpy(np.cast['float32'](action)).unsqueeze(0).to(device)
        next_state = torch.from_numpy(np.cast['float32'](next_state)).unsqueeze(0).to(device)
        reward = torch.tensor([reward], device=device,dtype=torch.float32).unsqueeze(0)

        if not done:
            next_state = next_state
        else:
            next_state = None

        memory.push(state, action, next_state, reward)
        state = next_state

        # Perform one step of the optimization (on the target network)
        optimize_model()
        
        if done:
            episode_durations.append(t + 1)
            reward_values.append(TotalReward)
            break
        
   # if i_episode % TARGET_UPDATE == 0:
        # Update the target network, copying all weights and biases in DQN
        #target_net.load_state_dict(policy_net.state_dict())

print('Complete')
#env.render()
env.close()
#plt.ioff()
#plt.show()
#plot_reward()









TypeError: expected Tensor as element 20 in argument 0, but got NoneType

In [239]:
env.close()

In [None]:
env.action_space.sample()

In [33]:
torch.zeros((24,4), device=device)

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]], device='cuda:0')