# OpenAI GYM CartPole-v1 problem

The description of the CartPole-v1 as given on the OpenAI gym website -

"""

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pendulum starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.

"""

This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson.

We train an agent to solve OpenAI Gym's Cartpole-v0 environment. The implementation uses a recent version of the PyTorch framework for building deep learning models.

https://medium.com/@thechrisyoon/deriving-policy-gradients-and-implementing-reinforce-f887949bd63

- Actor Critic Methods (A2C) - OpenAI Gym CartPole-v0 
https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f

- Deep Deterministic Policy Gradients - OpenAI Gym  Pendulum-v0

https://towardsdatascience.com/deep-deterministic-policy-gradients-explained-2d94655a9b7b

In [6]:
# General libraries
import os
import io
import base64

import numpy as np
import math, random

from collections import namedtuple, deque

# Gym
import gym
import gym.wrappers

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T
from torch.distributions import Normal
from torch.distributions import Categorical
import torch.autograd as autograd


# MXNET
from mxnet import nd, gluon, init, autograd
from mxnet.gluon import nn
import mxnet as mx

# Sklearn
from sklearn.neural_network import MLPClassifier

# Common utilities
from common.layers import NoisyLinear
from common.replay_buffer import ReplayBuffer
from common.replay_buffer import PrioritizedReplayBuffer
from common.multiprocessing_env import SubprocVecEnv
from common.wrappers import make_atari, wrap_deepmind, wrap_pytorch

# Visualization
import matplotlib.pyplot as plt
from IPython import display
from IPython.display import HTML, clear_output

from pyvirtualdisplay import Display

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0) 
%matplotlib inline

In [None]:
class LinearDeepQNetwork(T.nn.Module):
    """Two-layer fully connected network mapping a state to one Q-value per action.

    The torch layers are reached through the ``T`` alias (``import torch as T``)
    instead of the bare ``nn`` name: the import cell binds ``nn`` to ``torch.nn``
    and later rebinds it to ``mxnet.gluon.nn``, which has no ``Module``,
    ``Linear`` or ``MSELoss``.

    Args:
        lr (float): learning rate handed to the Adam optimizer.
        n_actions (int): number of outputs (one Q-value per action).
        input_dims (sequence of int): unpacked as the input size(s) of the
            first linear layer, e.g. ``(4,)``.
    """

    def __init__(self, lr, n_actions, input_dims):
        super().__init__()

        self.fc1 = T.nn.Linear(*input_dims, 128)
        self.fc2 = T.nn.Linear(128, n_actions)

        # The network owns its optimizer and loss so the agent can drive
        # training through a single object.
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = T.nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)  # move the parameters to the selected device

    def forward(self, state):
        """Return the raw Q-value estimates for ``state``."""
        layer1 = F.relu(self.fc1(state))
        # No activation on the output: Q-values are unbounded regression targets.
        actions = self.fc2(layer1)

        return actions

In [None]:
class Agent():
    '''
    Epsilon-greedy Q-learning agent.

    The agent can choose actions, learn from single transitions and linearly
    decrease its exploration rate (epsilon) over time.

    Args:
        input_dims: input dimensions forwarded to the Q-network.
        n_actions (int): number of discrete actions.
        lr (float): learning rate of the Q-network optimizer.
        gamma (float): discount factor applied to the bootstrapped value.
        epsilon (float): initial probability of taking a random action.
        eps_dec (float): amount subtracted from epsilon after every learning step.
        eps_min (float): lower bound for epsilon.
    '''
    def __init__(self, input_dims, n_actions, lr=0.0001, gamma=0.99, epsilon=1.0, eps_dec=1e-5,
                eps_min=0.01):
        self.lr = lr
        self.input_dims = input_dims
        self.n_actions  = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min
        self.action_space = list(range(self.n_actions))

        # Q-value function for the Agent
        self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims)

    def choose_action(self, observation):
        '''Return a random action with probability epsilon, else the greedy one.'''
        if np.random.random() > self.epsilon:
            state = T.tensor(observation, dtype=T.float).to(self.Q.device)
            # Pure inference: no graph is needed to pick the greedy action.
            with T.no_grad():
                actions = self.Q.forward(state)
            # .item() converts the index tensor to a plain int for the Gym API
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def decrement_epsilon(self):
        '''Linearly anneal epsilon by eps_dec, never dropping below eps_min.'''
        # max() clamps the result; subtracting first and checking afterwards
        # would let the last step land below eps_min.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self, state, action, reward, state_, done=False):
        '''
        Perform one Q-learning update from a single transition.

        Args:
            state: observation before the action.
            action (int): index of the action that was taken.
            reward (float): reward received for the transition.
            state_: observation after the action.
            done (bool): True when state_ is terminal, in which case nothing is
                bootstrapped from it. Defaults to False, which matches the
                behaviour of callers that pass only four arguments.
        '''
        self.Q.optimizer.zero_grad()
        states  = T.tensor(state, dtype=T.float).to(self.Q.device)
        actions = T.tensor(action).to(self.Q.device)
        rewards = T.tensor(reward, dtype=T.float).to(self.Q.device)
        states_ = T.tensor(state_, dtype=T.float).to(self.Q.device)

        # predicted value of the action actually taken in the current state
        q_pred = self.Q.forward(states)[actions]

        # value of the best action in the resulting state; the target is treated
        # as a constant, so no gradient may flow through it
        with T.no_grad():
            q_next = self.Q.forward(states_).max()

        # the target is the direction we want the prediction to move in
        q_target = rewards if done else rewards + self.gamma * q_next

        loss = self.Q.loss(q_pred, q_target)
        loss.backward()
        self.Q.optimizer.step()
        self.decrement_epsilon()

In [None]:
env = gym.make('CartPole-v1')

### Random agent

In [None]:
total_reward = 0.0
total_steps = 0

obs = env.reset() # random initialization of environment
print(obs) # random observation

In [None]:
# Play one episode with uniformly sampled actions, accumulating the step count
# and the reward into the counters initialised in the previous cell.
done = False
while not done:
    # draw a random action from the action space and execute it
    action = env.action_space.sample()
    obs, reward, done, _ = env.step(action)

    total_steps += 1
    total_reward += reward

print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))
env.close()
env.env.close()

### Random action wrapper

In [None]:
class RandomActionWrapper(gym.ActionWrapper):
    """Action wrapper that occasionally overrides the requested action.

    With probability ``epsilon`` the action passed in is replaced by one sampled
    from the wrapped environment's action space (and a notice is printed);
    otherwise the action is passed through unchanged.
    """

    def __init__(self, env, epsilon=0.1):
        super().__init__(env)
        self.epsilon = epsilon

    def action(self, action):
        """Return ``action`` or, with probability ``epsilon``, a random one."""
        keep_original = not (random.random() < self.epsilon)
        if keep_original:
            return action

        print("Random action taken!")
        return self.env.action_space.sample()

In [None]:
env_cartpole = gym.make("CartPole-v0")
env = RandomActionWrapper(env_cartpole)

In [None]:
total_reward = 0.0
total_steps = 0

obs = env.reset()
print(obs)

In [None]:
# Play one episode while always requesting the same predefined action
# (0, "move left" per the original note); the counters come from the previous cell.
action = 0
done = False
while not done:
    obs, reward, done, _ = env.step(action)

    total_steps += 1
    total_reward += reward

print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))
env.close()
env.env.close()

### Cartpole random monitor

In [None]:
env = gym.make("CartPole-v0")
#env = gym.wrappers.Monitor(env, "log_recording", force=True) # log folder called recording

In [None]:
#obs = env.reset()
#print(obs)

In [None]:
# Play one fresh episode with random actions.
# The environment created above has not been reset (step() must not be called
# before reset()) and the counters still hold the totals of the previous run,
# so both are initialised here.
total_reward = 0.0
total_steps = 0
obs = env.reset()

while True:
    # select random action from action space
    action = env.action_space.sample()

    obs, reward, done, _ = env.step(action)

    total_reward += reward
    total_steps += 1

    if done:
        break

print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward))
env.close()
env.env.close()

## CartPole solved with Cross-Entropy

In [None]:
# default parameter configuration
HIDDEN_SIZE = 128
BATCH_SIZE = 16
PERCENTILE = 70 # percentile of episodes' total rewards that are used for elite episode filtering(top 30% of episodes sorted by reward)

In [None]:
class Net(torch.nn.Module):
    """One-hidden-layer network producing a raw score (logit) per action.

    ``torch.nn`` is spelled out in full because the import cell rebinds the bare
    name ``nn`` to ``mxnet.gluon.nn`` after importing ``torch.nn``.

    Args:
        obs_size (int): number of input features.
        hidden_size (int): width of the hidden layer.
        n_actions (int): number of outputs.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(obs_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, n_actions),
        )

    def forward(self, x):
        """Return the unnormalised action scores for ``x``."""
        return self.net(x)

In [None]:
# A whole episode: its total undiscounted reward plus the list of the
# EpisodeStep records collected while playing it.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# A single step of an episode: the observation the agent saw and the action it
# took. Steps taken from elite episodes are used as training data.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

In [None]:
# Generates batches with episodes
def iterate_batches(env, net, batch_size):
    """Play episodes forever with ``net`` as policy and yield them in batches.

    Each step feeds the current observation to ``net``, converts the returned
    scores to action probabilities with a softmax, samples an action from them
    and records the (observation, action) pair. Finished episodes are collected
    and handed to the caller ``batch_size`` at a time.

    Args:
        env: Gym-style environment with ``reset()`` and ``step(action)``.
        net: callable returning one row of action scores per observation row.
        batch_size (int): number of episodes per yielded batch.

    Yields:
        list of Episode: the episodes finished since the previous yield.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []

    obs = env.reset()

    while True:
        # Batch of one observation; building it from a single ndarray avoids
        # the slow list-of-arrays tensor construction.
        obs_v = torch.as_tensor(np.array([obs]), dtype=torch.float32)
        # Rollouts are inference only, so no autograd graph is recorded.
        # torch.softmax is used instead of nn.Softmax because the import cell
        # rebinds the bare name `nn` to mxnet.gluon.nn.
        with torch.no_grad():
            act_probs_v = torch.softmax(net(obs_v), dim=1)
        act_probs = act_probs_v.numpy()[0]
        # sample from the action probability distribution
        action = np.random.choice(len(act_probs), p=act_probs)

        next_obs, reward, is_done, _ = env.step(action)

        episode_reward += reward

        # store the observation that led to the action, not the resulting one
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))

            episode_reward = 0.0
            episode_steps = []

            next_obs = env.reset()

            # hand the batch to the caller once it holds the desired number of episodes
            if len(batch) == batch_size:
                yield batch
                batch = []
        # the observation obtained from the environment becomes the current one
        obs = next_obs
        
def filter_batch(batch, percentile):
    '''
    Core of the cross-entropy method: keep only the "elite" episodes.

    A boundary reward is computed as the given percentile of the episodes' total
    rewards; every episode whose reward is not below that boundary contributes
    all of its observations and actions to the training set.

    Returns:
        tuple: (observations as FloatTensor, actions as LongTensor,
        reward boundary, mean reward). The last two values are only used to
        monitor the agent's performance.
    '''
    episode_rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(episode_rewards, percentile)
    reward_mean = float(np.mean(episode_rewards))

    # elite episodes are those that are not below the boundary reward
    elite_episodes = [episode for episode in batch
                      if not episode.reward < reward_bound]

    train_obs = [step.observation
                 for episode in elite_episodes
                 for step in episode.steps]
    train_act = [step.action
                 for episode in elite_episodes
                 for step in episode.steps]

    return (torch.FloatTensor(train_obs), torch.LongTensor(train_act),
            reward_bound, reward_mean)

In [None]:
env = gym.make("CartPole-v0")

In [None]:
To check the agent in action we enable Monitor to create videos recorded at different training steps.

In [None]:
env = gym.wrappers.Monitor(env, directory="mon", force=True)

In [None]:
obs_size  = env.observation_space.shape[0]
n_actions = env.action_space.n

In [None]:
# one-hidden-layer neural network, with ReLU and 128 hidden neurons
net = Net(obs_size, HIDDEN_SIZE, n_actions)
# CrossEntropyLoss expects raw scores (logits) and applies log-softmax itself,
# so the network output is passed in without a softmax.
# torch.nn is spelled out because the import cell rebinds the bare name `nn`
# to mxnet.gluon.nn, which has no CrossEntropyLoss.
objective = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)

In [None]:
# SummaryWriter is not imported in the import cell; use the TensorBoard writer
# that ships with PyTorch (requires the `tensorboard` package to be installed).
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(comment="-cartpole")

The training of the neural network and the generation of the episodes are performed at the same time. They are not completely in parallel, but every time the loop accumulates enough episodes (16), it passes control to the function supposed to train the network using the gradient descent. The network will have different, slightly better behavior, hopefully.

We do not need to worry about synchronization, because training and data gathering run in the same thread of execution, but it is important to understand the constant jumps between training the network and using it to generate episodes.

In [None]:
# In the training loop we iterate over batches of episodes and filter out the
# elite episodes. The result is the tensors of observations and taken actions,
# plus the reward boundary used for filtering and the mean reward.
for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
    obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
    optimizer.zero_grad()
    # pass observations to the network, obtaining its action scores
    action_scores_v = net(obs_v)
    # The action scores are passed to the objective function, which calculates
    # the cross-entropy between the network output and the actions the agent took.
    loss_v = objective(action_scores_v, acts_v)

    loss_v.backward()
    optimizer.step()

    print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (
            iter_no, loss_v.item(), reward_m, reward_b))

    # Record the agent's learning progress: loss, reward boundary and mean
    # reward of the batch, indexed by iteration number.
    writer.add_scalar("loss", loss_v.item(), iter_no)
    writer.add_scalar("reward_bound", reward_b, iter_no)
    writer.add_scalar("reward_mean",  reward_m, iter_no)

    # Stop once the mean reward of the current batch exceeds 199.
    # (Gym considers the environment solved when the mean reward over the last
    # 100 episodes is greater than 195; this check is stricter per batch.)
    if reward_m > 199:
        print("Solved!")
        break

# Close the writer exactly once, after training: inside the loop it would be
# closed after every iteration and never on the "Solved!" break path.
writer.close()

env.close()
env.env.close()

In [None]:
#if you see "<classname> has no attribute .env", remove .env or update gym
env = gym.make("CartPole-v0").env

In [None]:
env.reset()

In [None]:
env.reset()

In [None]:
#plt.imshow(env.render("rgb_array"))

In [None]:
# create agent
agent = MLPClassifier(hidden_layer_sizes=(20,20),
                      activation='tanh',
                      warm_start=True, # keep progress between .fit(...) calls
                      max_iter=1 # make only 1 iteration on each .fit(...)
                     )

In [None]:
X_train = [env.reset()]*n_actions

In [None]:
y_train = list(range(n_actions))

In [None]:
# initialize agent to the dimension of state an amount of actions
#agent.fit([env.reset()]*n_actions, list(range(n_actions)));
agent.fit(X_train, y_train)

In [None]:
def generate_session(t_max=1000):
    """Play one episode with the global ``agent`` in the global ``env``.

    At every step the agent's predicted class probabilities for the current
    state are used as the action distribution to sample from.

    Args:
        t_max (int): maximum number of steps before the session is cut off.

    Returns:
        tuple: (states, actions, total_reward) where ``states[t]`` is the state
        in which ``actions[t]`` was taken.
    """
    states, actions = [], []
    total_reward = 0

    s = env.reset()

    for _ in range(t_max):
        # a vector of action probabilities in current state
        probs = agent.predict_proba([s])[0]

        # Sample over however many actions the agent reports instead of a
        # hard-coded 2. Assumes the agent's classes are 0..n-1 so that the
        # column index equals the action id - TODO confirm for other envs.
        a = np.random.choice(len(probs), 1, p=probs)[0]

        new_s, r, done, _ = env.step(a)

        # record the state the action was taken in, not the resulting one
        states.append(s)
        actions.append(a)
        total_reward += r

        s = new_s
        if done:
            break
    return states, actions, total_reward

In [None]:
#epsilon = 0.5
#for i in range(1000):
#    session_rewards = [generate_session(epsilon=epsilon, train=True) for _ in range(100)]
#    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(i, np.mean(session_rewards), epsilon))
    
#    epsilon *= 0.99
#    assert epsilon >= 1e-4, "Make sure epsilon is always nonzero during training"
    
#    if np.mean(session_rewards) > 300:
#        print ("You Win!")
#        break

### Cross-entropy method (CEM) steps

Deep CEM uses exactly the same strategy as the regular CEM. The only difference is that now each observation is not a number but a float32 vector.

In [None]:
def select_elites(states_batch,actions_batch,rewards_batch,percentile=50):
    """
    Select states and actions from sessions whose reward is >= the given percentile.

    :param states_batch: list of lists of states, states_batch[session_i][t]
    :param actions_batch: list of lists of actions, actions_batch[session_i][t]
    :param rewards_batch: list of total rewards, rewards_batch[session_i]
    :param percentile: percentile of rewards_batch used as the elite threshold

    :returns: elite_states, elite_actions - flat lists of states and respective
        actions from elite sessions, in their original order (sorted by session
        number and by timestep within a session)

    States are not assumed to be integers.
    """
    # minimum reward a session needs to count as elite
    reward_threshold = np.percentile(rewards_batch, percentile)

    elite_states = []
    for session_i, session_states in enumerate(states_batch):
        if rewards_batch[session_i] >= reward_threshold:
            elite_states.extend(session_states)

    elite_actions = []
    for session_i, session_actions in enumerate(actions_batch):
        if rewards_batch[session_i] >= reward_threshold:
            elite_actions.extend(session_actions)

    return elite_states, elite_actions

In [None]:
## Training loop

Generate sessions, select N best and fit to those.

In [None]:
def show_progress(batch_rewards, log, percentile, reward_range=(-990, +10)):
    """
    Display training progress as two charts.

    Appends the batch's mean reward and percentile threshold to ``log``, then
    plots the history of both values next to a histogram of the current batch
    rewards with the threshold marked in red.

    Args:
        batch_rewards: total rewards of the sessions in the current batch.
        log (list): history of [mean_reward, threshold] pairs; mutated in place.
        percentile: percentile of batch_rewards used as the threshold.
        reward_range: (min, max) range of the histogram. The default is a tuple
            so that no mutable object is shared between calls.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    mean_reward = np.mean(batch_rewards)
    threshold = np.percentile(batch_rewards, percentile)
    log.append([mean_reward, threshold])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f"%(mean_reward, threshold))

    # transpose the log once: one series per plotted curve
    mean_history, threshold_history = zip(*log)

    plt.figure(figsize=[8,4])
    plt.subplot(1,2,1)
    plt.plot(mean_history, label='Mean rewards')
    plt.plot(threshold_history, label='Reward thresholds')
    plt.legend()
    plt.grid()

    plt.subplot(1,2,2)
    plt.hist(batch_rewards, range=reward_range)
    # reuse the threshold computed above instead of recomputing the percentile
    plt.vlines([threshold], [0], [100], label="percentile", color='red')
    plt.legend()
    plt.grid()

    return plt.show()

In [None]:
n_sessions = 100
percentile = 70

In [None]:
%%time
log = []

for i in range(100):
    # generate a list of n_sessions new sessions
    sessions = [generate_session() for _ in range(n_sessions)]

    # Sessions differ in length, so the per-session state/action lists are
    # ragged. They are kept as plain tuples of lists: wrapping them in np.array
    # builds a ragged array, which current NumPy rejects. Only the rewards
    # (one scalar per session) become an array.
    batch_states, batch_actions, batch_rewards = zip(*sessions)
    batch_rewards = np.array(batch_rewards)

    # select the elite states and actions
    elite_states, elite_actions = select_elites(batch_states,
                                                batch_actions,
                                                batch_rewards,
                                                percentile=percentile)

    # fit the agent to predict elite_actions (y) from elite_states (X)
    agent.fit(elite_states, elite_actions)

    show_progress(batch_rewards, log, percentile, reward_range=[0,np.max(batch_rewards)])

    if np.mean(batch_rewards)> 190:
        print("You Win! You may stop training now via KeyboardInterrupt.")

In [None]:
#record sessions
env = gym.wrappers.Monitor(gym.make("CartPole-v0"), directory="videos", force=True)
sessions = [generate_session() for _ in range(100)]

In [None]:
env.close()

## Show video

In [None]:
# os.listdir returns names in arbitrary order; sorting makes [-1] deterministic
# (the lexicographically last recording - presumably the latest one, verify
# against the Monitor's file naming).
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) # try other indices to watch other recordings

## Solving CartPole-V1 using REINFORCE Algorithm

In [None]:
class REINFORCE(object):
    """
    REINFORCE (Monte Carlo policy gradient) agent implemented with MXNet Gluon.

    Within this class ``nn``, ``nd``, ``gluon``, ``init`` and ``autograd`` are
    the MXNet names from the import cell (``nn.Dense``, ``nd.softmax``, ...),
    not the PyTorch modules.
    """
    def __init__(self, env, lr=1e-3, seed=42):
        """
        Store the settings, seed the environment and build the policy network.
        
        Args:
            env (Gym environment) : the environment the agent is trained on.
            lr (float) : the learning rate used to update the neural network.
            seed (int) : the random seed passed to the environment.
        """

        self.env = env
        self.lr = lr
        self.seed = seed
        self.env.seed(self.seed)
        
        print('Random seed: {} '.format(seed))

        self.build_network()
        
        
    def build_network(self, hidden_size=20):
        """
        Build the policy network (two hidden ReLU layers and one output per
        action) and set up the Adam trainer.
        
        Args:
            hidden_size (int) : the size of the hidden layers in the neural network.
        """    
        
        self.policy_net = nn.Sequential()
        self.policy_net.add(nn.Dense(hidden_size, activation="relu"),
                            nn.Dense(hidden_size, activation="relu"),
                            nn.Dense(self.env.action_space.n))
        self.policy_net.initialize(init=init.Xavier())

        self.trainer = gluon.Trainer(self.policy_net.collect_params(), 'adam', {'learning_rate': self.lr})

        
    def update(self, lr_coeff=0.999):
        """
        Perform an update on the data collected during one episode. The learning
        rate is reduced after the update as a way to improve convergence.
        
        Args:
            lr_coeff (float) : the coefficient with which we multiply the current learning rate.
        """
        
        returns    = self.get_returns()
        batch_size = len(self.actions)

        with autograd.record():
            # self.states holds one more entry than self.actions (the final
            # next_state), so the last state is dropped here.
            all_actions = nd.softmax(self.policy_net(nd.array(self.states[:-1])))
            
            # negative log-probability of each taken action, weighted by its return
            loss = - nd.log(all_actions[np.array(range(batch_size)), np.array(self.actions)]) * returns

        loss.backward()
        
        self.trainer.step(batch_size)
        self.trainer.set_learning_rate(self.trainer.learning_rate * lr_coeff) 
      
    
    def predict(self,  state):
        """
        Compute the probabilities of all actions and stochastically choose one of them.
        
        Args:
            state (array of floats) : the state for which we want to select an action.
        Returns:
            action (int) : the selected action given the state.
        """
        
        actions = nd.softmax(self.policy_net(nd.array([state]))).asnumpy()[0]

        return np.random.choice(len(actions), p=actions)
    
    
    def get_returns(self, discount_factor=0.99):
        """
        Calculate the return for every state. This is defined as the discounted 
        sum of rewards after visiting the state. 
        
        Args:
            discount_factor (float) : determines how much we care about distant 
                                        rewards (1.0) vs immediate rewards (0.).
        Returns:
            normalized_returns (array of float) : the returns, from which the mean is 
                                                 subtracted to reduce the variance.
        """
        returns=[]
        curr_sum = 0.
        # walk the rewards backwards so each return reuses the one after it
        for r in reversed(self.rewards):
            curr_sum = r + discount_factor*curr_sum
            returns.append(curr_sum)
            
        returns.reverse()
        normalized_returns = nd.array(returns) - nd.mean(nd.array(returns))
        
        return normalized_returns
    
    
    def setup_saving(self):
        """
        Prepare the location where results are stored, creating the ``res``
        directory under the current working directory if it does not exist.
        
        Args:
            None
        
        Returns:
            save_file (str) : path of the CSV file for this seed.
            stats (list) : a new empty list for the statistics.
        """
        
        directory= os.getcwd() + '/res/'
        if not os.path.exists(directory):
            os.makedirs(directory)
            
        save_file = "{}cartpole_seed{}.csv".format(directory,self.seed)
        
        return save_file, []   
    
    
    def initialize_episode(self):
        """
        Initializes the variables total_rewards, rewards, actions and states, and
        resets the environment.
        
        Returns:
            state (array of float) : the first state of the episode.
        """
        
        self.rewards,self.actions,self.states = [],[],[]
        self.total_rewards = 0.

        state = self.env.reset()
        self.states.append(state) 

        return state

    
    def add_to_trajectory(self, action, next_state, reward):
        """
        Stores in memory the action, next_state and reward. This will later be used for updates.
        
        Args:
            action (int) : the selected action in the current state.
            next_state (array of floats) : the next state returned by the environment after selecting the action.
            reward (float) : the reward received after selecting the action.
        Returns:
            next_state (array of float) : the next state returned by the environment after selecting the action.
        """
        
        self.total_rewards += reward
        
        self.rewards.append(reward)
        self.actions.append(action)   
        self.states.append(next_state)

        return next_state
    

    def fit(self, num_episodes=1000, save_every=5):
        """
        Implements the training loop: play one episode, then update the policy.
        
        Args:
            num_episodes (int) : the number of episodes we train the agent.
            save_every (int) : the rate at which we save the results, which will be used for visualization.
        """
        
        save_file, stats = self.setup_saving()

        for i_episode in range(num_episodes):
            # NOTE(review): stats are only written here, at the start of every
            # save_every-th episode; nothing is written after the loop, so the
            # episodes following the last save never reach the file.
            if i_episode % save_every == 0 and i_episode != 0:
                np.savetxt(save_file,stats,delimiter=',') 

            state = self.initialize_episode()
            done=False
            t=0

            while not done:
                t+=1
                action = self.predict(state)
                next_state, reward, done, _ = self.env.step(action)
                state = self.add_to_trajectory(action, next_state, reward)
                # render every 50th episode only
                if i_episode%50 ==0:self.env.render()

            print("\rEpisode {} Total Rewards {} ".format(i_episode, self.total_rewards) )
            # the episode length is the statistic that gets saved
            stats.append(t)
            self.update()

In [None]:
env = gym.make("CartPole-v1")

In [None]:
REINFORCE(env).fit()

## Agent taking random actions

In [None]:
# The bare name `wrappers` is never imported (only `gym.wrappers` is), so the
# Monitor wrapper is referenced through the gym package.
env = gym.wrappers.Monitor(env, "./gym-results", force=True)
observation = env.reset()

In [None]:
for episode in range(200):
    for timestep in range(100):
        env.render()

        print(observation)

        # Here we’ve chosen to “sample” the action space to get a random action, of which,
        # there are only two: move left or move right.
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)  # take a random action

        if done:
            print("Completed after {} timesteps.".format(timestep + 1))
            # A finished environment must be reset before it is stepped again;
            # without this every following "episode" would step a done env.
            observation = env.reset()
            break

In [None]:
# close environment
env.close() 

In [None]:
# Read the first recorded video. The file is opened read-only and closed by the
# context manager instead of being left open with write access.
with open('./gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix, 'rb') as video_file:
    video = video_file.read()
# Embed the video in the page as a base64 data URI.
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))

We want to train an agent to find a good policy for the CartPole problem. Specifically, we want the agent to learn an ideally optimal policy that takes the four observation values and decides which action to take (i.e. move right or move left), given the values the agent is observing at any given time.

In [None]:
Reading material:

- http://testerstories.com/2017/12/the-tester-role-in-machine-learning-part-1/
- http://testerstories.com/2017/12/the-tester-role-in-machine-learning-part-2/
- http://testerstories.com/2017/12/the-tester-role-in-machine-learning-part-3/
- http://testerstories.com/2017/12/the-tester-role-in-machine-learning-part-4/

## Cartpole Environment Reinforcement Learning

We train an agent to solve OpenAI Gym's Cartpole-v0 environment. The implementation uses a recent version of the PyTorch framework for building deep learning models.

In [None]:
# Display
# The virtual display gets its own name: calling it `display` would clash with
# the IPython `display` module, which is imported in the import cell and used
# as `display.display(...)` / `display.clear_output(...)` when rendering.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

is_ipython = 'inline' in plt.get_backend()

if is_ipython:
    from IPython import display

plt.ion()

#### Watch a Random Agent

It is useful to see how well the agent performs before we train it. We can run the code cell below to watch how well the agent does if it just pushes the cart randomly at each timestep. We can think of it as flipping a fair coin when deciding whether to push the cart to the left or to the right.

We run the code cell multiple times, to check the score for different episodes (or game rounds). It likely won't get a score above 30, and this is to be expected! Later we'll train the agent to consistently get a score larger than 195!

In [None]:
env = gym.make('CartPole-v0')
env.seed(0)

In [None]:
print('observation space:', env.observation_space)
print('action space:', env.action_space)

In [None]:
state = env.reset()
img = plt.imshow(env.render(mode='rgb_array'))

In [None]:
# Roll out one episode (at most 1000 steps) with random actions, redrawing the
# current frame in the notebook before every step.
for t in range(1000):
    action = env.action_space.sample()

    # refresh the image with the latest rendered frame
    img.set_data(env.render(mode='rgb_array'))
    plt.axis('off')

    display.display(plt.gcf())
    display.clear_output(wait=True)

    state, reward, done, _ = env.step(action)
    if not done:
        continue

    # the episode is over: report the number of steps survived
    print('Score: ', t+1)
    break

env.close()

#### Define the Architecture of the Policy

We will define a neural network that encodes the agent's stochastic policy.

CartPole environment details:

- The agent has two possible actions: it can either push the cart to the left or to the right.
- The state at each timestep always has four numbers, corresponding to the position and velocity of the cart, along with the pole angle and velocity.
- The network that you will define takes the environment state as input. It returns as output the probability that the agent should select each possible action.

For example, when the agent observes a new state, it passes the state as input to the network. The network returns two numbers, corresponding to the probability that the agent will select each action. So, for instance, if the network returns [0.9, 0.1], the agent pushes the cart to the left with 90% probability, and otherwise pushes the cart to the right. Then the agent samples from the action space using these probabilities - say it ends up selecting the action that pushes the cart to the left. After selecting this action, it sends the action to the environment and receives a reward and next state. This next state is then fed as input to the network, and so on.

When we initialize the neural network, all of the weights are random. Our agent's goal then will be to figure out the appropriate weights in the neural network, so that for each state, the network always outputs probabilities that encode a good game-playing strategy, and help the agent get a high score!

In [None]:
class Policy(nn.Module):
    """Stochastic policy: a one-hidden-layer MLP producing action probabilities."""

    def __init__(self, s_size=4, h_size=16, a_size=2):
        """Build the two linear layers of the policy network.

        Params
        ======
            s_size (int): dimension of each state (input layer width)
            h_size (int): width of the hidden layer
            a_size (int): number of actions (output layer width)
        """
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Map a batch of states to a batch of action-probability rows."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        # Normalise over the action dimension so every row sums to one.
        return F.softmax(logits, dim=1)

    def act(self, state):
        """Sample an action for a single numpy state.

        Returns the sampled action as a Python int together with the
        log-probability tensor of that action (kept for the policy gradient).
        """
        # Add a batch dimension and move the state to the module-level `device`.
        batch = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(batch).cpu()
        dist = Categorical(probs)
        action = dist.sample()

        return action.item(), dist.log_prob(action)

### Train the Agent with Reinforcement Learning

We'll use the REINFORCE algorithm, also known as Monte Carlo Policy Gradients, to guide the agent to train the weights of the neural network, while it's playing the game.

For now, run the training on the network. The OpenAI Gym considers the environment as "solved", if the average score over 100 episodes is at least 195.0.

In [None]:
policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

In [None]:
def reinforce(n_episodes=1000, max_t=200, gamma=1.0, print_every=100):
    """PyTorch implementation of the REINFORCE algorithm.

    Uses the module-level `env`, `policy` and `optimizer`. After each episode
    every stored log-probability is weighted by the episode's discounted
    return and one optimizer step is taken.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        gamma (float): discount rate
        print_every (int): how often to print average score (over last 100 episodes)

    Returns
    =======
        list: undiscounted total reward of every episode that was played
    """
    window = 100  # number of episodes the "solved" average is computed over
    scores_deque = deque(maxlen=window)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        for _ in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted return of the whole episode: sum_i gamma^i * r_i.
        R = sum(gamma ** i * r for i, r in enumerate(rewards))

        # Every step's log-probability is weighted by the same episode return.
        policy_loss = torch.cat([-log_prob * R for log_prob in saved_log_probs]).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        average_score = np.mean(scores_deque)
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))
        # Only declare the task solved once the average really covers `window`
        # episodes; otherwise `i_episode - window` below could be negative.
        if len(scores_deque) == window and average_score >= 195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - window, average_score))
            break

    return scores

In [None]:
### Plot the Scores
scores = reinforce()
Run the code cell below to plot the scores that were received by the agent.

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

In [None]:
### Watch a Smart Agent
Finally, we can watch our smart agent! Doesn't it do much better than the random agent?

In [None]:
env = gym.make('CartPole-v0')

In [None]:
state = env.reset()
img = plt.imshow(env.render(mode='rgb_array'))

In [None]:
# Roll out one episode with the trained policy, redrawing the frame each step.
for t in range(1000):
    # policy.act also returns the action's log-probability, which is not needed here.
    action, _ = policy.act(state)
    # Reuse the image artist created earlier and just swap in the new frame.
    img.set_data(env.render(mode='rgb_array')) 
    plt.axis('off')
    display.display(plt.gcf())
    display.clear_output(wait=True)
    state, reward, done, _ = env.step(action)
    if done:
        # t is zero-based, so t+1 is the number of steps taken in this episode.
        print('Score: ', t+1)
        break
        
env.close()

### Comment

We are now in the position of trying to train our agent using much more advanced reinforcement learning methods.

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/practice_approx_qlearning.ipynb


https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/sarsa.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/qlearning.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/practice_mcts.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/practice_reinforce.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/practice_vi.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/crossentropy_method.ipynb

https://github.com/y2ee201/Coursera-Practical-RL-NRUSHE/blob/master/bandits.ipynb

## LinearDeepQNetwork

In [None]:
n_games = 10000
scores = []
eps_history = []

In [None]:
agent = Agent(input_dims=env.observation_space.shape,
              n_actions=env.action_space.n)

In [None]:
# Iterate over the number of games; the agent learns online after every step.
for i in range(n_games):
    score = 0
    done = False
    obs = env.reset()
    
    while not done:
        action = agent.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        score += reward
        # learn from the state, action, reward, and new state
        agent.learn(obs, action, reward, obs_)
        obs = obs_ # set old state to new state
    # at the end of every episode append the score
    scores.append(score)
    eps_history.append(agent.epsilon) # agent's epsilon at the end of the episode
    
    # Report progress every 100 games.
    if i % 100 == 0:
        avg_score = np.mean(scores[-100:]) # last 100 episodes
        print('episode ', i, 'score %.1f avg score %.1f epsilon  %.2f' % (score, avg_score, agent.epsilon))

In [None]:
# plot learning curve
filename = 'cartpole_naive_dqn.png'
x = [i+1 for i in range(n_games)]
plot_learning_curve(x, scores, eps_history, filename)

### Deep Q Learning (DQN)

- https://arxiv.org/pdf/1312.5602.pdf
- https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
- https://pythonprogramming.net/deep-q-learning-dqn-reinforcement-learning-python-tutorial/

In [None]:
class ReplayBuffer(object):
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen silently evicts the oldest transition when full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition; both states gain a leading axis of size 1."""
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them column-wise.

        States and next states are concatenated along their leading axis;
        actions, rewards and done flags are returned as tuples.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

    def __len__(self):
        return len(self.buffer)

In [None]:
def plot(frame_idx, rewards, losses):
    """Redraw the training curves: episode rewards on the left, losses beside them.

    The reward panel's title shows `frame_idx` and the mean of the last ten
    entries of `rewards`.
    """
    clear_output(True)
    plt.figure(figsize=(20,5))

    reward_title = 'frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:]))
    # (subplot position, title, series) for each panel, drawn left to right.
    panels = (
        (131, reward_title, rewards),
        (132, 'loss', losses),
    )
    for position, title, series in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(series)

    plt.show()

In [None]:
# Cart Pole Environment
env_id = "CartPole-v0"
env = gym.make(env_id)

In [None]:
# Epsilon greedy exploration (behavior policy)
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500


def epsilon_by_frame(frame_idx):
    """Exploration rate for a frame: exponential decay from start to final value."""
    decay_factor = math.exp(-1. * frame_idx / epsilon_decay)
    return epsilon_final + (epsilon_start - epsilon_final) * decay_factor

In [None]:
plt.plot([epsilon_by_frame(i) for i in range(10000)])

In [None]:
def Variable(*args, **kwargs):
    """Pass-through to ``torch.autograd.Variable`` kept for the legacy call sites.

    The fully qualified ``torch.autograd`` is used on purpose: the bare name
    ``autograd`` is rebound at the top of the file by the later
    ``from mxnet import ... autograd`` import, so it no longer refers to torch.
    """
    return torch.autograd.Variable(*args, **kwargs)

In [None]:
## Deep Q-Network
class DQN(nn.Module):
    """Fully connected Q-network mapping a state vector to one Q-value per action."""

    def __init__(self, num_inputs, num_actions):
        """Build the network.

        Args:
            num_inputs (int): size of the state vector.
            num_actions (int): number of discrete actions (output width).
        """
        super(DQN, self).__init__()

        # Kept so that `act` can draw random actions without touching a global env.
        self.num_actions = num_actions

        # Layer sizes come from the constructor arguments, not from the global `env`.
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, num_actions)
        )

    def forward(self, x):
        """Return Q-values for a batch of states."""
        return self.layers(x)

    def act(self, state, epsilon):
        """Epsilon-greedy action selection for a single state.

        Args:
            state: one observation (array-like of length `num_inputs`).
            epsilon (float): probability-like threshold; a uniform draw above it
                selects the greedy action, otherwise a random action is taken.

        Returns:
            int: index of the chosen action.
        """
        if random.random() > epsilon:
            # Action selection needs no gradients.
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0)
                q_value = self.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.num_actions)
        return action

In [None]:
model = DQN(env.observation_space.shape[0], env.action_space.n)

In [None]:
optimizer = optim.Adam(model.parameters())

In [None]:
replay_buffer = ReplayBuffer(1000)

In [None]:
# Compute Temporal Difference Loss
def compute_td_loss(batch_size):
    """Sample a batch and take one optimisation step on the one-step TD error.

    Uses the module-level `replay_buffer`, `model`, `optimizer` and `gamma`.

    Args:
        batch_size (int): number of transitions to sample.

    Returns:
        torch.Tensor: scalar mean squared TD error of the batch.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    # `done` arrives as a tuple of booleans; convert explicitly to 0./1.
    done       = torch.FloatTensor(np.float32(done))

    # Q(s, a) for the actions that were actually taken.
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # The bootstrap target is a constant for the update, so build it without a graph.
    with torch.no_grad():
        next_q_value     = model(next_state).max(1)[0]
        # (1 - done) removes the bootstrap term on terminal transitions.
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss

In [None]:
# Training configuration
num_frames = 10000
batch_size = 32
gamma      = 0.99

losses = []
all_rewards = []
episode_reward = 0

In [None]:
# Start the first episode; `state` is carried across loop iterations.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # Exploration rate for this frame, then an epsilon-greedy action.
    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        # Episode over: start a new one and record the finished episode's return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    # Train only once the buffer holds more transitions than one batch.
    if len(replay_buffer) > batch_size:

        loss = compute_td_loss(batch_size)
        #losses.append(loss.data[0])
        losses.append(loss.item())
        
    # Redraw the reward and loss curves every 200 frames.
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)

### Double Deep Q-Network

- https://arxiv.org/pdf/1509.06461.pdf

In [None]:
current_model = DQN(env.observation_space.shape[0], env.action_space.n)
target_model  = DQN(env.observation_space.shape[0], env.action_space.n)

In [None]:
optimizer = optim.Adam(current_model.parameters())

In [None]:
replay_buffer = ReplayBuffer(1000)

In [None]:
# Synchronize current evaluation policy network and target network
def update_target(current_model, target_model):
    """Overwrite `target_model`'s state with a copy of `current_model`'s state dict."""
    snapshot = current_model.state_dict()
    target_model.load_state_dict(snapshot)

In [None]:
update_target(current_model, target_model)

In [None]:
# Computing Temporal Difference Loss
def compute_td_loss(batch_size):
    """Double-DQN update: one optimisation step on a sampled batch.

    The online network (`current_model`) picks the next action and the target
    network (`target_model`) evaluates it. Uses the module-level
    `replay_buffer`, `optimizer` and `gamma` as well.

    Args:
        batch_size (int): number of transitions to sample.

    Returns:
        torch.Tensor: scalar mean squared TD error of the batch.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    # `done` arrives as a tuple of booleans; convert explicitly to 0./1.
    done       = torch.FloatTensor(np.float32(done))

    # Q(s, a) for the actions that were actually taken.
    q_value = current_model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Neither the argmax nor the target value needs gradients.
    with torch.no_grad():
        # Action selection by the online network ...
        best_next_action = current_model(next_state).max(1)[1].unsqueeze(1)
        # ... action evaluation by the target network.
        next_q_value     = target_model(next_state).gather(1, best_next_action).squeeze(1)
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss

In [None]:
# Training configuration
num_frames = 10000
batch_size = 32
gamma      = 0.99

losses = []
all_rewards = []
episode_reward = 0

In [None]:
# Start the first episode; `state` is carried across loop iterations.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # Exploration rate for this frame, then an epsilon-greedy action from the online net.
    epsilon = epsilon_by_frame(frame_idx)
    action = current_model.act(state, epsilon)
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        # Episode over: start a new one and record the finished episode's return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    # Train only once the buffer holds more transitions than one batch.
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        #losses.append(loss.data[0])
        losses.append(loss.item())
        
    # Redraw the reward and loss curves every 200 frames.
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
        
    # Copy the online weights into the target network every 100 frames.
    if frame_idx % 100 == 0:
        update_target(current_model, target_model)

### Dueling Deep Q Network

- https://arxiv.org/pdf/1511.06581.pdf

In [None]:
class DuelingDQN(nn.Module):
    """Dueling Q-network: shared features feeding separate value and advantage heads."""

    def __init__(self, num_inputs, num_outputs):
        """Build the network.

        Args:
            num_inputs (int): size of the state vector.
            num_outputs (int): number of discrete actions.
        """
        super(DuelingDQN, self).__init__()

        # Kept so that `act` can draw random actions without touching a global env.
        self.num_outputs = num_outputs

        self.feature = nn.Sequential(
            nn.Linear(num_inputs, 128),
            nn.ReLU()
        )

        self.advantage = nn.Sequential(
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, num_outputs)
        )

        self.value = nn.Sequential(
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        """Return Q-values of shape (batch, num_outputs) for a batch of states."""
        x = self.feature(x)
        advantage = self.advantage(x)
        value     = self.value(x)
        # Centre the advantages per sample (over the action dimension). A global
        # `.mean()` would mix every row of the batch into each Q-value.
        return value + advantage - advantage.mean(dim=1, keepdim=True)

    def act(self, state, epsilon):
        """Epsilon-greedy action selection for a single state.

        Args:
            state: one observation (array-like of length `num_inputs`).
            epsilon (float): a uniform draw above it selects the greedy action,
                otherwise a random action is taken.

        Returns:
            int: index of the chosen action.
        """
        if random.random() > epsilon:
            # `volatile=True` has no effect in current PyTorch; no_grad is its replacement.
            with torch.no_grad():
                state   = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                q_value = self.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.num_outputs)
        return action

In [None]:
current_model = DuelingDQN(env.observation_space.shape[0], env.action_space.n)
target_model  = DuelingDQN(env.observation_space.shape[0], env.action_space.n)

In [None]:
optimizer = optim.Adam(current_model.parameters())

In [None]:
replay_buffer = ReplayBuffer(1000)

In [None]:
# Synchronize current policy net and target net
update_target(current_model, target_model)

In [None]:
# Computing Temporal Difference Loss
def compute_td_loss(batch_size):
    """One optimisation step on a sampled batch, bootstrapping from the target network.

    Uses the module-level `replay_buffer`, `current_model`, `target_model`,
    `optimizer` and `gamma`.

    Args:
        batch_size (int): number of transitions to sample.

    Returns:
        torch.Tensor: scalar mean squared TD error of the batch.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    # `done` arrives as a tuple of booleans; convert explicitly to 0./1.
    done       = torch.FloatTensor(np.float32(done))

    # Q(s, a) of the online network for the actions that were actually taken.
    q_value = current_model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # The target is a constant for the update, so no graph is built for it.
    with torch.no_grad():
        next_q_value     = target_model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss

In [None]:
# Training configuration
num_frames = 10000
batch_size = 32
gamma      = 0.99

losses = []
all_rewards = []
episode_reward = 0

In [None]:
# Start the first episode; `state` is carried across loop iterations.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # Exploration rate for this frame, then an epsilon-greedy action from the online net.
    epsilon = epsilon_by_frame(frame_idx)
    action = current_model.act(state, epsilon)
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        # Episode over: start a new one and record the finished episode's return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    # Train only once the buffer holds more transitions than one batch.
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        #losses.append(loss.data[0])
        losses.append(loss.item())
        
    # Redraw the reward and loss curves every 200 frames.
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
        
    # Copy the online weights into the target network every 100 frames.
    if frame_idx % 100 == 0:
        update_target(current_model, target_model)

### Prioritized Experience Replay
- https://arxiv.org/abs/1511.05952

In [None]:
class NaivePrioritizedBuffer(object):
    """Ring buffer of transitions sampled in proportion to priority ** prob_alpha."""

    def __init__(self, capacity, prob_alpha=0.6):
        self.prob_alpha = prob_alpha
        self.capacity   = capacity
        self.buffer     = []
        self.pos        = 0  # slot the next transition is written to
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def push(self, state, action, reward, next_state, done):
        """Store a transition with the highest priority currently in the buffer.

        The very first transition gets priority 1.0. Once the buffer is full
        the slot at `self.pos` is overwritten.
        """
        assert state.ndim == next_state.ndim
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )

        if self.buffer:
            max_prio = self.priorities.max()
        else:
            max_prio = 1.0

        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition

        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Draw a prioritised batch (with replacement).

        Returns:
            tuple: (states, actions, rewards, next_states, dones, indices, weights)
            where `weights` are float32 importance-sampling weights scaled so
            that the largest weight in the batch is 1.
        """
        is_full = len(self.buffer) == self.capacity
        # While filling up, only the first `pos` priority slots are in use.
        prios = self.priorities if is_full else self.priorities[:self.pos]

        probs  = prios ** self.prob_alpha
        probs /= probs.sum()

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        samples = [self.buffer[idx] for idx in indices]

        total    = len(self.buffer)
        weights  = (total * probs[indices]) ** (-beta)
        weights /= weights.max()
        weights  = np.array(weights, dtype=np.float32)

        # Transpose the list of transitions into per-field columns.
        columns = list(zip(*samples))
        return (
            np.concatenate(columns[0]),
            columns[1],
            columns[2],
            np.concatenate(columns[3]),
            columns[4],
            indices,
            weights,
        )

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priority stored at each of the given indices."""
        for idx, prio in zip(batch_indices, batch_priorities):
            self.priorities[idx] = prio

    def __len__(self):
        return len(self.buffer)

In [None]:
beta_start = 0.4
beta_frames = 1000 


def beta_by_frame(frame_idx):
    """Importance-sampling exponent: linear ramp from beta_start, capped at 1.0."""
    ramped = beta_start + frame_idx * (1.0 - beta_start) / beta_frames
    return min(1.0, ramped)

In [None]:
plt.plot([beta_by_frame(i) for i in range(10000)])

In [None]:
current_model = DQN(env.observation_space.shape[0], env.action_space.n)
target_model  = DQN(env.observation_space.shape[0], env.action_space.n)

In [None]:
optimizer = optim.Adam(current_model.parameters())

In [None]:
replay_buffer = NaivePrioritizedBuffer(100000)

In [None]:
# Synchronize current policy net and target net
update_target(current_model, target_model)

In [None]:
# Computing Temporal Difference Loss
def compute_td_loss(batch_size, beta):
    """One prioritised-replay optimisation step on a sampled batch.

    Uses the module-level `replay_buffer`, `current_model`, `target_model`,
    `optimizer` and `gamma`.

    Args:
        batch_size (int): number of transitions to sample.
        beta (float): importance-sampling exponent passed to the buffer.

    Returns:
        torch.Tensor: scalar importance-weighted mean squared TD error.
    """
    state, action, reward, next_state, done, indices, weights = replay_buffer.sample(batch_size, beta)

    state      = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    # `done` arrives as a tuple of booleans; convert explicitly to 0./1.
    done       = torch.FloatTensor(np.float32(done))
    weights    = torch.FloatTensor(weights)

    # Q(s, a) of the online network for the actions that were actually taken.
    q_value = current_model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # The target is a constant for the update, so no graph is built for it.
    with torch.no_grad():
        next_q_value     = target_model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    td_error = q_value - expected_q_value
    # The importance weights correct the bias of non-uniform sampling in the loss ...
    loss = (td_error.pow(2) * weights).mean()
    # ... but the new priorities are the raw |TD error| plus a small constant
    # (proportional prioritisation), not the weighted squared error.
    prios = td_error.detach().abs() + 1e-5

    optimizer.zero_grad()
    loss.backward()
    replay_buffer.update_priorities(indices, prios.cpu().numpy())
    optimizer.step()

    return loss

In [None]:
# Training configuration
num_frames = 10000
batch_size = 32
gamma      = 0.99

losses = []
all_rewards = []
episode_reward = 0

In [None]:
# Start the first episode; `state` is carried across loop iterations.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # Exploration rate for this frame, then an epsilon-greedy action from the online net.
    epsilon = epsilon_by_frame(frame_idx)
    action = current_model.act(state, epsilon)
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        # Episode over: start a new one and record the finished episode's return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    # Train only once the buffer holds more transitions than one batch.
    if len(replay_buffer) > batch_size:
        # The importance-sampling exponent follows its own per-frame schedule.
        beta = beta_by_frame(frame_idx)
        loss = compute_td_loss(batch_size, beta)
        #losses.append(loss.data[0])
        losses.append(loss.item())
        
    # Redraw the reward and loss curves every 200 frames.
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
        
    # Copy the online weights into the target network every 1000 frames.
    if frame_idx % 1000 == 0:
        update_target(current_model, target_model)

### Noisy Networks for Exploration

- https://arxiv.org/abs/1706.10295

In [None]:
class NoisyLinear(nn.Module):
    """Linear layer with learnable, factorised Gaussian noise on weights and bias.

    In training mode the effective parameters are ``mu + sigma * epsilon``;
    in eval mode only ``mu`` is used. ``epsilon`` is stored in buffers and only
    changes when `reset_noise` is called.
    """

    def __init__(self, in_features, out_features, std_init=0.4):
        """
        Args:
            in_features (int): size of each input sample.
            out_features (int): size of each output sample.
            std_init (float): scale used to initialise the sigma parameters.
        """
        super(NoisyLinear, self).__init__()

        self.in_features  = in_features
        self.out_features = out_features
        self.std_init     = std_init

        # Tensors start uninitialised; reset_parameters / reset_noise fill them below.
        self.weight_mu    = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.register_buffer('weight_epsilon', torch.empty(out_features, in_features))

        self.bias_mu    = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer('bias_epsilon', torch.empty(out_features))

        self.reset_parameters()
        self.reset_noise()

    def forward(self, x):
        """Apply the layer, adding the current noise only in training mode."""
        if self.training:
            # The epsilon buffers are plain tensors that never require grad, so
            # they can be used directly without any Variable wrapper.
            weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon)
            bias   = self.bias_mu   + self.bias_sigma.mul(self.bias_epsilon)
        else:
            weight = self.weight_mu
            bias   = self.bias_mu

        return F.linear(x, weight, bias)

    def reset_parameters(self):
        """Initialise mu uniformly in +-1/sqrt(in_features) and sigma to a constant."""
        mu_range = 1 / math.sqrt(self.weight_mu.size(1))

        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))

        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))

    def reset_noise(self):
        """Draw fresh noise: weight noise is the outer product of two noise vectors."""
        epsilon_in  = self._scale_noise(self.in_features)
        epsilon_out = self._scale_noise(self.out_features)

        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
        # The bias noise is drawn independently of epsilon_out.
        self.bias_epsilon.copy_(self._scale_noise(self.out_features))

    def _scale_noise(self, size):
        """Return `size` samples of sign(x) * sqrt(|x|) with x ~ N(0, 1)."""
        x = torch.randn(size)
        x = x.sign().mul(x.abs().sqrt())
        return x

In [None]:
# Noise DQN
class NoisyDQN(nn.Module):
    """Q-network whose last two layers are NoisyLinear, so exploration comes from parameter noise."""

    def __init__(self, num_inputs, num_actions):
        """Build the network.

        Args:
            num_inputs (int): size of the state vector.
            num_actions (int): number of discrete actions (output width).
        """
        super(NoisyDQN, self).__init__()

        # Layer sizes come from the constructor arguments, not from the global `env`.
        self.linear = nn.Linear(num_inputs, 128)
        self.noisy1 = NoisyLinear(128, 128)
        self.noisy2 = NoisyLinear(128, num_actions)

    def forward(self, x):
        """Return Q-values for a batch of states."""
        x = F.relu(self.linear(x))
        x = F.relu(self.noisy1(x))
        x = self.noisy2(x)
        return x

    def act(self, state):
        """Return the greedy action (int) for a single state; there is no epsilon."""
        # `volatile=True` has no effect in current PyTorch; no_grad is its replacement.
        with torch.no_grad():
            state   = torch.FloatTensor(state).unsqueeze(0)
            q_value = self.forward(state)
        action = q_value.max(1)[1].item()
        return action

    def reset_noise(self):
        """Resample the noise of both noisy layers."""
        self.noisy1.reset_noise()
        self.noisy2.reset_noise()

In [None]:
beta_start = 0.4
beta_frames = 1000 


def beta_by_frame(frame_idx):
    """Importance-sampling exponent: linear ramp from beta_start, capped at 1.0."""
    ramped = beta_start + frame_idx * (1.0 - beta_start) / beta_frames
    return min(1.0, ramped)

In [None]:
current_model = NoisyDQN(env.observation_space.shape[0], env.action_space.n)
target_model  = NoisyDQN(env.observation_space.shape[0], env.action_space.n)

In [None]:
optimizer = optim.Adam(current_model.parameters(), lr=0.0001)

In [None]:
replay_buffer = PrioritizedReplayBuffer(10000, alpha=0.6)

In [None]:
# Synchronize current policy net and target net
update_target(current_model, target_model)

In [None]:
# Computing Temporal Difference Loss
def compute_td_loss(batch_size, beta):
    """One prioritised-replay optimisation step for the noisy networks.

    Uses the module-level `replay_buffer`, `current_model`, `target_model`,
    `optimizer` and `gamma`. After the step the sampled transitions get new
    priorities and both networks resample their noise.

    Args:
        batch_size (int): number of transitions to sample.
        beta (float): importance-sampling exponent passed to the buffer.

    Returns:
        torch.Tensor: scalar importance-weighted mean squared TD error.
    """
    # NOTE(review): this buffer is unpacked as (..., weights, indices), the
    # opposite order to NaivePrioritizedBuffer.sample; confirm against the
    # PrioritizedReplayBuffer implementation.
    state, action, reward, next_state, done, weights, indices = replay_buffer.sample(batch_size, beta)

    state      = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    done       = torch.FloatTensor(np.float32(done))
    weights    = torch.FloatTensor(weights)

    # Q(s, a) of the online network for the actions that were actually taken.
    q_value = current_model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # The target is a constant for the update, so no graph is built for it.
    with torch.no_grad():
        next_q_value     = target_model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    td_error = q_value - expected_q_value
    # The importance weights correct the bias of non-uniform sampling in the loss ...
    loss = (td_error.pow(2) * weights).mean()
    # ... but the new priorities are the raw |TD error| plus a small constant
    # (proportional prioritisation), not the weighted squared error.
    prios = td_error.detach().abs() + 1e-5

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    replay_buffer.update_priorities(indices, prios.cpu().numpy())
    # Fresh noise for the next action selection and the next target computation.
    current_model.reset_noise()
    target_model.reset_noise()

    return loss

In [None]:
# Training configuration
num_frames = 10000
batch_size = 32
gamma      = 0.99

losses = []
all_rewards = []
episode_reward = 0

In [None]:
# Start the first episode; `state` is carried across loop iterations.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # No epsilon here: act() takes only the state.
    action = current_model.act(state)
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        # Episode over: start a new one and record the finished episode's return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    # Train only once the buffer holds more transitions than one batch.
    if len(replay_buffer) > batch_size:
        # The importance-sampling exponent follows its own per-frame schedule.
        beta = beta_by_frame(frame_idx)
        loss = compute_td_loss(batch_size, beta)
        #losses.append(loss.data[0])
        losses.append(loss.item())
        
    # Redraw the reward and loss curves every 200 frames.
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
        
    # Copy the online weights into the target network every 1000 frames.
    if frame_idx % 1000 == 0:
        update_target(current_model, target_model)

### Rainbow: Combining Improvements in Deep Reinforcement Learning

- https://arxiv.org/pdf/1710.02298.pdf

In [None]:
class RainbowDQN(nn.Module):
    """Distributional dueling network with noisy heads.

    For every action the network outputs a categorical distribution over
    `num_atoms` return values evenly spaced between `Vmin` and `Vmax`.
    """

    def __init__(self, num_inputs, num_actions, num_atoms, Vmin, Vmax):
        """
        Args:
            num_inputs (int): size of the state vector.
            num_actions (int): number of discrete actions.
            num_atoms (int): number of atoms in each return distribution.
            Vmin (float): smallest return value on the support.
            Vmax (float): largest return value on the support.
        """
        super(RainbowDQN, self).__init__()

        self.num_inputs   = num_inputs
        self.num_actions  = num_actions
        self.num_atoms    = num_atoms
        self.Vmin         = Vmin
        self.Vmax         = Vmax

        self.linear1 = nn.Linear(num_inputs, 32)
        self.linear2 = nn.Linear(32, 64)

        # Value stream: one set of atom logits per state.
        self.noisy_value1 = NoisyLinear(64, 64)
        self.noisy_value2 = NoisyLinear(64, self.num_atoms)

        # Advantage stream: one set of atom logits per action.
        self.noisy_advantage1 = NoisyLinear(64, 64)
        self.noisy_advantage2 = NoisyLinear(64, self.num_atoms * self.num_actions)

    def forward(self, x):
        """Return atom probabilities of shape (batch, num_actions, num_atoms)."""
        batch_size = x.size(0)

        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))

        value = F.relu(self.noisy_value1(x))
        value = self.noisy_value2(value)

        advantage = F.relu(self.noisy_advantage1(x))
        advantage = self.noisy_advantage2(advantage)

        value     = value.view(batch_size, 1, self.num_atoms)
        advantage = advantage.view(batch_size, self.num_actions, self.num_atoms)

        # Dueling combination per atom, centring the advantages over the actions.
        x = value + advantage - advantage.mean(1, keepdim=True)
        # Normalise over the atom dimension; `dim` is given explicitly because
        # the implicit-dimension form of softmax is deprecated.
        x = F.softmax(x.view(-1, self.num_atoms), dim=1).view(-1, self.num_actions, self.num_atoms)

        return x

    def reset_noise(self):
        """Resample the noise of all four noisy layers."""
        self.noisy_value1.reset_noise()
        self.noisy_value2.reset_noise()
        self.noisy_advantage1.reset_noise()
        self.noisy_advantage2.reset_noise()

    def act(self, state):
        """Return the action with the highest expected return for a single state."""
        # Action selection needs no gradients.
        with torch.no_grad():
            state = torch.FloatTensor(state).unsqueeze(0)
            dist = self.forward(state).cpu()
        # Expected return per action: sum over atoms of probability * atom value.
        dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms)
        action = dist.sum(2).max(1)[1].numpy()[0]

        return action

In [None]:
num_atoms = 51
Vmin = -10
Vmax = 10

In [None]:
current_model = RainbowDQN(env.observation_space.shape[0], env.action_space.n, num_atoms, Vmin, Vmax)
target_model  = RainbowDQN(env.observation_space.shape[0], env.action_space.n, num_atoms, Vmin, Vmax)

In [None]:
optimizer = optim.Adam(current_model.parameters(), 0.001)

In [None]:
replay_buffer = ReplayBuffer(10000)

In [None]:
update_target(current_model, target_model)

In [None]:
def projection_distribution(next_state, rewards, dones, gamma=0.99):
    """Project the target network's next-state return distribution onto the fixed support.

    Uses the module-level `target_model`, `Vmin`, `Vmax` and `num_atoms`.

    Args:
        next_state (torch.Tensor): batch of next states.
        rewards (torch.Tensor): 1-D tensor of rewards, one per transition.
        dones (torch.Tensor): 1-D float tensor, 1.0 where the episode ended.
        gamma (float): discount factor applied to the support (default 0.99,
            the value that used to be hard-coded).

    Returns:
        torch.Tensor: projected target distribution of shape (batch, num_atoms).
    """
    batch_size  = next_state.size(0)

    delta_z = float(Vmax - Vmin) / (num_atoms - 1)
    support = torch.linspace(Vmin, Vmax, num_atoms)

    # Probabilities only; the target is a constant for the update.
    with torch.no_grad():
        next_probs = target_model(next_state).cpu()

    # Greedy next action by expected return (probabilities weighted by atom values).
    next_action = (next_probs * support).sum(2).max(1)[1]
    next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_probs.size(0),
                                                               1,
                                                               next_probs.size(2))
    # Gather the *probabilities* of the greedy action. Gathering from
    # probabilities * support would project values that are not a distribution.
    next_dist   = next_probs.gather(1, next_action).squeeze(1)

    rewards = rewards.unsqueeze(1).expand_as(next_dist)
    dones   = dones.unsqueeze(1).expand_as(next_dist)
    support = support.unsqueeze(0).expand_as(next_dist)

    # Bellman update of every atom, clipped to the support and expressed as a
    # fractional atom index b.
    Tz = rewards + (1 - dones) * gamma * support
    Tz = Tz.clamp(min=Vmin, max=Vmax)
    b  = (Tz - Vmin) / delta_z
    l  = b.floor().long()
    u  = b.ceil().long()

    # When b is an exact integer, l == u and both interpolation weights below
    # are zero, silently dropping that atom's mass (this always happens at the
    # clamped ends). Shift one of the indices so the full mass lands on b.
    l[(u > 0) & (l == u)] -= 1
    u[(l < (num_atoms - 1)) & (l == u)] += 1

    # Row offsets into the flattened (batch * num_atoms) output.
    offset = (torch.arange(batch_size) * num_atoms).unsqueeze(1).expand(batch_size, num_atoms)

    proj_dist = torch.zeros(next_dist.size())
    proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                  (next_dist * (u.float() - b)).view(-1))
    proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                  (next_dist * (b - l.float())).view(-1))

    return proj_dist

In [None]:
# Computing Temporal Difference Loss
def compute_td_loss(batch_size):
    """Run one optimisation step of the categorical (cross-entropy) TD loss.

    Samples ``batch_size`` transitions from the module-level ``replay_buffer``,
    builds the projected target distribution with ``projection_distribution``
    and minimises the cross-entropy against ``current_model``'s distribution
    for the taken actions. Noise of both networks is resampled afterwards.

    Returns:
        The scalar loss tensor of this step.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    # Plain tensors: the Variable wrapper and volatile=True have no effect in
    # PyTorch >= 0.4; gradient tracking for the target is disabled explicitly.
    state      = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    done       = torch.FloatTensor(np.float32(done))

    with torch.no_grad():
        proj_dist = projection_distribution(next_state, reward, done)

    dist = current_model(state)
    action = action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_atoms)
    dist = dist.gather(1, action).squeeze(1)
    # Keep probabilities away from 0 so log() stays finite.
    dist.data.clamp_(0.01, 0.99)
    loss = -(proj_dist * dist.log()).sum(1)
    loss = loss.mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    current_model.reset_noise()
    target_model.reset_noise()

    return loss

In [None]:
# Training configuration
num_frames = 15000
batch_size = 32
# NOTE(review): projection_distribution above hard-codes 0.99 and does not read gamma.
gamma      = 0.99

# Per-update losses, per-episode returns and the running return of the current episode.
losses = []
all_rewards = []
episode_reward = 0

In [None]:
# Training loop: act, store the transition, train once the buffer is large
# enough, and periodically plot / refresh the target network.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # act() is called with the state only (no epsilon argument).
    action = current_model.act(state)
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        # Episode finished: start a new one and record its return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        #losses.append(loss.data[0])
        losses.append(loss.item())
        
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
        
    # Refresh the target network every 1000 frames.
    if frame_idx % 1000 == 0:
        update_target(current_model, target_model)

### Noisy Networks for Exploration

- https://arxiv.org/pdf/1706.10295.pdf

In [None]:
class NoisyLinear(nn.Module):
    """Linear layer whose weights and biases are perturbed by learnable noise.

    In training mode the effective parameters are ``mu + sigma * epsilon``,
    where ``mu`` and ``sigma`` are learned and ``epsilon`` is a noise buffer
    refreshed by :meth:`reset_noise`. In eval mode only ``mu`` is used.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        std_init: scale used to initialise the sigma parameters.
    """

    def __init__(self, in_features, out_features, std_init=0.4):
        super(NoisyLinear, self).__init__()

        self.in_features  = in_features
        self.out_features = out_features
        self.std_init     = std_init

        # Buffers/parameters are allocated uninitialised and filled by
        # reset_parameters() / reset_noise() right below.
        self.weight_mu    = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.register_buffer('weight_epsilon', torch.empty(out_features, in_features))

        self.bias_mu    = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer('bias_epsilon', torch.empty(out_features))

        self.reset_parameters()
        self.reset_noise()

    def forward(self, x):
        """Apply the (noisy in training mode, mean-only in eval mode) linear map."""
        if self.training:
            # Buffers are plain tensors; no Variable wrapper is needed.
            weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
            bias   = self.bias_mu   + self.bias_sigma * self.bias_epsilon
        else:
            weight = self.weight_mu
            bias   = self.bias_mu

        return F.linear(x, weight, bias)

    def reset_parameters(self):
        """Initialise mu uniformly in +-1/sqrt(in_features) and sigma to a constant."""
        mu_range = 1 / math.sqrt(self.weight_mu.size(1))

        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))

        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))

    def reset_noise(self):
        """Resample the noise buffers (outer product of two noise vectors for the weights)."""
        epsilon_in  = self._scale_noise(self.in_features)
        epsilon_out = self._scale_noise(self.out_features)

        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
        # The bias noise is drawn independently of epsilon_out.
        self.bias_epsilon.copy_(self._scale_noise(self.out_features))

    def _scale_noise(self, size):
        """Return ``size`` samples of sign(x) * sqrt(|x|) with x ~ N(0, 1)."""
        x = torch.randn(size)
        x = x.sign().mul(x.abs().sqrt())
        return x

### C-51 Algorithm

A Distributional Perspective on Reinforcement Learning.

- https://arxiv.org/pdf/1707.06887.pdf

In [None]:
class CategoricalDQN(nn.Module):
    """C51 network: outputs a categorical value distribution per action.

    Args:
        num_inputs: size of the state vector.
        num_actions: number of discrete actions.
        num_atoms: number of atoms in the value distribution.
        Vmin: value of the first atom.
        Vmax: value of the last atom.
    """

    def __init__(self, num_inputs, num_actions, num_atoms, Vmin, Vmax):
        super(CategoricalDQN, self).__init__()

        self.num_inputs = num_inputs
        self.num_actions  = num_actions
        self.num_atoms    = num_atoms
        self.Vmin         = Vmin
        self.Vmax         = Vmax

        self.linear1 = nn.Linear(num_inputs, 128)
        self.linear2 = nn.Linear(128, 128)
        self.noisy1 = NoisyLinear(128, 512)
        self.noisy2 = NoisyLinear(512, self.num_actions * self.num_atoms)

    def forward(self, x):
        """Return atom probabilities shaped (batch, num_actions, num_atoms)."""
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.noisy1(x))
        x = self.noisy2(x)
        # Softmax over the atoms of each (sample, action) pair; dim is given
        # explicitly instead of relying on the deprecated implicit choice.
        x = F.softmax(x.view(-1, self.num_atoms), dim=1).view(-1, self.num_actions, self.num_atoms)
        return x

    def reset_noise(self):
        """Resample the noise of both noisy layers."""
        self.noisy1.reset_noise()
        self.noisy2.reset_noise()

    def act(self, state):
        """Return the action with the highest expected value for a single state."""
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            dist = self.forward(state).cpu()
        # Use this network's own support rather than module-level globals.
        dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms)
        action = dist.sum(2).max(1)[1].numpy()[0]
        return action

In [None]:
def projection_distribution(next_state, rewards, dones, gamma=0.99):
    """Compute the C51 target: the Bellman-shifted distribution projected on the support.

    Uses the module-level ``target_model``, ``Vmin``, ``Vmax`` and ``num_atoms``.

    Args:
        next_state: batch of next states; ``target_model`` output is indexed
            as (batch, action, atom).
        rewards: 1-D float tensor of rewards.
        dones: 1-D float tensor, 1.0 for terminal transitions.
        gamma: discount factor (default 0.99).

    Returns:
        (batch, num_atoms) tensor of projected probabilities.
    """
    batch_size  = next_state.size(0)

    delta_z = float(Vmax - Vmin) / (num_atoms - 1)
    support = torch.linspace(Vmin, Vmax, num_atoms)

    # Probabilities are kept separate from the expected values: the latter
    # only select the greedy action, the former are what gets projected.
    probs       = target_model(next_state).data.cpu()
    next_action = (probs * support).sum(2).max(1)[1]
    next_action = next_action.unsqueeze(1).unsqueeze(1).expand(probs.size(0), 1, probs.size(2))
    next_dist   = probs.gather(1, next_action).squeeze(1)

    rewards = rewards.unsqueeze(1).expand_as(next_dist)
    dones   = dones.unsqueeze(1).expand_as(next_dist)
    support = support.unsqueeze(0).expand_as(next_dist)

    Tz = rewards + (1 - dones) * gamma * support
    Tz = Tz.clamp(min=Vmin, max=Vmax)
    # Clamp protects the indices below from float rounding at the edges.
    b  = ((Tz - Vmin) / delta_z).clamp(0, num_atoms - 1)
    l  = b.floor().long()
    u  = b.ceil().long()
    # If b is an exact atom index then l == u and both weights (u - b) and
    # (b - l) are zero, which would discard the mass; move one neighbour.
    l[(u > 0) & (l == u)] -= 1
    u[(l < (num_atoms - 1)) & (l == u)] += 1

    # Integer row offsets into the flattened projection buffer.
    offset = (torch.arange(batch_size) * num_atoms).unsqueeze(1).expand(batch_size, num_atoms)

    proj_dist = torch.zeros(next_dist.size())
    proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1))
    proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1))

    return proj_dist

In [None]:
# Support of the categorical value distribution: num_atoms points spanning [Vmin, Vmax].
num_atoms = 51
Vmin = -10
Vmax = 10

In [None]:
# Online and target C51 networks, constructed with identical arguments.
current_model = CategoricalDQN(env.observation_space.shape[0], env.action_space.n, num_atoms, Vmin, Vmax)
target_model  = CategoricalDQN(env.observation_space.shape[0], env.action_space.n, num_atoms, Vmin, Vmax)

In [None]:
# Adam over the online network only; no learning rate is passed, so Adam's default is used.
optimizer = optim.Adam(current_model.parameters())

In [None]:
# Fresh buffer for this section; 10000 is presumably the capacity -- confirm against common.replay_buffer.
replay_buffer = ReplayBuffer(10000)

In [None]:
# Presumably synchronises the target network with the online one (helper defined elsewhere) -- confirm.
update_target(current_model, target_model)

In [None]:
# Computing Temporal Difference Loss
def compute_td_loss(batch_size):
    """Take one gradient step on the C51 cross-entropy loss.

    Draws ``batch_size`` transitions from the module-level ``replay_buffer``,
    computes the projected target distribution and the cross-entropy with the
    online network's distribution for the taken actions, then steps the
    module-level ``optimizer`` and resamples the noise of both networks.

    Returns:
        The scalar loss tensor.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    # Variable(..., volatile=True) is a no-op since PyTorch 0.4; use plain
    # tensors and disable gradient tracking for the target explicitly.
    state      = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    done       = torch.FloatTensor(np.float32(done))

    with torch.no_grad():
        proj_dist = projection_distribution(next_state, reward, done)

    dist = current_model(state)
    action = action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_atoms)
    dist = dist.gather(1, action).squeeze(1)
    # Bound the probabilities so that log() below stays finite.
    dist.data.clamp_(0.01, 0.99)
    loss = - (proj_dist * dist.log()).sum(1).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    current_model.reset_noise()
    target_model.reset_noise()

    return loss

In [None]:
# Training configuration
# NOTE(review): num_frames is not set in this cell although the loop below
# reads it; it keeps whatever value an earlier cell assigned.
batch_size = 32
gamma      = 0.99

# Per-update losses, per-episode returns and the running return of the current episode.
losses = []
all_rewards = []
episode_reward = 0

In [None]:
# C51 training loop: same structure as the Rainbow loop, but the target
# network is refreshed every 100 frames.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    action = current_model.act(state)
    
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    
    state = next_state
    episode_reward += reward
    
    if done:
        # Episode finished: reset the environment and log the return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        
    # Train only once the buffer holds more than one batch of transitions.
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        #losses.append(loss.data[0])
        losses.append(loss.item())
        
    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
        
    if frame_idx % 100 == 0:
        update_target(current_model, target_model)

### Hierarchical DQN

- https://arxiv.org/pdf/1604.06057.pdf

In [None]:
class StochasticMDP:
    """Six-state chain environment with a stochastic "right" action.

    States are numbered 1..num_states and observed as one-hot vectors. Every
    episode starts in state 2 and terminates in state 1. Reaching the last
    state sets ``end``; the terminal reward is 1.0 if ``end`` was set and
    0.01 otherwise.
    """

    def __init__(self):
        self.end           = False
        self.current_state = 2
        self.num_actions   = 2
        self.num_states    = 6
        self.p_right       = 0.5

    def _observation(self):
        """One-hot encoding of ``current_state`` (state k -> index k - 1)."""
        one_hot = np.zeros(self.num_states)
        one_hot[self.current_state - 1] = 1.
        return one_hot

    def reset(self):
        """Start a new episode in state 2 and return its observation."""
        self.end = False
        self.current_state = 2
        return self._observation()

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Action 0 always moves left. Action 1 moves right with probability
        ``p_right`` (unless already in the last state) and left otherwise.
        Nothing moves once state 1 has been reached.
        """
        if self.current_state != 1:
            if action == 1:
                # The random draw happens before the boundary check.
                goes_right = random.random() < self.p_right and self.current_state < self.num_states
                self.current_state += 1 if goes_right else -1
            elif action == 0:
                self.current_state -= 1

            if self.current_state == self.num_states:
                self.end = True

        observation = self._observation()

        if self.current_state != 1:
            return observation, 0.0, False, {}

        reward = 1.00 if self.end else 1.00/100.00
        return observation, reward, True, {}

In [None]:
class Net(nn.Module):
    """Two-layer MLP Q-network with epsilon-greedy action selection.

    Args:
        num_inputs: size of the input vector.
        num_outputs: number of discrete outputs (actions or goals).
    """

    def __init__(self, num_inputs, num_outputs):
        super(Net, self).__init__()

        # Remembered so that random exploration covers this network's own
        # output range (the meta-controller and controller differ in size).
        self.num_outputs = num_outputs

        self.layers = nn.Sequential(
            nn.Linear(num_inputs, 256),
            nn.ReLU(),
            nn.Linear(256, num_outputs)
        )

    def forward(self, x):
        """Return one value per output for each row of ``x``."""
        return self.layers(x)

    def act(self, state, epsilon):
        """Pick an output index epsilon-greedily for a single state.

        Returns:
            A Python int in ``range(num_outputs)``.
        """
        if random.random() > epsilon:
            state = torch.FloatTensor(state).unsqueeze(0)
            with torch.no_grad():
                action = self.forward(state).max(1)[1]
            # .item() yields a plain int rather than a 0-dim tensor.
            return action.item()
        else:
            return random.randrange(self.num_outputs)

In [None]:
# Rebinds the global env to the toy chain MDP used by the hierarchical-DQN section.
env = StochasticMDP()

In [None]:
# One goal per environment state; the primitive action count comes from the environment.
num_goals    = env.num_states
num_actions  = env.num_actions

In [None]:
# Controller networks: the input has size 2*num_goals because the training loop
# feeds the state one-hot concatenated with the goal one-hot.
# NOTE(review): target_model is not referenced by update() below.
model        = Net(2*num_goals, num_actions)
target_model = Net(2*num_goals, num_actions)

In [None]:
# Meta-controller networks: input is a state vector of size num_goals, one output per goal.
# NOTE(review): target_meta_model is not referenced by update() below.
meta_model        = Net(num_goals, num_goals)
target_meta_model = Net(num_goals, num_goals)

In [None]:
# Optimiser for the controller network (Adam with default settings).
optimizer      = optim.Adam(model.parameters())

In [None]:
# Optimiser for the meta-controller network (Adam with default settings).
meta_optimizer = optim.Adam(meta_model.parameters())

In [None]:
# Separate buffers: the controller stores per-step transitions, the
# meta-controller stores one transition per selected goal (see the training loop).
replay_buffer      = ReplayBuffer(10000)
meta_replay_buffer = ReplayBuffer(10000)

In [None]:
def to_onehot(x, num_states=6):
    """Return a one-hot vector of length ``num_states`` with index ``x - 1`` set.

    Args:
        x: value to encode; position ``x - 1`` is set to 1 (so ``x == 0``
            wraps around to the last position via negative indexing).
        num_states: length of the returned vector (default 6).

    Returns:
        A float numpy array of shape ``(num_states,)``.
    """
    oh = np.zeros(num_states)
    oh[x - 1] = 1.
    return oh

In [None]:
def update(model, optimizer, replay_buffer, batch_size):
    """Run one Q-learning step on ``model`` using a batch from ``replay_buffer``.

    Does nothing (returns ``None``) while the buffer holds fewer than
    ``batch_size`` items. The bootstrap target is computed with ``model``
    itself and uses a fixed discount of 0.99.

    Args:
        model: network mapping states to one Q-value per action.
        optimizer: optimiser over ``model``'s parameters.
        replay_buffer: object supporting ``len()`` and ``sample(batch_size)``
            returning ``(state, action, reward, next_state, done)``.
        batch_size: number of transitions per update.
    """
    if batch_size > len(replay_buffer):
        return
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    # Plain tensors replace Variable(..., volatile=True), which has no
    # effect in PyTorch >= 0.4.
    state      = torch.FloatTensor(state)
    next_state = torch.FloatTensor(next_state)
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    done       = torch.FloatTensor(done)

    q_value = model(state)
    q_value = q_value.gather(1, action.unsqueeze(1)).squeeze(1)

    # The target is a constant for the loss: build it without tracking gradients.
    with torch.no_grad():
        next_q_value     = model(next_state).max(1)[0]
        expected_q_value = reward + 0.99 * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Exploration schedule: epsilon decays exponentially from epsilon_start
# towards epsilon_final with time constant epsilon_decay (in frames).
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500


def epsilon_by_frame(frame_idx):
    """Return the exploration rate for ``frame_idx``.

    The module-level ``epsilon_start``, ``epsilon_final`` and
    ``epsilon_decay`` are read at call time.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

In [None]:
# Training configuration
num_frames = 100000
frame_idx  = 1

# Initial environment state and bookkeeping for per-episode returns.
state = env.reset()
done = False
all_rewards = []
episode_reward = 0

In [None]:
# Hierarchical DQN loop: the meta-controller picks a goal, the controller
# acts until the goal is reached or the episode ends, then the
# meta-controller's transition (with the accumulated extrinsic reward) is stored.
while frame_idx < num_frames:
    goal = meta_model.act(state, epsilon_by_frame(frame_idx))
    # NOTE(review): goal is compared with np.argmax(state) (0-based) below, while
    # to_onehot sets index goal - 1, so the goal encoding is shifted by one
    # relative to the state encoding -- confirm this is intended.
    onehot_goal  = to_onehot(goal)
    
    meta_state = state
    extrinsic_reward = 0
    
    # frame_idx only advances inside this inner loop.
    while not done and goal != np.argmax(state):
        goal_state  = np.concatenate([state, onehot_goal])
        action = model.act(goal_state, epsilon_by_frame(frame_idx))
        next_state, reward, done, _ = env.step(action)

        episode_reward   += reward
        extrinsic_reward += reward
        # The controller is rewarded only for reaching the selected goal.
        intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0

        replay_buffer.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, onehot_goal]), done)
        state = next_state
        
        update(model, optimizer, replay_buffer, 32)
        update(meta_model, meta_optimizer, meta_replay_buffer, 32)
        frame_idx += 1
        
        if frame_idx % 1000 == 0:
            clear_output(True)
            n = 100 #mean reward of last 100 episodes
            plt.figure(figsize=(20,5))
            plt.title(frame_idx)
            # Plots the mean return of consecutive, non-overlapping chunks of n episodes.
            plt.plot([np.mean(all_rewards[i:i + n]) for i in range(0, len(all_rewards), n)])
            plt.show()

    meta_replay_buffer.push(meta_state, goal, extrinsic_reward, state, done)
        
    if done:
        state = env.reset()
        done  = False
        all_rewards.append(episode_reward)
        episode_reward = 0

### Distributional Reinforcement Learning with Quantile Regression

- https://arxiv.org/pdf/1710.10044.pdf

In [None]:
# Switch the global env back to CartPole for the QR-DQN section.
env_id = "CartPole-v0"
env = gym.make(env_id)

In [None]:
# NOTE(review): the result is bound to obs, which is not read anywhere in the visible part of this section.
obs = env.reset() # random initialization of environment

In [None]:
class QRDQN(nn.Module):
    """Quantile-regression DQN: predicts ``num_quants`` quantile values per action.

    Args:
        num_inputs: size of the state vector.
        num_actions: number of discrete actions.
        num_quants: number of quantiles predicted for each action.
    """

    def __init__(self, num_inputs, num_actions, num_quants):
        super(QRDQN, self).__init__()

        self.num_inputs  = num_inputs
        self.num_actions = num_actions
        self.num_quants  = num_quants

        self.features = nn.Sequential(
            nn.Linear(num_inputs, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, self.num_actions * self.num_quants)
        )

    def forward(self, x):
        """Return quantile values shaped (batch, num_actions, num_quants)."""
        batch_size = x.size(0)

        x = self.features(x)
        x = x.view(batch_size, self.num_actions, self.num_quants)

        return x

    def q_values(self, x):
        """Return Q-values as the mean over quantiles, shaped (batch, num_actions)."""
        x = self.forward(x)
        return x.mean(2)

    def reset_noise(self):
        """Resample noise in any sub-module that exposes ``reset_noise``.

        The network currently has no noisy layers, so this is a no-op; the
        previous version referenced layers that are not created and raised
        ``AttributeError`` when called.
        """
        for module in self.modules():
            if module is not self and hasattr(module, 'reset_noise'):
                module.reset_noise()

    def act(self, state, epsilon):
        """Pick an action epsilon-greedily for a single state."""
        if random.random() > epsilon:
            state = torch.FloatTensor(np.array(state, dtype=np.float32)).unsqueeze(0)
            # Inference only: no gradient tracking needed.
            with torch.no_grad():
                qvalues = self.forward(state).mean(2)
            action  = qvalues.max(1)[1]
            action  = action.cpu().numpy()[0]
        else:
            action = random.randrange(self.num_actions)
        return action

In [None]:
def projection_distribution(dist, next_state, reward, done, gamma=0.99):
    """Build the quantile-regression targets and the per-element quantile fractions.

    Uses the module-level ``target_model`` to evaluate ``next_state``.

    Args:
        dist: (batch, num_quant) quantile values predicted for the taken actions.
        next_state: batch of next states for ``target_model``.
        reward: 1-D float tensor of rewards.
        done: 1-D float tensor, 1.0 for terminal transitions.
        gamma: discount factor (default 0.99).

    Returns:
        ``(tau, expected_quant)``, both of shape (batch, num_quant).
        ``expected_quant`` carries no gradient.
    """
    # Sizes come from the input itself rather than from module-level globals.
    batch_size, num_quant = dist.size(0), dist.size(1)

    with torch.no_grad():
        next_dist = target_model(next_state)
        # Greedy next action by mean quantile value.
        next_action = next_dist.mean(2).max(1)[1]
        next_action = next_action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_quant)
        next_dist = next_dist.gather(1, next_action).squeeze(1).cpu()

        expected_quant = reward.unsqueeze(1) + gamma * next_dist * (1 - done.unsqueeze(1))

    quant_idx = torch.sort(dist, 1, descending=False)[1].cpu()

    # Quantile midpoints (2i + 1) / (2 * num_quant).
    tau_hat = torch.linspace(0.0, 1.0 - 1./num_quant, num_quant) + 0.5 / num_quant
    # tau[i, k] = tau_hat[quant_idx[i, k]]; direct indexing avoids building a
    # (batch, batch, num_quant) intermediate.
    tau = tau_hat[quant_idx]

    return tau, expected_quant

In [None]:
# Number of quantiles predicted per action.
num_quant = 51
# NOTE(review): Vmin / Vmax are not read by the QR-DQN code visible in this section.
Vmin = -10
Vmax = 10

In [None]:
# Online and target quantile networks, constructed with identical arguments.
current_model = QRDQN(env.observation_space.shape[0], env.action_space.n, num_quant)
target_model  = QRDQN(env.observation_space.shape[0], env.action_space.n, num_quant)

In [None]:
# Adam over the online network with default hyper-parameters.
optimizer = optim.Adam(current_model.parameters())

In [None]:
# Fresh buffer for the QR-DQN run; 10000 is presumably its capacity -- confirm.
replay_buffer = ReplayBuffer(10000)

In [None]:
# Presumably copies the online weights into the target network (helper defined elsewhere) -- confirm.
update_target(current_model, target_model)

In [None]:
## Computing Temporal Difference Loss
def compute_td_loss(batch_size):
    """Take one gradient step on the quantile Huber loss.

    Samples ``batch_size`` transitions from the module-level ``replay_buffer``,
    gets targets and quantile fractions from ``projection_distribution`` and
    updates ``current_model`` with gradient-norm clipping at 0.5.

    Returns:
        The scalar loss tensor.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    # Variable(..., volatile=True) has no effect in PyTorch >= 0.4.
    state      = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action     = torch.LongTensor(action)
    reward     = torch.FloatTensor(reward)
    done       = torch.FloatTensor(np.float32(done))

    dist = current_model(state)
    action = action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_quant)
    dist = dist.gather(1, action).squeeze(1)

    # Targets and quantile fractions are constants for the loss.
    with torch.no_grad():
        tau, expected_quant = projection_distribution(dist, next_state, reward, done)
    k = 1

    u = expected_quant - dist
    # Huber loss with threshold k: quadratic for |u| <= k, linear beyond.
    huber_loss = 0.5 * u.abs().clamp(min=0.0, max=k).pow(2)
    huber_loss += k * (u.abs() - u.abs().clamp(min=0.0, max=k))
    quantile_loss = (tau - (u < 0).float()).abs() * huber_loss
    loss = quantile_loss.sum() / num_quant

    optimizer.zero_grad()
    loss.backward()
    # clip_grad_norm_ is the in-place replacement for the deprecated clip_grad_norm.
    nn.utils.clip_grad_norm_(current_model.parameters(), 0.5)
    optimizer.step()

    return loss

In [None]:
# Epsilon-greedy schedule for the QR-DQN run: exponential decay from
# epsilon_start to epsilon_final with time constant epsilon_decay frames.
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500


def epsilon_by_frame(frame_idx):
    """Return the exploration rate at ``frame_idx``.

    Reads ``epsilon_start``, ``epsilon_final`` and ``epsilon_decay`` from
    module scope each time it is called.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

In [None]:
# Training configuration
num_frames = 10000
batch_size = 32
# NOTE(review): the QR-DQN target above uses a literal 0.99 and does not read gamma.
gamma      = 0.99

# Per-update losses, per-episode returns and the running return of the current episode.
losses = []
all_rewards = []
episode_reward = 0

In [None]:
# QR-DQN training loop.
# Start from a fresh CartPole observation: without this reset, `state` would
# still hold whatever the previous section left behind.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    action = current_model.act(state, epsilon_by_frame(frame_idx))

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        # Episode finished: reset the environment and record its return.
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    # Train only once the buffer holds more than one batch of transitions.
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        losses.append(loss.item())

    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)

    # Refresh the target network every 1000 frames.
    if frame_idx % 1000 == 0:
        update_target(current_model, target_model)

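### Actor-Critic - Synchronous Advantage Actor Critic (A2C)

A2C is the synchronous variant of the Asynchronous Advantage Actor Critic (A3C) algorithm: the parallel environments are stepped in lockstep and their experience is combined into a single update. A3C itself combines a few key ideas: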

- An updating scheme that operates on fixed-length segments of experience (say, 20 timesteps) and uses these segments to compute estimators of the returns and advantage function.
- Architectures that share layers between the policy and value function.
- Asynchronous updates.

https://www.youtube.com/watch?v=3gboWbqaP5A

https://www.youtube.com/watch?v=G0L8SN02clA

In [None]:
def test_env(vis=False):
    """Play one episode in the global ``env`` with the global ``model``.

    Actions are sampled from the policy distribution returned by the model.

    Args:
        vis: when true, render the environment after reset and after every step.

    Returns:
        The sum of rewards collected during the episode.
    """
    total_reward = 0
    done = False
    state = env.reset()
    if vis:
        env.render()
    while not done:
        # Add a batch dimension of 1 before querying the model.
        batch = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(batch)
        chosen = dist.sample().cpu().numpy()[0]
        state, reward, done, _ = env.step(chosen)
        if vis:
            env.render()
        total_reward += reward
    return total_reward

In [None]:
def plot(frame_idx, rewards):
    """Redraw the reward curve, titled with the frame index and latest reward.

    Args:
        frame_idx: current frame counter, shown in the title.
        rewards: non-empty sequence of rewards to plot.
    """
    latest = rewards[-1]
    heading = 'frame %s. reward: %s' % (frame_idx, latest)
    # Replace the previous cell output instead of stacking figures.
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.subplot(131)
    plt.title(heading)
    plt.plot(rewards)
    plt.show()

In [None]:
# use cuda when available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")

In [None]:
# Number of environment copies created for SubprocVecEnv below, and the Gym id they use.
num_envs = 16
env_name = "CartPole-v0"

In [None]:
def make_env():
    """Return a zero-argument factory that builds a Gym env for the global ``env_name``.

    ``env_name`` is looked up when the factory is called, not when it is created.
    """
    def _factory():
        return gym.make(env_name)

    return _factory

In [None]:
# Build num_envs environment factories and hand them to SubprocVecEnv
# (from common.multiprocessing_env).
envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

In [None]:
# Separate single environment; test_env() plays its evaluation episodes on it.
env = gym.make(env_name)

In [None]:
# Neural Network
class ActorCritic(nn.Module):
    """Actor-critic with separate MLPs for a categorical policy and a state value.

    Args:
        num_inputs: size of the state vector.
        num_outputs: number of discrete actions.
        hidden_size: width of the single hidden layer in each head.
        std: accepted for signature compatibility; not used by this class.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        # The critic is created before the actor.
        value_layers = [
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.critic = nn.Sequential(*value_layers)

        policy_layers = [
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
            nn.Softmax(dim=1),
        ]
        self.actor = nn.Sequential(*policy_layers)

    def forward(self, x):
        """Return ``(Categorical policy, state value)`` for a batch of states."""
        state_value = self.critic(x)
        action_probs = self.actor(x)
        return Categorical(action_probs), state_value

In [None]:
def compute_returns(next_value, rewards, masks, gamma=0.99):
    """Compute discounted returns for a rollout, bootstrapping from ``next_value``.

    Args:
        next_value: value estimate of the state following the last step.
        rewards: per-step rewards, in time order.
        masks: per-step continuation masks (0 where the episode ended).
        gamma: discount factor.

    Returns:
        List of returns, one per step, in time order.
    """
    R = next_value
    returns = []

    # Accumulate backwards and reverse once at the end; inserting at the
    # front of the list on every step would be quadratic.
    for reward, mask in zip(reversed(rewards), reversed(masks)):
        R = reward + gamma * R * mask
        returns.append(R)

    returns.reverse()
    return returns

In [None]:
# Network dimensions read from the vectorised environment's spaces.
num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.n

In [None]:
# Set hyper params
hidden_size = 256
# NOTE(review): lr is not passed to the optimizer created below in this section -- confirm intended.
lr          = 3e-4
# Number of environment steps collected per update in the training loop.
num_steps   = 5

In [None]:
# Instantiate the actor-critic and move it to the selected device.
model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)

In [None]:
# Adam with its default learning rate.
# NOTE(review): the lr hyper-parameter defined above is not passed in -- confirm intended.
optimizer = optim.Adam(model.parameters())

In [None]:
# Loop budget, loop counter and the log of periodic evaluation rewards.
max_frames   = 20000
frame_idx    = 0
test_rewards = []

In [None]:
# A2C training loop: collect num_steps transitions from every parallel
# environment, then do one combined actor/critic update.
state = envs.reset()
while frame_idx < max_frames:

    log_probs = []
    values    = []
    rewards   = []
    masks     = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        # mask is 0 where an episode ended, so returns do not bootstrap across episodes.
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        state = next_state
        frame_idx += 1

        if frame_idx % 1000 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
            plot(frame_idx, test_rewards)

    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    returns = compute_returns(next_value, rewards, masks)

    # The Categorical log-probs are 1-D per step while returns/values are
    # column vectors. Reshape to a column so the product with the advantage
    # is element-wise instead of broadcasting to an (N, N) matrix.
    log_probs = torch.cat(log_probs).view(-1, 1)
    returns   = torch.cat(returns).detach()
    values    = torch.cat(values)

    advantage = returns - values

    actor_loss  = -(log_probs * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()

    loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Play one rendered evaluation episode with the trained model.
test_env(True)

### Generalized Advantage Estimation (GAE) - High-Dimensional Continuous Control

- https://arxiv.org/pdf/1506.02438.pdf

In [None]:
# GAE section: number of environment copies and the Gym id they use.
num_envs = 16
env_name = "Pendulum-v0"

def make_env():
    """Return a callable that creates a Gym environment for the current ``env_name``.

    The global ``env_name`` is resolved at call time of the returned function.
    """
    def _create():
        created = gym.make(env_name)
        return created

    return _create

# Vectorised training environments built from num_envs factories.
envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

# Separate single environment; test_env() plays its evaluation episodes on it.
env = gym.make(env_name)

In [None]:
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    """Compute GAE-based returns (advantage + value) for a rollout.

    Args:
        next_value: value estimate of the state after the last step.
        rewards: per-step rewards, in time order.
        masks: per-step continuation masks (0 where the episode ended).
        values: per-step value estimates (a list; it is not modified).
        gamma: discount factor.
        tau: GAE smoothing parameter.

    Returns:
        List with one return per step, in time order.
    """
    # Concatenation builds a new list, leaving the caller's ``values`` untouched.
    values = values + [next_value]
    gae = 0
    returns = []

    # Walk backwards, append, and reverse once at the end (front insertion
    # on every step would be quadratic).
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.append(gae + values[step])

    returns.reverse()
    return returns

In [None]:
# Configure input dimension
num_inputs  = envs.observation_space.shape[0]
# The action size is taken from shape[0] of the action space here (not from .n).
num_outputs = envs.action_space.shape[0]

# Set hyper params
hidden_size = 256
# NOTE(review): lr is not passed to the optimizer created below in this section -- confirm intended.
lr          = 3e-2
num_steps   = 20

In [None]:
# NOTE(review): no ActorCritic is defined inside this section, so this uses
# whichever definition was executed last. The one preceding this cell in the
# file builds a Categorical policy, while num_outputs above is read from a
# Box-style action space (shape[0]) -- confirm that a continuous (e.g. Normal)
# policy class is in scope when this runs.
model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)

In [None]:
# Adam with its default learning rate (lr defined above is not passed in).
optimizer = optim.Adam(model.parameters())

In [None]:
# Training configuration: loop budget, loop counter and evaluation log.
max_frames   = 100000
frame_idx    = 0
test_rewards = []

In [None]:
# Actor-critic training loop using GAE returns: collect num_steps transitions
# from the vectorised environments, then perform a single update.
state = envs.reset()
while frame_idx < max_frames:

    log_probs = []
    values    = []
    rewards   = []
    masks     = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()
        
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        # mask is 0 where an episode ended, cutting the bootstrap at episode boundaries.
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        
        state = next_state
        frame_idx += 1
        
        # Every 1000 frames: average 10 evaluation episodes and redraw the plot.
        if frame_idx % 1000 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
            plot(frame_idx, test_rewards)
            
    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)
    
    # NOTE(review): log_probs and advantage are multiplied below; this assumes
    # dist.log_prob returns a column per step matching values -- confirm for
    # the policy class in use.
    log_probs = torch.cat(log_probs)
    returns   = torch.cat(returns).detach()
    values    = torch.cat(values)

    advantage = returns - values

    # The advantage is detached for the policy term so only the critic term trains the value head.
    actor_loss  = -(log_probs * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()

    loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Play one rendered evaluation episode with the trained model.
test_env(True)

### Proximal Policy Optimization (PPO) Algorithm

- https://arxiv.org/abs/1707.06347

In [None]:
def plot(frame_idx, rewards):
    """Clear the cell output and draw ``rewards`` with a frame/latest-reward title.

    Args:
        frame_idx: frame counter displayed in the title.
        rewards: non-empty sequence of evaluation rewards.
    """
    title_text = 'frame %s. reward: %s' % (frame_idx, rewards[-1])
    clear_output(True)
    plt.figure(figsize=(20, 5))
    # First panel of a 1x3 grid.
    plt.subplot(131)
    plt.title(title_text)
    plt.plot(rewards)
    plt.show()
    
def test_env(vis=False):
    """Run a single evaluation episode on the global ``env`` using the global ``model``.

    Args:
        vis: render the environment after the reset and after each step when true.

    Returns:
        Total reward accumulated over the episode.
    """
    episode_return = 0
    finished = False
    state = env.reset()
    if vis:
        env.render()
    while not finished:
        # The model expects a batch, so wrap the single state.
        model_input = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(model_input)
        sampled = dist.sample().cpu().numpy()[0]
        state, reward, finished, _ = env.step(sampled)
        if vis:
            env.render()
        episode_return += reward
    return episode_return

In [None]:
# PPO section: number of environment copies and the Gym id they use.
num_envs = 16
env_name = "Pendulum-v0"

def make_env():
    """Build a deferred constructor for a Gym environment named by the global ``env_name``."""
    def _make():
        # env_name is read here, when the environment is actually created.
        return gym.make(env_name)

    return _make

# Vectorised training environments built from num_envs factories.
envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

# Separate single environment; test_env() plays its evaluation episodes on it.
env = gym.make(env_name)

In [None]:
def init_weights(m):
    """Initialise ``nn.Linear`` modules in place; leave every other module untouched.

    Weights are drawn from N(0, 0.1) and biases are set to 0.1. Intended to be
    passed to ``Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0., std=0.1)
    nn.init.constant_(m.bias, 0.1)
        

class ActorCritic(nn.Module):
    """Actor-critic with a Gaussian policy and a state-independent log-std.

    Args:
        num_inputs: size of the state vector.
        num_outputs: dimensionality of the action.
        hidden_size: width of the hidden layer in each head.
        std: initial value of every entry of the learnable log-std.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        # The critic is created before the actor.
        critic_layers = [
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.critic = nn.Sequential(*critic_layers)

        actor_layers = [
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        ]
        self.actor = nn.Sequential(*actor_layers)

        initial_log_std = torch.ones(1, num_outputs) * std
        self.log_std = nn.Parameter(initial_log_std)

        # Re-initialise every Linear layer with init_weights.
        self.apply(init_weights)

    def forward(self, x):
        """Return ``(Normal policy, state value)`` for a batch of states."""
        state_value = self.critic(x)
        mean = self.actor(x)
        # Broadcast the shared log-std to the batch before exponentiating it into a std.
        scale = self.log_std.exp().expand_as(mean)
        return Normal(mean, scale), state_value

In [None]:
# Generalized Advantage Estimation (GAE)
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    """Return the GAE-based returns (advantage estimate + value) for each step.

    Args:
        next_value: value estimate of the state following the rollout.
        rewards: per-step rewards, in time order.
        masks: per-step continuation masks (0 where the episode ended).
        values: list of per-step value estimates; left unmodified.
        gamma: discount factor.
        tau: GAE smoothing parameter.

    Returns:
        List of returns in time order, one per step.
    """
    # New list: the caller's ``values`` is not mutated.
    values = values + [next_value]
    gae = 0
    returns = []

    # Build the list back-to-front with append and reverse once, instead of
    # an O(n) front insertion on every step.
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.append(gae + values[step])

    returns.reverse()
    return returns

In [None]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    """Yield random mini-batches drawn from the rollout tensors.

    Produces ``batch_size // mini_batch_size`` mini-batches. Row indices are
    drawn with ``np.random.randint``, i.e. with replacement, and the same
    indices are applied to all five tensors.

    Yields:
        Tuples ``(states, actions, log_probs, returns, advantage)`` restricted
        to the sampled rows.
    """
    total_rows = states.size(0)
    num_batches = total_rows // mini_batch_size
    sources = (states, actions, log_probs, returns, advantage)

    for _ in range(num_batches):
        picked = np.random.randint(0, total_rows, mini_batch_size)
        yield tuple(tensor[picked, :] for tensor in sources)

In [None]:
def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
    """Optimise the global ``model`` with the clipped PPO surrogate objective.

    For each of ``ppo_epochs`` passes, mini-batches from ``ppo_iter`` are used
    to take one step of the global ``optimizer`` on
    ``0.5 * critic_loss + actor_loss - 0.001 * entropy``.

    Args:
        ppo_epochs: number of passes over the rollout.
        mini_batch_size: rows per mini-batch.
        states, actions, log_probs, returns, advantages: rollout tensors
            sharing the same first dimension.
        clip_param: half-width of the clipping interval around ratio 1.
    """
    clip_low = 1.0 - clip_param
    clip_high = 1.0 + clip_param

    for _ in range(ppo_epochs):
        batches = ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages)
        for state, action, old_log_probs, return_, advantage in batches:
            dist, value = model(state)
            new_log_probs = dist.log_prob(action)
            entropy = dist.entropy().mean()

            # Probability ratio between the new and the data-collecting policy.
            ratio = torch.exp(new_log_probs - old_log_probs)
            unclipped = ratio * advantage
            clipped = ratio.clamp(clip_low, clip_high) * advantage

            actor_loss = - torch.min(unclipped, clipped).mean()
            critic_loss = (return_ - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [None]:
#obs = env.reset() # random initialization of environment

In [None]:
# Configure input dimension
num_inputs  = envs.observation_space.shape[0]
# Read from the single env here (the observation size above comes from envs).
num_outputs = env.action_space.shape[0]

# Set hyper params
hidden_size      = 256
lr               = 3e-4
num_steps        = 20
mini_batch_size  = 5
ppo_epochs       = 4
# The training loop stops early once the mean test reward exceeds this value.
threshold_reward = -200

In [None]:
# Instantiate the actor-critic and move it to the selected device.
model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)

In [None]:
# Adam using the lr hyper-parameter defined above.
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Training configuration: loop budget, loop counter and evaluation log.
max_frames = 15000
frame_idx  = 0
test_rewards = []

In [None]:
# PPO training loop: collect a rollout of num_steps per environment, compute
# GAE returns, then run ppo_update on the detached rollout tensors.
state = envs.reset()
early_stop = False

while frame_idx < max_frames and not early_stop:

    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    # NOTE(review): entropy is accumulated below but not read again in this loop.
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()
        
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        # mask is 0 where an episode ended, cutting the bootstrap at episode boundaries.
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        
        states.append(state)
        actions.append(action)
        
        state = next_state
        frame_idx += 1
        
        # Every 1000 frames: evaluate on 10 episodes and stop once the threshold is beaten.
        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env() for _ in range(10)]) 
            test_rewards.append(test_reward)
            plot(frame_idx, test_rewards)
            if test_reward > threshold_reward: early_stop = True
            

    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)

    # Everything handed to ppo_update is detached: the rollout acts as fixed
    # data and ppo_update re-evaluates the model on it.
    returns   = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)
    advantage = returns - values
    
    ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)

### Actor-Critic with Experience Replay (ACER) 

Sample Efficient Actor-Critic with Experience Replay (ACER) combines several ideas of previous algorithms: it uses multiple workers (as A2C), implements a replay buffer (as in DQN), uses Retrace for Q-value estimation, importance sampling and a trust region.

- https://arxiv.org/pdf/1611.01224.pdf
- https://stable-baselines.readthedocs.io/en/master/modules/acer.html

In [None]:
# Episodic Replay Buffer
class EpisodicReplayMemory(object):
    """Replay memory that stores transitions grouped by episode.

    Episodes live in a bounded deque holding at most
    ``capacity // max_episode_length`` entries; once it is full, opening a new
    episode evicts the oldest one.
    """

    def __init__(self, capacity, max_episode_length):
        self.num_episodes = capacity // max_episode_length
        self.buffer = deque(maxlen=self.num_episodes)
        # Start with one open (still empty) episode to push into.
        self.buffer.append([])
        self.position = 0

    def push(self, state, action, reward, policy, mask, done):
        """Append one transition to the open episode; open a new one on ``done``."""
        self.buffer[self.position].append((state, action, reward, policy, mask))
        if done:
            self.buffer.append([])
            # Once the deque is full the open episode is always the last slot.
            self.position = min(self.position + 1, self.num_episodes - 1)

    def sample(self, batch_size, max_len=None):
        """Draw ``batch_size`` distinct episodes and cut them to a common length.

        Args:
            batch_size: number of episodes to draw.
            max_len: optional upper bound on the returned sequence length; the
                length is always limited by the shortest drawn episode.

        Returns:
            A time-major list: one entry per step, each a list holding that
            step's transition from every drawn episode.

        Raises:
            ValueError: if fewer than ``batch_size`` non-empty episodes are stored.
        """
        # Only episodes that hold transitions can be sampled. The open episode
        # is empty right after a ``done``; drawing from the filtered list
        # replaces a retry-until-non-empty loop that never terminated when
        # every possible draw contained an empty episode.
        candidates = [episode for episode in self.buffer if episode]
        if batch_size > len(candidates):
            raise ValueError(
                "cannot sample %d episodes: only %d non-empty episodes stored"
                % (batch_size, len(candidates))
            )
        rand_episodes = random.sample(candidates, batch_size)
        min_len = min(len(episode) for episode in rand_episodes)

        if max_len:
            max_len = min(max_len, min_len)
        else:
            max_len = min_len

        episodes = []
        for episode in rand_episodes:
            # Pick a random window of max_len steps from longer episodes.
            if len(episode) > max_len:
                rand_idx = random.randint(0, len(episode) - max_len)
            else:
                rand_idx = 0

            episodes.append(episode[rand_idx:rand_idx + max_len])

        # Transpose from episode-major to time-major.
        return list(map(list, zip(*episodes)))

    def __len__(self):
        """Number of stored episodes, including the open one."""
        return len(self.buffer)

In [None]:
# Neural Network
class ActorCritic(nn.Module):
    """Discrete-action actor-critic with a softmax policy head and a Q-value head."""

    def __init__(self, num_inputs, num_actions, hidden_size=256):
        super().__init__()

        # Policy head: input -> probabilities over the actions.
        actor_layers = [
            nn.Linear(num_inputs, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, num_actions),
            nn.Softmax(dim=1),
        ]
        self.actor = nn.Sequential(*actor_layers)

        # Q head: input -> one value per action.
        critic_layers = [
            nn.Linear(num_inputs, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, num_actions),
        ]
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x):
        """Return ``(policy, q_value, value)`` for the input batch ``x``.

        ``value`` is the policy-weighted sum of the Q-values over the last axis.
        """
        probabilities = self.actor(x)
        # Upper clamp only; the bound 1 - 1e-20 evaluates to 1.0 as a Python float.
        policy = probabilities.clamp(max=1 - 1e-20)
        q_value = self.critic(x)
        weighted_q = policy * q_value
        value = weighted_q.sum(-1, keepdim=True)
        return policy, q_value, value

In [None]:
def test_env(render=False):
    """Play one episode with the global ``model`` and return the summed reward.

    Actions are sampled from the policy that ``model`` outputs. When ``render``
    is true the global ``env`` is drawn before the first step and after each step.
    """
    observation = env.reset()
    finished = False
    episode_return = 0
    if render:
        env.render()
    while not finished:
        obs_batch = torch.FloatTensor(observation).unsqueeze(0).to(device)
        policy, _q_value, _value = model(obs_batch)
        # Draw one action index from the policy probabilities.
        sampled = policy.multinomial(1)
        observation, reward, finished, _ = env.step(sampled.item())
        episode_return += reward
        if render:
            env.render()
    return episode_return

def plot(frame_idx, rewards):
    """Redraw the reward curve in the notebook output.

    Args:
        frame_idx: current frame counter, shown in the title.
        rewards: sequence of evaluation rewards; the last entry is shown in
            the title, so it must not be empty.
    """
    # Replace the previous figure instead of stacking a new one below it.
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    # A second, never-filled subplot(132) used to be created here; it only drew
    # an empty axes and the other plot helpers in this file do not have it.
    plt.show()

In [None]:
# Sample Efficient Actor-Critic with Experience Replay
def compute_acer_loss(policies, q_values, values, actions, rewards, retrace, masks, behavior_policies, gamma=0.99, truncation_clip=10, entropy_weight=0.0001):
    """Accumulate the ACER loss over a trajectory and apply one optimizer step.

    Steps are visited in reverse order so the Retrace target can be carried
    backwards from the bootstrap value. Uses the module-level ``optimizer``;
    nothing is returned.

    Args:
        policies: per-step action probabilities of the current policy.
        q_values: per-step Q-value estimates, one column per action.
        values: per-step state-value estimates.
        actions: per-step action indices (used as ``gather`` indices on dim 1).
        rewards: per-step rewards.
        retrace: bootstrap value used as the initial Retrace target.
        masks: per-step multipliers on the bootstrapped term (the callers in
            this file pass ``1 - done``).
        behavior_policies: per-step action probabilities of the policy that
            generated the data.
        gamma: discount factor.
        truncation_clip: upper bound applied to the importance weights.
        entropy_weight: coefficient of the entropy bonus.

    NOTE(review): tensors are presumably (batch, num_actions) or (batch, 1),
    inferred from the ``gather(1, ...)`` / ``sum(1)`` calls -- confirm.
    """
    loss = 0
    
    for step in reversed(range(len(rewards))):
        # Ratio of current to behaviour probabilities; no gradient flows through it.
        importance_weight = policies[step].detach() / behavior_policies[step].detach()

        # Discounted Retrace target for this step and the resulting advantage.
        retrace = rewards[step] + gamma * retrace * masks[step]
        advantage = retrace - values[step]

        # Policy-gradient term for the taken action, with the importance weight
        # capped at truncation_clip.
        log_policy_action = policies[step].gather(1, actions[step]).log()
        truncated_importance_weight = importance_weight.gather(1, actions[step]).clamp(max=truncation_clip)
        actor_loss = -(truncated_importance_weight * log_policy_action * advantage.detach()).mean(0)

        # Correction term summed over all actions; its weight is non-zero only
        # where the importance weight exceeds truncation_clip.
        correction_weight = (1 - truncation_clip / importance_weight).clamp(min=0)
        actor_loss -= (correction_weight * policies[step].log() * (q_values[step] - values[step]).detach()).sum(1).mean(0)
        
        # Entropy of the current policy, scaled by entropy_weight.
        entropy = entropy_weight * -(policies[step].log() * policies[step]).sum(1).mean(0)

        # Half squared error between the Retrace target and Q of the taken action.
        q_value = q_values[step].gather(1, actions[step])
        critic_loss = ((retrace - q_value) ** 2 / 2).mean(0)

        # Carry the target to the previous step, with the weight capped at 1.
        truncated_rho = importance_weight.gather(1, actions[step]).clamp(max=1)
        retrace = truncated_rho * (retrace - q_value.detach()) + values[step].detach()
        
        loss += actor_loss + critic_loss - entropy
        
    # Single gradient step on the loss summed over all steps.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
def off_policy_update(batch_size, replay_ratio=4):
    """Replay stored episodes through ``compute_acer_loss``.

    The number of replay updates is drawn from a Poisson distribution with mean
    ``replay_ratio``. Nothing happens until the global ``replay_buffer`` holds
    at least ``batch_size`` non-empty episodes.

    Args:
        batch_size: number of episodes sampled per update.
        replay_ratio: mean number of replay updates per call.
    """
    # Count only episodes that hold transitions. The former guard
    # (``batch_size > len(replay_buffer) + 1``) was off by one and also counted
    # the still-empty open episode, which the sampler cannot use.
    available = sum(1 for episode in replay_buffer.buffer if episode)
    if batch_size > available:
        return

    for _ in range(np.random.poisson(replay_ratio)):
        trajs = replay_buffer.sample(batch_size)
        # trajs is time-major (one list of transitions per step): concatenate
        # every field over the batch, then stack the steps along a new axis.
        state, action, reward, old_policy, mask = map(torch.stack, zip(*(map(torch.cat, zip(*traj)) for traj in trajs)))

        q_values = []
        values = []
        policies = []

        # Re-evaluate the current model on every replayed step.
        for step in range(state.size(0)):
            policy, q_value, value = model(state[step])
            q_values.append(q_value)
            policies.append(policy)
            values.append(value)

        # Bootstrap from the value of the last replayed state.
        _, _, retrace = model(state[-1])
        retrace = retrace.detach()
        compute_acer_loss(policies, q_values, values, action, reward, retrace, mask, old_policy)

In [None]:
# CartPole environment used by the ACER cells below.
env = gym.make("CartPole-v0")

In [None]:
# Network sized from the environment's observation and action spaces.
model = ActorCritic(env.observation_space.shape[0], env.action_space.n).to(device)

In [None]:
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# The episodic buffer keeps capacity // max_episode_length episodes.
capacity = 1000000
max_episode_length = 200
replay_buffer = EpisodicReplayMemory(capacity, max_episode_length)

In [None]:
# Training configuration
frame_idx    = 0      # frames collected so far
max_frames   = 10000  # training stops once frame_idx reaches this
num_steps    = 5      # environment steps collected between updates
log_interval = 100    # evaluate and plot when frame_idx is a multiple of this
test_rewards = []     # mean evaluation reward per logging point

In [None]:
# ACER training loop: collect num_steps transitions, do one on-policy update,
# then a random number of replay updates.
state = env.reset()
while frame_idx < max_frames:
    
    # Per-rollout storage, one entry per step.
    q_values = []
    values   = []
    policies = []
    actions  = []
    rewards  = []
    masks    = []
    
    for step in range(num_steps):
    
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        policy, q_value, value = model(state)
        
        # Sample one action index from the policy probabilities.
        action = policy.multinomial(1)
        next_state, reward, done, _ = env.step(action.item())
        
        reward = torch.FloatTensor([reward]).unsqueeze(1).to(device)
        # mask is 0 on the terminal step, 1 otherwise.
        mask   = torch.FloatTensor(1 - np.float32([done])).unsqueeze(1).to(device)
        # Store detached tensors so the replay buffer does not hold the graph.
        replay_buffer.push(state.detach(), action, reward, policy.detach(), mask, done)

        q_values.append(q_value)
        policies.append(policy)
        actions.append(action)
        rewards.append(reward)
        values.append(value)
        masks.append(mask)
        
        state = next_state
        if done:
            state = env.reset()
    
    # Bootstrap value from the state following the rollout.
    next_state = torch.FloatTensor(state).unsqueeze(0).to(device)
    _, _, retrace = model(next_state)
    retrace = retrace.detach()
    # On-policy update: the behaviour policy is the current policy itself.
    compute_acer_loss(policies, q_values, values, actions, rewards, retrace, masks, policies)
    
    off_policy_update(128)
    
    if frame_idx % log_interval == 0:
        test_rewards.append(np.mean([test_env() for _ in range(5)]))
        plot(frame_idx, test_rewards)
        
    frame_idx += num_steps


In [None]:
# Render one evaluation episode with the trained model.
test_env(True)

### Deep Deterministic Policy Gradient (DDPG)

Deep Deterministic Policy Gradient (DDPG) is an algorithm which concurrently learns a Q-function and a policy. It uses off-policy data and the Bellman equation to learn the Q-function, and uses the Q-function to learn the policy.

- https://arxiv.org/pdf/1509.02971.pdf
- https://spinningup.openai.com/en/latest/algorithms/ddpg.html
- https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#the-optimal-q-function-and-the-optimal-action

In [None]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index the next transition is written to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest slot once full."""
        if len(self.buffer) < self.capacity:
            # Still growing: make room for the slot written below.
            self.buffer.append(None)
        transition = (state, action, reward, next_state, done)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions as five stacked arrays."""
        picked = random.sample(self.buffer, batch_size)
        fields = zip(*picked)
        state, action, reward, next_state, done = (np.stack(field) for field in fields)
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

In [None]:
# Normalize action space
class NormalizedActions(gym.ActionWrapper):
    """Action wrapper that lets the agent act in [-1, 1] per dimension."""

    def action(self, action):
        """Rescale ``action`` from [-1, 1] to the action space's [low, high]."""
        low_bound   = self.action_space.low
        upper_bound = self.action_space.high

        action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
        # Out-of-range inputs are clipped to the action space's bounds.
        action = np.clip(action, low_bound, upper_bound)

        return action

    def reverse_action(self, action):
        """Rescale ``action`` from the action space's [low, high] back to [-1, 1]."""
        low_bound   = self.action_space.low
        upper_bound = self.action_space.high

        action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
        # The normalized range is [-1, 1], so clip to that rather than to the
        # environment bounds.
        action = np.clip(action, -1.0, 1.0)

        # Previously returned the undefined name ``actions`` (NameError).
        return action

In [None]:
# Ornstein-Uhlenbeck process - Adding time-correlated noise to the actions taken by the deterministic policy
class OUNoise(object):
    """Ornstein-Uhlenbeck noise process added to actions for exploration.

    ``sigma`` is interpolated linearly from ``max_sigma`` to ``min_sigma`` over
    ``decay_period`` values of ``t``.
    """

    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        # Process parameters.
        self.mu = mu
        self.theta = theta
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.sigma = max_sigma
        self.decay_period = decay_period
        # Action-space geometry used for sizing the noise and clipping.
        self.low = action_space.low
        self.high = action_space.high
        self.action_dim = action_space.shape[0]
        self.reset()

    def reset(self):
        """Reset the internal noise state to ``mu``."""
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        """Advance the process by one step and return the new state."""
        current = self.state
        pull_to_mean = self.theta * (self.mu - current)
        random_kick = self.sigma * np.random.randn(self.action_dim)
        self.state = current + (pull_to_mean + random_kick)
        return self.state

    def get_action(self, action, t=0):
        """Return ``action`` plus noise, clipped to the action-space bounds."""
        noise = self.evolve_state()
        # sigma is updated after the noise for this call has been drawn.
        progress = min(1.0, t / self.decay_period)
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * progress
        return np.clip(action + noise, self.low, self.high)
    
#https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py

In [None]:
def plot(frame_idx, rewards):
    """Redraw the reward curve, titled with the frame index and latest reward."""
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 3, 1)
    heading = 'frame %s. reward: %s' % (frame_idx, rewards[-1])
    plt.title(heading)
    plt.plot(rewards)
    plt.show()

In [None]:
# Continuous control with deep reinforcement learning
class ValueNetwork(nn.Module):
    """Critic that maps a (state, action) pair to a single value."""

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super().__init__()

        self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)

        # Output layer starts with small uniform weights and bias.
        for tensor in (self.linear3.weight, self.linear3.bias):
            tensor.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        """Concatenate state and action along dim 1 and return the critic output."""
        hidden = torch.cat([state, action], 1)
        for layer in (self.linear1, self.linear2):
            hidden = F.relu(layer(hidden))
        return self.linear3(hidden)
    

class PolicyNetwork(nn.Module):
    """Deterministic actor that maps a state to an action in [-1, 1]."""

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super(PolicyNetwork, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, num_actions)

        # Small uniform init for the output layer.
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        """Return the tanh-squashed action for a batch of states."""
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        # torch.tanh replaces the deprecated F.tanh alias; same result.
        x = torch.tanh(self.linear3(x))
        return x

    def get_action(self, state):
        """Return the action for a single un-batched state as a NumPy scalar.

        Only the first action component is returned (index ``[0, 0]``), so
        this helper is limited to one-dimensional action spaces.
        """
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        # Acting needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            action = self.forward(state)
        return action.detach().cpu().numpy()[0, 0]

In [None]:
# DDPG Update
def ddpg_update(batch_size, gamma=0.99, min_value=-np.inf, max_value=np.inf, soft_tau=1e-2):
    """Run one DDPG update on a minibatch drawn from the global ``replay_buffer``.

    Updates the actor, then the critic, then moves both target networks towards
    the online networks by a factor ``soft_tau``.

    Args:
        batch_size: number of transitions to sample.
        gamma: discount factor.
        min_value: lower clamp for the critic target.
        max_value: upper clamp for the critic target.
        soft_tau: interpolation factor for the target-network update.
    """
    batch = replay_buffer.sample(batch_size)
    state, action, reward, next_state, done = batch

    state = torch.FloatTensor(state).to(device)
    next_state = torch.FloatTensor(next_state).to(device)
    action = torch.FloatTensor(action).to(device)
    # reward and done get a trailing axis of size one.
    reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
    done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)

    # Actor objective: maximise the critic's value of the actor's own action.
    actor_value = value_net(state, policy_net(state))
    policy_loss = -actor_value.mean()

    # Critic target from the target networks; terminal steps do not bootstrap.
    target_action = target_policy_net(next_state)
    bootstrap = target_value_net(next_state, target_action.detach())
    td_target = reward + (1.0 - done) * gamma * bootstrap
    td_target = torch.clamp(td_target, min_value, max_value)

    predicted = value_net(state, action)
    value_loss = value_criterion(predicted, td_target.detach())

    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()

    # zero_grad also discards the critic gradients left by the actor backward pass.
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()

    # Soft (Polyak) update of both target networks.
    network_pairs = (
        (target_value_net, value_net),
        (target_policy_net, policy_net),
    )
    for target_net, online_net in network_pairs:
        for target_param, param in zip(target_net.parameters(), online_net.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
            )

In [None]:
# Pendulum environment wrapped so the agent's actions are rescaled.
env = NormalizedActions(gym.make("Pendulum-v0"))

In [None]:
# Ornstein-Uhlenbeck exploration noise with default parameters.
ou_noise = OUNoise(env.action_space)

In [None]:
state_dim  = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

In [None]:
# Online critic and actor.
hidden_dim = 256
value_net  = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

In [None]:
# Target copies of the critic and actor.
target_value_net  = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

In [None]:
# Start the target networks with the same parameters as the online networks.
for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
    target_param.data.copy_(param.data)

for target_param, param in zip(target_policy_net.parameters(), policy_net.parameters()):
    target_param.data.copy_(param.data)

In [None]:
# Separate learning rates and optimizers for critic and actor.
value_lr  = 1e-3
policy_lr = 1e-4

value_optimizer  = optim.Adam(value_net.parameters(),  lr=value_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)

In [None]:
# Critic loss.
value_criterion = nn.MSELoss()

In [None]:
# This rebinds the global replay_buffer name used by ddpg_update.
replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size)

In [None]:
max_frames  = 12000  # total environment steps to train for
max_steps   = 500    # upper bound on steps per episode
frame_idx   = 0      # environment steps taken so far
rewards     = []     # summed reward of each finished episode
batch_size  = 128    # minibatch size passed to ddpg_update

In [None]:
# DDPG training loop: one episode per outer iteration.
while frame_idx < max_frames:
    state = env.reset()
    ou_noise.reset()
    episode_reward = 0
    
    for step in range(max_steps):
        action = policy_net.get_action(state)
        # The within-episode step index is passed as the noise schedule's t.
        action = ou_noise.get_action(action, step)
        next_state, reward, done, _ = env.step(action)
        
        replay_buffer.push(state, action, reward, next_state, done)
        # Start learning once more than one minibatch of transitions is stored.
        if len(replay_buffer) > batch_size:
            ddpg_update(batch_size)
        
        state = next_state
        episode_reward += reward
        frame_idx += 1
        
        # plot reads rewards[-1], so at least one episode must have finished
        # by the time this fires.
        if frame_idx % max(1000, max_steps + 1) == 0:
            plot(frame_idx, rewards)
        
        if done:
            break
    
    rewards.append(episode_reward)

### Twin Delayed DDPG (TD3)

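A common failure mode of DDPG is that the learned Q-function overestimates Q-values, and the policy then exploits those errors. Twin Delayed DDPG (TD3) is an algorithm which addresses this issue by introducing three critical tricks: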

Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions.

Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently than the Q-function. The paper recommends one policy update for every two Q-function updates.

Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for the policy to exploit Q-function errors by smoothing out Q along changes in action.

Together, these three tricks result in substantially improved performance over baseline DDPG.

- https://arxiv.org/pdf/1802.09477.pdf
- https://spinningup.openai.com/en/latest/algorithms/td3.html

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")

In [None]:
# Replay Buffer
class ReplayBuffer:
    """Ring buffer holding at most ``capacity`` transitions."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # slot written by the next push

    def push(self, state, action, reward, next_state, done):
        """Write one transition at ``position`` and advance it, wrapping at capacity."""
        still_growing = len(self.buffer) < self.capacity
        if still_growing:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions; each field is stacked into an array."""
        chosen = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = [np.stack(column) for column in zip(*chosen)]
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

In [None]:
class NormalizedActions(gym.ActionWrapper):
    """Action wrapper that lets the agent act in [-1, 1] per dimension."""

    def action(self, action):
        """Rescale ``action`` from [-1, 1] to the action space's [low, high]."""
        low  = self.action_space.low
        high = self.action_space.high

        action = low + (action + 1.0) * 0.5 * (high - low)
        # Out-of-range inputs are clipped to the action space's bounds.
        action = np.clip(action, low, high)

        return action

    def reverse_action(self, action):
        """Rescale ``action`` from the action space's [low, high] back to [-1, 1]."""
        low  = self.action_space.low
        high = self.action_space.high

        action = 2 * (action - low) / (high - low) - 1
        # The normalized range is [-1, 1], so clip to that rather than to the
        # environment bounds.
        action = np.clip(action, -1.0, 1.0)

        # Previously returned the undefined name ``actions`` (NameError).
        return action

In [None]:
## Adding Gaussian Noise
class GaussianExploration(object):
    """Adds zero-mean Gaussian noise to actions and clips to the action bounds.

    The noise scale is interpolated linearly from ``max_sigma`` to
    ``min_sigma`` over ``decay_period`` values of ``t``.
    """

    def __init__(self, action_space, max_sigma=1.0, min_sigma=1.0, decay_period=1000000):
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.low = action_space.low
        self.high = action_space.high

    def get_action(self, action, t=0):
        """Return ``action`` plus noise scaled for time ``t``, clipped to the bounds."""
        progress = min(1.0, t / self.decay_period)
        sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * progress
        perturbed = action + np.random.normal(size=len(action)) * sigma
        return np.clip(perturbed, self.low, self.high)
    
#https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/gaussian_strategy.py

In [None]:
def soft_update(net, target_net, soft_tau=1e-2):
    """Move ``target_net``'s parameters towards ``net``'s, in place.

    Each target parameter becomes ``(1 - soft_tau) * target + soft_tau * source``;
    ``soft_tau=1.0`` copies the source parameters.
    """
    parameter_pairs = zip(target_net.parameters(), net.parameters())
    for target_param, source_param in parameter_pairs:
        blended = target_param.data * (1.0 - soft_tau) + source_param.data * soft_tau
        target_param.data.copy_(blended)
            
def plot(frame_idx, rewards):
    """Clear the cell output and draw ``rewards``, titled with the latest value."""
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 3, 1)
    title_text = 'frame %s. reward: %s' % (frame_idx, rewards[-1])
    plt.title(title_text)
    plt.plot(rewards)
    plt.show()

In [None]:
# Addressing Function Approximation Error in Actor-Critic Methods
class ValueNetwork(nn.Module):
    """Three-layer critic over the concatenated state and action."""

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super().__init__()

        joint_size = num_inputs + num_actions
        self.linear1 = nn.Linear(joint_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)

        # Re-initialise the output layer uniformly in [-init_w, init_w].
        output_layer = self.linear3
        output_layer.weight.data.uniform_(-init_w, init_w)
        output_layer.bias.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        """Return one value per row of the (state, action) batch."""
        joint = torch.cat([state, action], 1)
        first = F.relu(self.linear1(joint))
        second = F.relu(self.linear2(first))
        return self.linear3(second)
    

class PolicyNetwork(nn.Module):
    """Deterministic actor that maps a state to an action in [-1, 1]."""

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super(PolicyNetwork, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, num_actions)

        # Small uniform init for the output layer.
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        """Return the tanh-squashed action for a batch of states."""
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        # torch.tanh replaces the deprecated F.tanh alias; same result.
        x = torch.tanh(self.linear3(x))
        return x

    def get_action(self, state):
        """Return the action vector for a single un-batched state as a NumPy array."""
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        # Acting needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            action = self.forward(state)
        return action.detach().cpu().numpy()[0]

In [None]:
# Twin Delayed DDPG (TD3) update
def td3_update(step,
           batch_size,
           gamma = 0.99,
           soft_tau=1e-2,
           noise_std = 0.2,
           noise_clip=0.5,
           policy_update=2,
          ):
    """Run one TD3 update on a minibatch drawn from the global ``replay_buffer``.

    Both critics are updated on every call; the actor and all target networks
    are updated only when ``step`` is a multiple of ``policy_update``.

    Args:
        step: step counter used to schedule the delayed actor update.
        batch_size: number of transitions to sample.
        gamma: discount factor.
        soft_tau: interpolation factor for the target-network updates.
        noise_std: standard deviation of the target-action noise.
        noise_clip: bound applied to the target-action noise.
        policy_update: the actor is updated every this many calls.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = torch.FloatTensor(state).to(device)
    next_state = torch.FloatTensor(next_state).to(device)
    action     = torch.FloatTensor(action).to(device)
    reward     = torch.FloatTensor(reward).unsqueeze(1).to(device)
    done       = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)

    # Target policy smoothing: perturb the target action with clipped noise.
    next_action = target_policy_net(next_state)
    noise = torch.normal(torch.zeros(next_action.size()), noise_std).to(device)
    noise = torch.clamp(noise, -noise_clip, noise_clip)
    # Clip the smoothed action back into [-1, 1], the range of the tanh policy;
    # without this the target critics are queried outside the range the policy
    # can produce.
    next_action = torch.clamp(next_action + noise, -1.0, 1.0)

    # Clipped double-Q: bootstrap from the smaller of the two target critics.
    target_q_value1  = target_value_net1(next_state, next_action)
    target_q_value2  = target_value_net2(next_state, next_action)
    target_q_value   = torch.min(target_q_value1, target_q_value2)
    expected_q_value = reward + (1.0 - done) * gamma * target_q_value

    q_value1 = value_net1(state, action)
    q_value2 = value_net2(state, action)

    value_loss1 = value_criterion(q_value1, expected_q_value.detach())
    value_loss2 = value_criterion(q_value2, expected_q_value.detach())

    value_optimizer1.zero_grad()
    value_loss1.backward()
    value_optimizer1.step()

    value_optimizer2.zero_grad()
    value_loss2.backward()
    value_optimizer2.step()

    # Delayed update of the actor and of every target network.
    if step % policy_update == 0:
        policy_loss = value_net1(state, policy_net(state))
        policy_loss = -policy_loss.mean()

        policy_optimizer.zero_grad()
        policy_loss.backward()
        policy_optimizer.step()

        soft_update(value_net1, target_value_net1, soft_tau=soft_tau)
        soft_update(value_net2, target_value_net2, soft_tau=soft_tau)
        soft_update(policy_net, target_policy_net, soft_tau=soft_tau)

In [None]:
# Pendulum environment wrapped so the agent's actions are rescaled.
env = NormalizedActions(gym.make('Pendulum-v0'))

In [None]:
# Gaussian exploration noise with default parameters.
noise = GaussianExploration(env.action_space)

In [None]:
state_dim  = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

In [None]:
# Two online critics and one actor.
hidden_dim = 256
value_net1 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
value_net2 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

In [None]:
# Target copies of both critics and the actor.
target_value_net1 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
target_value_net2 = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

In [None]:
# soft_tau=1.0 copies the online parameters into the targets.
soft_update(value_net1, target_value_net1, soft_tau=1.0)
soft_update(value_net2, target_value_net2, soft_tau=1.0)
soft_update(policy_net, target_policy_net, soft_tau=1.0)

In [None]:
# Critic loss.
value_criterion = nn.MSELoss()

In [None]:
policy_lr = 1e-3
value_lr  = 1e-3

# One optimizer per network.
value_optimizer1 = optim.Adam(value_net1.parameters(), lr=value_lr)
value_optimizer2 = optim.Adam(value_net2.parameters(), lr=value_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)

In [None]:
# This rebinds the global replay_buffer name used by td3_update.
replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size)

In [None]:
max_frames  = 10000  # total environment steps to train for
max_steps   = 500    # upper bound on steps per episode
frame_idx   = 0      # environment steps taken so far
rewards     = []     # summed reward of each finished episode
batch_size  = 128    # minibatch size passed to td3_update

In [None]:
# TD3 training loop: one episode per outer iteration.
while frame_idx < max_frames:
    state = env.reset()
    episode_reward = 0
    
    for step in range(max_steps):
        action = policy_net.get_action(state)
        # The within-episode step index is passed as the noise schedule's t.
        action = noise.get_action(action, step)

        next_state, reward, done, _ = env.step(action)
        
        replay_buffer.push(state, action, reward, next_state, done)
        # Start learning once more than one minibatch of transitions is stored.
        # The within-episode step also schedules the delayed actor update.
        if len(replay_buffer) > batch_size:
            td3_update(step, batch_size)
        
        state = next_state
        episode_reward += reward
        frame_idx += 1
        
        # plot reads rewards[-1], so at least one episode must have finished
        # by the time this fires.
        if frame_idx % 1000 == 0:
            plot(frame_idx, rewards)
        
        if done:
            break
        
    rewards.append(episode_reward)

## Soft Actor Critic (SAC) 

Soft Actor Critic (SAC) is an off-policy, maximum-entropy algorithm that optimizes a stochastic policy, forming a bridge between stochastic policy optimization and DDPG-style approaches. This approach incorporates the clipped double-Q trick, and due to the inherent stochasticity of the policy in SAC, it also winds up benefiting from something like target policy smoothing.

A central feature of SAC is entropy regularization. The policy is trained to maximize a trade-off between expected return and entropy, a measure of randomness in the policy. This has a close connection to the exploration-exploitation trade-off: increasing entropy results in more exploration, which can accelerate learning later on. It can also prevent the policy from prematurely converging to a bad local optimum.

- https://arxiv.org/pdf/1802.09477.pdf
- https://arxiv.org/pdf/1801.01290.pdf
- https://spinningup.openai.com/en/latest/algorithms/sac.html

In [None]:
class ValueNetwork(nn.Module):
    """State-value network: maps a state to a single value."""

    def __init__(self, state_dim, hidden_dim, init_w=3e-3):
        super().__init__()

        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        # Output layer starts with small uniform weights and bias.
        for tensor in (self.linear3.weight, self.linear3.bias):
            tensor.data.uniform_(-init_w, init_w)

    def forward(self, state):
        """Return one value per row of the state batch."""
        hidden = F.relu(self.linear1(state))
        hidden = F.relu(self.linear2(hidden))
        return self.linear3(hidden)
        
        
class SoftQNetwork(nn.Module):
    """Q-network over the concatenated state and action."""

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super().__init__()

        joint_size = num_inputs + num_actions
        self.linear1 = nn.Linear(joint_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)

        # Re-initialise the output layer uniformly in [-init_w, init_w].
        head = self.linear3
        head.weight.data.uniform_(-init_w, init_w)
        head.bias.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        """Return one Q-value per row of the (state, action) batch."""
        features = torch.cat([state, action], 1)
        for layer in (self.linear1, self.linear2):
            features = F.relu(layer(features))
        return self.linear3(features)
        
        
class PolicyNetwork(nn.Module):
    """Stochastic actor producing a tanh-squashed Gaussian action."""

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_min=-20, log_std_max=2):
        super().__init__()

        # Bounds applied to the predicted log standard deviation.
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)

        # Two output heads, each re-initialised uniformly in [-init_w, init_w].
        self.mean_linear = nn.Linear(hidden_size, num_actions)
        self.mean_linear.weight.data.uniform_(-init_w, init_w)
        self.mean_linear.bias.data.uniform_(-init_w, init_w)

        self.log_std_linear = nn.Linear(hidden_size, num_actions)
        self.log_std_linear.weight.data.uniform_(-init_w, init_w)
        self.log_std_linear.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        """Return ``(mean, log_std)`` with ``log_std`` clamped to its bounds."""
        hidden = F.relu(self.linear1(state))
        hidden = F.relu(self.linear2(hidden))

        mean = self.mean_linear(hidden)
        raw_log_std = self.log_std_linear(hidden)
        log_std = torch.clamp(raw_log_std, self.log_std_min, self.log_std_max)
        return mean, log_std

    def evaluate(self, state, epsilon=1e-6):
        """Sample actions for a batch and return their log-probabilities.

        Returns:
            ``(action, log_prob, z, mean, log_std)`` where ``z`` is the Gaussian
            sample, ``action = tanh(z)`` and ``log_prob`` is summed over the
            last axis (kept as a size-one axis).
        """
        mean, log_std = self.forward(state)
        gaussian = Normal(mean, log_std.exp())
        z = gaussian.sample()
        action = torch.tanh(z)

        # Gaussian log-density minus the tanh change-of-variables term;
        # epsilon keeps the log argument positive.
        squash_term = torch.log(1 - action.pow(2) + epsilon)
        per_dim_log_prob = gaussian.log_prob(z) - squash_term
        log_prob = per_dim_log_prob.sum(-1, keepdim=True)

        return action, log_prob, z, mean, log_std

    def get_action(self, state):
        """Sample one action for a single un-batched state as a NumPy array."""
        batch = torch.FloatTensor(state).unsqueeze(0).to(device)
        mean, log_std = self.forward(batch)
        gaussian = Normal(mean, log_std.exp())
        squashed = torch.tanh(gaussian.sample())
        return squashed.detach().cpu().numpy()[0]

In [None]:
def soft_q_update(batch_size, gamma=0.99, mean_lambda=1e-3, std_lambda=1e-3, z_lambda=0.0, soft_tau=1e-2):
    """Run one SAC update on a minibatch drawn from the global ``replay_buffer``.

    Updates the soft Q-network, the value network and the policy in that order,
    then moves the target value network towards the value network.

    Args:
        batch_size: number of transitions to sample.
        gamma: discount factor.
        mean_lambda: weight of the penalty on the squared policy mean.
        std_lambda: weight of the penalty on the squared policy log-std.
        z_lambda: weight of the penalty on the squared Gaussian sample.
        soft_tau: interpolation factor for the target-network update.
    """
    batch = replay_buffer.sample(batch_size)
    state, action, reward, next_state, done = batch

    state = torch.FloatTensor(state).to(device)
    next_state = torch.FloatTensor(next_state).to(device)
    action = torch.FloatTensor(action).to(device)
    # reward and done get a trailing axis of size one.
    reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
    done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device)

    predicted_q = soft_q_net(state, action)
    predicted_v = value_net(state)
    new_action, log_prob, z, mean, log_std = policy_net.evaluate(state)

    # Q target: reward plus the discounted target value; terminal steps do not bootstrap.
    q_target = reward + (1 - done) * gamma * target_value_net(next_state)
    q_value_loss = soft_q_criterion(predicted_q, q_target.detach())

    # V target: Q of the freshly sampled action minus its log-probability.
    new_q = soft_q_net(state, new_action)
    v_target = new_q - log_prob
    value_loss = value_criterion(predicted_v, v_target.detach())

    # Policy term: log_prob weighted by a detached (log_prob - (Q - V)) factor.
    log_prob_target = new_q - predicted_v
    policy_term = (log_prob * (log_prob - log_prob_target).detach()).mean()

    # Regularisers on the policy outputs.
    mean_loss = mean_lambda * mean.pow(2).mean()
    std_loss = std_lambda * log_std.pow(2).mean()
    z_loss = z_lambda * z.pow(2).sum(1).mean()
    regulariser = mean_loss + std_loss + z_loss
    policy_loss = policy_term + regulariser

    update_steps = (
        (soft_q_optimizer, q_value_loss),
        (value_optimizer, value_loss),
        (policy_optimizer, policy_loss),
    )
    for step_optimizer, step_loss in update_steps:
        step_optimizer.zero_grad()
        step_loss.backward()
        step_optimizer.step()

    # Soft (Polyak) update of the target value network.
    for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - soft_tau) + param.data * soft_tau
        )

In [None]:
# Pendulum environment wrapped so the agent's actions are rescaled.
env = NormalizedActions(gym.make("Pendulum-v0"))

In [None]:
action_dim = env.action_space.shape[0]
state_dim  = env.observation_space.shape[0]

In [None]:
hidden_dim = 256

# State-value network and its target copy.
value_net        = ValueNetwork(state_dim, hidden_dim).to(device)
target_value_net = ValueNetwork(state_dim, hidden_dim).to(device)

In [None]:
# Soft Q-network and stochastic policy.
soft_q_net = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

In [None]:
# Start the target network with the same parameters as the value network.
for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
    target_param.data.copy_(param.data)

In [None]:
# Losses for the value network and the soft Q-network.
value_criterion  = nn.MSELoss()
soft_q_criterion = nn.MSELoss()

In [None]:
value_lr  = 3e-4
soft_q_lr = 3e-4
policy_lr = 3e-4

# One optimizer per network.
value_optimizer  = optim.Adam(value_net.parameters(), lr=value_lr)
soft_q_optimizer = optim.Adam(soft_q_net.parameters(), lr=soft_q_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)

In [None]:
# This rebinds the global replay_buffer name used by soft_q_update.
replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size)

In [None]:
max_frames  = 40000  # total environment steps to train for
max_steps   = 500    # upper bound on steps per episode
frame_idx   = 0      # environment steps taken so far
rewards     = []     # summed reward of each finished episode
batch_size  = 128    # minibatch size passed to soft_q_update

In [None]:
# Re-states the frame budget (same value as in the cell above).
max_frames  = 40000

In [None]:
# SAC training loop: one episode per outer iteration. Exploration comes from
# sampling the stochastic policy; no separate noise process is used.
while frame_idx < max_frames:
    state = env.reset()
    episode_reward = 0
    
    for step in range(max_steps):
        action = policy_net.get_action(state)
        next_state, reward, done, _ = env.step(action)
        
        replay_buffer.push(state, action, reward, next_state, done)
        # Start learning once more than one minibatch of transitions is stored.
        if len(replay_buffer) > batch_size:
            soft_q_update(batch_size)
        
        state = next_state
        episode_reward += reward
        frame_idx += 1
        
        # plot reads rewards[-1], so at least one episode must have finished
        # by the time this fires.
        if frame_idx % 1000 == 0:
            plot(frame_idx, rewards)
        
        if done:
            break
        
    rewards.append(episode_reward)

In [None]:
# NOTE(review): the test_env defined in the ACER section acts with the global
# `model` and samples via `policy.multinomial`, not with `policy_net` -- confirm
# which test_env definition is active when this cell runs.
test_env(True)

### Generative Adversarial Imitation Learning

- https://arxiv.org/abs/1606.03476

In [None]:
class Discriminator(nn.Module):
    """Three-layer MLP that maps an input vector to a single sigmoid output.

    In this file it is instantiated on concatenated state-action vectors and
    trained with a binary cross-entropy loss.

    Args:
        num_inputs: Width of each input row.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, num_inputs, hidden_size):
        super(Discriminator, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)
        # Scale down the output layer's initial weights and zero its bias so
        # the initial logits are small.
        self.linear3.weight.data.mul_(0.1)
        self.linear3.bias.data.mul_(0.0)

    def forward(self, x):
        """Return one value in [0, 1] per input row, shaped ``(..., 1)``."""
        # torch.tanh / torch.sigmoid are the supported spellings; the
        # F.tanh / F.sigmoid aliases warn as deprecated on older PyTorch.
        x = torch.tanh(self.linear1(x))
        x = torch.tanh(self.linear2(x))
        return torch.sigmoid(self.linear3(x))

In [None]:
def expert_reward(state, action):
    """Return ``-log(D(state, action))`` computed with the global discriminator.

    Args:
        state: Torch tensor of states; it is moved to CPU and converted to
            NumPy before being joined with ``action``.
        action: NumPy array of actions, concatenated to ``state`` along axis 1.

    Returns:
        NumPy array holding the negative log of the discriminator output for
        each row of the concatenated input.
    """
    state = state.cpu().numpy()
    state_action = torch.FloatTensor(np.concatenate([state, action], 1)).to(device)

    # The output is converted straight to NumPy, so run the forward pass
    # without autograd tracking instead of building a graph and dropping it.
    with torch.no_grad():
        prob = discriminator(state_action)
    return -np.log(prob.cpu().numpy())

In [None]:
# Length of the first axis of the observation and action spaces exposed by
# `envs`; used to size the actor-critic model and the discriminator input.
num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

In [None]:
# Hyperparameters for the imitation-learning experiment.
a2c_hidden_size      = 256   # hidden size passed to ActorCritic
discrim_hidden_size  = 128   # hidden size passed to Discriminator
lr                   = 3e-3  # learning rate shared by both Adam optimizers
num_steps            = 20    # environment steps collected per rollout
mini_batch_size      = 5     # mini-batch size passed to ppo_update
ppo_epochs           = 4     # presumably the epoch count for ppo_update — confirm at the call site
threshold_reward     = -200  # training stops once the mean test reward exceeds this

In [None]:
# Actor-critic policy model sized from the environment dimensions, on `device`.
model = ActorCritic(num_inputs, num_outputs, a2c_hidden_size).to(device)

In [None]:
# Discriminator whose input width is state size + action size, because it is
# fed concatenated state-action rows.
discriminator = Discriminator(num_inputs + num_outputs, discrim_hidden_size).to(device)

In [None]:
# Binary cross-entropy loss applied to the discriminator's sigmoid outputs.
discrim_criterion = nn.BCELoss()

In [None]:
# Separate Adam optimizers for the policy model and the discriminator, both
# using the shared learning rate `lr`.
optimizer  = optim.Adam(model.parameters(), lr=lr)
optimizer_discrim = optim.Adam(discriminator.parameters(), lr=lr)

In [None]:
# Training configuration and running state for the training loop.
test_rewards = []    # mean evaluation reward, appended every 1000 frames
max_frames = 100000  # frame budget checked by the loop's while condition
frame_idx = 0        # rollout steps taken so far

In [None]:
# Adversarial imitation training loop. Each iteration:
#   1. collects a `num_steps` rollout, replacing the environment reward with
#      the discriminator-based expert_reward;
#   2. computes GAE returns and, on every third iteration, runs a PPO update;
#   3. trains the discriminator to separate policy samples from expert samples.
i_update = 0
state = envs.reset()
early_stop = False

while frame_idx < max_frames and not early_stop:
    i_update += 1

    # Per-rollout buffers, recreated for every iteration.
    log_probs = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    entropy = 0

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)

        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
        # The environment reward is discarded; the policy is trained on the
        # discriminator-derived reward instead.
        reward = expert_reward(state, action.cpu().numpy())

        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).to(device))
        # Mask is 0 where an episode ended and 1 otherwise.
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

        states.append(state)
        actions.append(action)

        state = next_state
        frame_idx += 1

        # Every 1000 steps: evaluate over 10 test episodes, plot, and flag
        # early stopping once the mean test reward passes the threshold.
        if frame_idx % 1000 == 0:
            test_reward = np.mean([test_env() for _ in range(10)])
            test_rewards.append(test_reward)
            plot(frame_idx, test_rewards)
            if test_reward > threshold_reward: early_stop = True

    # Bootstrap the return computation from the value of the last next_state.
    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)

    returns   = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)
    advantage = returns - values

    # The policy is updated only on every third iteration, while the
    # discriminator below is updated on every iteration.
    if i_update % 3 == 0:
        # Use the ppo_epochs hyperparameter rather than a duplicated literal.
        ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)

    # Sample expert state-action rows (with replacement) for this update.
    expert_state_action = expert_traj[np.random.randint(0, expert_traj.shape[0], 2 * num_steps * num_envs), :]
    expert_state_action = torch.FloatTensor(expert_state_action).to(device)
    state_action        = torch.cat([states, actions], 1)
    fake = discriminator(state_action)
    real = discriminator(expert_state_action)
    optimizer_discrim.zero_grad()
    # Labels: policy-generated samples -> 1, expert samples -> 0.
    discrim_loss = discrim_criterion(fake, torch.ones((states.shape[0], 1)).to(device)) + \
            discrim_criterion(real, torch.zeros((expert_state_action.size(0), 1)).to(device))
    discrim_loss.backward()
    optimizer_discrim.step()

In [None]:
# Run one evaluation episode of the trained model via test_env, passing True.
# NOTE(review): test_env is defined elsewhere; the flag presumably enables
# rendering/visualisation — confirm against its definition.
test_env(True)

### Hindsight Experience Replay (HER)

Hindsight Experience Replay (HER) is so named because it replays experience (a technique often used in off-policy RL algorithms like DQN and DDPG) with goals that are chosen in hindsight, after the episode has finished. HER can therefore be combined with any off-policy RL algorithm (for example, DDPG).

TO BE ADDED!