A very simple example showing how to implement deep deterministic policy gradient (DDPG) using PyTorch. It only requires gym, PyTorch and NumPy to be installed to run this notebook; no external files or other libraries are needed, and everything needed to work is contained within this notebook. I believe code written this way is the most readable.

-freddy chua

In [1]:
import gym
from gym import wrappers
import torch
import torch.nn as nn
import torch.nn.init
import torch.nn.functional as F
from collections import namedtuple
from torch.autograd import Variable
import random
import numpy as np
from collections import deque

In [2]:
# Create the environment that every later cell trains on and queries for its
# observation/action sizes.
env = gym.make('Pendulum-v0')

[2017-07-10 10:28:59,553] Making new env: Pendulum-v0


In [None]:
# Wrap the environment in gym's Monitor. The cell prompt is "In [None]", i.e.
# this cell was not executed in the recorded run.
# NOTE(review): 'pendulum' is presumably the output directory and force=True
# presumably overwrites earlier results there -- confirm against the gym docs.
env = wrappers.Monitor(env, 'pendulum', force=True)

In [3]:
# Copied from https://github.com/ghliu/pytorch-ddpg/blob/master/random_process.py
import numpy as np

# [reference] https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py

class RandomProcess(object):
    """Common base class for the noise processes defined below it."""

    def reset_states(self):
        """Reset any internal state; the base implementation has none, so it does nothing."""
        return None

class AnnealedGaussianProcess(RandomProcess):
    """Noise process whose standard deviation decays linearly with the step count.

    ``current_sigma`` evaluates the line ``m * n_steps + c`` and never returns
    less than ``sigma_min``.  Passing ``sigma_min=None`` disables annealing:
    the slope is zero and the deviation stays at ``sigma``.
    """

    def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0  # step counter that current_sigma reads

        if sigma_min is None:
            # No annealing requested: flat line, floor equal to sigma itself.
            slope, floor = 0., sigma
        else:
            # Negative slope that brings sigma down to sigma_min after
            # n_steps_annealing steps.
            slope = -float(sigma - sigma_min) / float(n_steps_annealing)
            floor = sigma_min

        self.m = slope
        self.c = sigma  # intercept: the deviation at step 0
        self.sigma_min = floor

    @property
    def current_sigma(self):
        """Standard deviation for the current step, clamped below at ``sigma_min``."""
        annealed = self.m * float(self.n_steps) + self.c
        return max(self.sigma_min, annealed)


# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    """Temporally correlated noise from a discretised Ornstein-Uhlenbeck process.

    Each sample is the previous value pulled towards ``mu`` at rate ``theta``
    over a time step ``dt``, plus Gaussian noise scaled by ``current_sigma``.
    """

    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        super(OrnsteinUhlenbeckProcess, self).__init__(
            mu=mu,
            sigma=sigma,
            sigma_min=sigma_min,
            n_steps_annealing=n_steps_annealing,
        )
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        """Advance the process by one step, remember the new value and return it."""
        previous = self.x_prev
        drift = self.theta * (self.mu - previous) * self.dt
        diffusion = self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        current = previous + drift + diffusion
        self.x_prev = current
        self.n_steps += 1  # drives the sigma annealing schedule
        return current

    def reset_states(self):
        """Restart from ``x0``, or from ``np.zeros(size)`` when no ``x0`` was given."""
        if self.x0 is None:
            self.x_prev = np.zeros(self.size)
        else:
            self.x_prev = self.x0


In [4]:
# the memory
# One stored transition; next_state is None when the episode ended on this step.
Event = namedtuple('Event', ('state', 'action', 'next_state', 'reward'))


class Memory(object):
  """Fixed-capacity replay buffer; once full, the oldest events are dropped first."""

  def __init__(self, capacity):
    self.capacity = capacity
    # A bounded deque discards from the opposite end when appended to at maxlen.
    self.buffer = deque(maxlen=capacity)

  def add_event(self, event):
    """Append one event to the buffer."""
    self.buffer.append(event)

  def sample(self, batch_size):
    """Return a list of ``batch_size`` events drawn at random without replacement."""
    return random.sample(self.buffer, k=batch_size)

# end class

In [5]:
class Actor(nn.Module):
  """Deterministic policy network: maps a batch of states to actions in [-1, 1]."""

  def __init__(self, num_states, num_actions):
    super(Actor, self).__init__()
    self.fc1 = nn.Linear(num_states, 100)
    self.fc2 = nn.Linear(100, 50)
    self.fc3 = nn.Linear(50, 10)
    self.fc4 = nn.Linear(10, num_actions)

    # == parameters initialization ==
    # All weights first, then all biases, so random draws happen in a fixed order.
    layers = (self.fc1, self.fc2, self.fc3, self.fc4)
    for layer in layers:
      nn.init.xavier_normal(layer.weight)
    for layer in layers:
      nn.init.normal(layer.bias)
    # ===============================

  def forward(self, x):
    """Three ReLU hidden layers followed by a tanh output layer."""
    for hidden in (self.fc1, self.fc2, self.fc3):
      x = F.relu(hidden(x))
    return F.tanh(self.fc4(x))

In [6]:
class Critic(nn.Module):
  """Action-value network Q(s, a): one scalar value per (state, action) row.

  The state goes through the first layer on its own; the action is
  concatenated to that layer's output and takes part from the second layer on.
  """

  def __init__(self, num_states, num_actions):
    """Build the layers and initialise their parameters.

    Args:
      num_states: length of one state vector.
      num_actions: length of one action vector.
    """
    super(Critic, self).__init__()
    self.fc1 = nn.Linear(num_states, 100)
    self.fc2 = nn.Linear(100 + num_actions, 50)
    self.fc3 = nn.Linear(50, 10)
    # A Q-value is a single scalar regardless of how many action dimensions
    # there are, so the output width is 1 rather than num_actions.
    self.fc4 = nn.Linear(10, 1)

    # == parameters initialization ==
    # All weights first, then all biases, so random draws happen in a fixed order.
    layers = (self.fc1, self.fc2, self.fc3, self.fc4)
    for layer in layers:
      nn.init.xavier_normal(layer.weight)
    for layer in layers:
      nn.init.normal(layer.bias)
    # ===============================

  def forward(self, states, actions):
    """Return Q-values of shape (batch, 1).

    Args:
      states: 2-D tensor with num_states columns.
      actions: 2-D tensor with num_actions columns and the same number of rows.
    """
    x = F.relu(self.fc1(states))
    x = torch.cat((x, actions), 1) # actions only join at second layer
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    return self.fc4(x) # no activation on the output layer

In [7]:
# Read the network input/output sizes off the environment: the first dimension
# of the observation space and of the action space respectively.
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
print('num_states = {0}, num_actions = {1}'.format(num_states, num_actions))

num_states = 3, num_actions = 1


In [21]:
# Two copies of each network: the "eval" one is trained by its optimizer, the
# "target" one starts as an exact copy (load_state_dict) and is only moved
# towards the eval network by the tau-weighted update in the training loop.
eval_actor = Actor(num_states, num_actions)
target_actor = Actor(num_states, num_actions)
target_actor.load_state_dict(eval_actor.state_dict())

eval_critic = Critic(num_states, num_actions)
target_critic = Critic(num_states, num_actions)
target_critic.load_state_dict(eval_critic.state_dict())

In [22]:
batch_size = 64 # number of events sampled from the replay memory per training step; training starts once the memory holds this many
gamma = 0.99 # the parameter for discounting future rewards
tau = 0.001 # weight of the eval parameters in the target update: target <- tau * eval + (1 - tau) * target

In [23]:
# Exploration noise added to the actor's output at every environment step.
# sigma_min is left at its default of None, so sigma is not annealed.
random_process = OrnsteinUhlenbeckProcess(theta=0.15, sigma=0.2)

In [24]:
criterion = nn.MSELoss() # mean squared error, similar to least squared error
# Separate Adam optimizers: the critic uses a 10x larger learning rate than the
# actor and is the only one with weight decay.
critic_optimizer = torch.optim.Adam(eval_critic.parameters(), lr=1e-3, weight_decay=1e-2)
actor_optimizer = torch.optim.Adam(eval_actor.parameters(), lr=1e-4)

In [25]:
replay_memory = Memory(1000000) # create a replay memory with a capacity of 1,000,000 events
top_score = -1e9 # best episode reward so far; starts very low so the first episode replaces it

In [26]:
# Main DDPG loop: 500 episodes. Every environment step stores one transition
# and, once the replay memory holds at least batch_size events, performs one
# critic update, one actor update and one soft update of both target networks.
for i in range(500):
  current_state = env.reset() # initial observation: an array of num_states values
  done = False
  episode_reward = 0
  random_process.reset_states() # restart the exploration noise for the new episode
  while not done:
    # Action from the current policy for a batch of one state
    # (volatile=True is the old-PyTorch flag for inference without autograd history).
    action = eval_actor(Variable(torch.Tensor(current_state).unsqueeze_(0), volatile=True))
    action = torch.squeeze(action.data).numpy() + random_process.sample() # add Ornstein-Uhlenbeck exploration noise
    action = np.maximum(-2.0, np.minimum(action, 2.0)) # clip to [-2, 2]; this is a domain specific 'hack'

    next_state, reward, done, _ = env.step(action)
    episode_reward += reward
    # The last transition of an episode is stored with next_state=None so the
    # target computation below can skip bootstrapping for it.
    if done:
      replay_memory.add_event(Event(current_state.copy(), action, None, reward))
    else:
      replay_memory.add_event(Event(current_state.copy(), action, next_state.copy(), reward))
    # end if
    current_state = next_state

    # train
    if len(replay_memory.buffer) >= batch_size:
      # sample from replay memory
      mini_batch = replay_memory.sample(batch_size)
      mini_batch = Event(*zip(*mini_batch)) # list of Events -> one Event whose fields are tuples; do this for batch processing

      state_var = Variable(torch.Tensor(mini_batch.state))
      action_var = Variable(torch.FloatTensor(mini_batch.action))

      # Q(s, a) from the critic being trained
      estimated_value = eval_critic(state_var, action_var)

      # 1 where the sampled event has a next state, 0 where it ended an episode
      mask = torch.ByteTensor(tuple(map(lambda s: s is not None, mini_batch.next_state)))

      # NOTE(review): if every sampled event were terminal this list would be
      # empty -- confirm that case cannot break the forward pass below.
      valid_next_states = Variable(torch.Tensor([
        next_state for next_state in mini_batch.next_state if next_state is not None]))

      # Value of the next state under the target actor, judged by the target critic
      target_val = target_critic(valid_next_states, target_actor(valid_next_states))

      # Target: reward + gamma * target_val where a next state exists, reward alone otherwise
      targetted_value = Variable(torch.zeros(batch_size, 1))
      targetted_value[mask] = gamma * target_val
      targetted_value += Variable(torch.Tensor(mini_batch.reward).unsqueeze_(1))

      # gradient descent on the critic
      critic_optimizer.zero_grad()
      critic_loss = criterion(estimated_value, targetted_value.detach()) # minimize the mse difference; detach() keeps gradients out of the target networks
      critic_loss.backward()
      critic_optimizer.step()

      # gradient descent on the actor
      # (this backward pass also leaves gradients on eval_critic's parameters;
      # they are cleared by critic_optimizer.zero_grad() before the next critic update)
      actor_optimizer.zero_grad()
      actor_loss = - eval_critic(state_var, eval_actor(state_var)).mean() # maximize the value of taking action from the policy given by the actor
      actor_loss.backward()
      actor_optimizer.step()

#       print('critic_loss = {0}, actor_loss = {1}'.format(critic_loss.data[0], actor_loss.data[0]))

      # transfer the parameters from eval to target (soft update):
      # target <- tau * eval + (1 - tau) * target
      for target_param, eval_param in zip(target_critic.parameters(), eval_critic.parameters()):
        target_param.data.copy_(tau * eval_param.data + (1 - tau) * target_param.data)

      for target_param, eval_param in zip(target_actor.parameters(), eval_actor.parameters()):
        target_param.data.copy_(tau * eval_param.data + (1 - tau) * target_param.data)
    # end if
  # end while
  print('episode {0} reward = {1}'.format(i, episode_reward))
  top_score = max(top_score, episode_reward) # keep the best episode reward seen so far
# end for
print('top_score = {0}'.format(top_score))

episode 0 reward = -1505.9622768600152
episode 1 reward = -1205.9715649284817
episode 2 reward = -1079.8275605920855
episode 3 reward = -1696.0930437302766
episode 4 reward = -1597.0529856291969
episode 5 reward = -1350.0564437421583
episode 6 reward = -1719.5281040251912
episode 7 reward = -1317.970159530352
episode 8 reward = -1815.4557521248266
episode 9 reward = -1335.5395661386858
episode 10 reward = -1851.7267014092884
episode 11 reward = -1836.1471627973738
episode 12 reward = -1609.735701695334
episode 13 reward = -1391.7540127149682
episode 14 reward = -1704.4130037740833
episode 15 reward = -1713.7302530029344
episode 16 reward = -1587.307955765315
episode 17 reward = -1510.0334658416955
episode 18 reward = -1230.8279333135895
episode 19 reward = -1534.2239606043447
episode 20 reward = -1677.1168239033013
episode 21 reward = -1665.9501138458902
episode 22 reward = -1595.6248069209175
episode 23 reward = -1667.3355139008381
episode 24 reward = -1561.6668184902499
episode 25 re

episode 206 reward = -125.48724516292017
episode 207 reward = -125.63472582288126
episode 208 reward = -353.097783374991
episode 209 reward = -371.5771747951844
episode 210 reward = -485.4188861247438
episode 211 reward = -365.27888543973296
episode 212 reward = -376.75317418842013
episode 213 reward = -239.81869839440301
episode 214 reward = -126.22898446010946
episode 215 reward = -499.9342944791435
episode 216 reward = -124.50231298836826
episode 217 reward = -237.8446660963411
episode 218 reward = -363.709761580234
episode 219 reward = -125.96952747523983
episode 220 reward = -239.54521602518065
episode 221 reward = -477.6766978254853
episode 222 reward = -354.8604808903888
episode 223 reward = -125.17632272317486
episode 224 reward = -241.41704779019003
episode 225 reward = -238.53522730532003
episode 226 reward = -126.33842126721127
episode 227 reward = -124.70366976541735
episode 228 reward = -126.91087680336125
episode 229 reward = -125.79348878840023
episode 230 reward = -123.

episode 408 reward = -476.17599630369506
episode 409 reward = -123.11430100468958
episode 410 reward = -124.06640023921872
episode 411 reward = -373.39324771161625
episode 412 reward = -124.34235436565228
episode 413 reward = -585.8591314258287
episode 414 reward = -125.4752163870507
episode 415 reward = -236.8702359033666
episode 416 reward = -124.62705268611295
episode 417 reward = -355.27497061380416
episode 418 reward = -362.58610478881394
episode 419 reward = -126.75063343268106
episode 420 reward = -124.79286140429687
episode 421 reward = -493.11679808914937
episode 422 reward = -125.98015317560146
episode 423 reward = -362.3551727417037
episode 424 reward = -479.6187040344561
episode 425 reward = -240.69977171071844
episode 426 reward = -239.76515362377745
episode 427 reward = -466.3748391427061
episode 428 reward = -127.77768921211033
episode 429 reward = -126.9699787210834
episode 430 reward = -125.58586591280033
episode 431 reward = -126.1562619454706
episode 432 reward = -24

In [None]:
# Tear down the environment once training is finished.
# NOTE(review): render(close=True) presumably closes any open viewer window in
# the gym version this notebook targets -- confirm against that gym release.
env.render(close=True)
env.close()