A very simple example showing how to implement deep deterministic policy gradient (DDPG) using PyTorch. Only gym, PyTorch and NumPy need to be installed to run this notebook; no external files or other libraries are needed, because everything required is contained within this notebook. I believe code written this way is the most readable.

-freddy chua

In [1]:
import gym
from gym import wrappers
import torch
import torch.nn as nn
import torch.nn.init
import torch.nn.functional as F
from collections import namedtuple
from torch.autograd import Variable
import random
import numpy as np
from collections import deque

In [2]:
# Create the environment registered under the id 'Pendulum-v0'.
env = gym.make('Pendulum-v0')

[2017-07-09 18:25:07,288] Making new env: Pendulum-v0


In [3]:
# Wrap the environment in a Monitor that writes its recordings to the 'pendulum'
# directory; force=True lets it clear monitor files left there by a previous run.
# NOTE(review): this rebinds `env`, so re-running the cell without re-creating the
# environment first would wrap an already-wrapped env -- confirm that is acceptable.
env = wrappers.Monitor(env, 'pendulum', force=True)

[2017-07-09 18:25:07,318] Clearing 20 monitor files from previous run (because force=True was provided)


In [4]:
# replay memory: one Event is a single environment transition
Event = namedtuple('Event', ['state', 'action', 'next_state', 'reward'])

class Memory(object):
  """Bounded FIFO store of events with uniform random sampling.

  Once `capacity` events are held, appending a new event silently drops the
  oldest one (the behaviour of a deque created with maxlen).
  """

  def __init__(self, capacity):
    """Create an empty memory that keeps at most `capacity` events."""
    self.capacity = capacity
    self.buffer = deque([], capacity)

  def add_event(self, event):
    """Append `event`, evicting the oldest stored event when full."""
    self.buffer.append(event)

  def sample(self, batch_size):
    """Return a list of `batch_size` distinct stored events chosen at random.

    Raises ValueError (from random.sample) when fewer than `batch_size`
    events are stored.
    """
    drawn = random.sample(self.buffer, k=batch_size)
    return drawn

# end class

In [5]:
class Actor(nn.Module):
  """Deterministic policy network: maps a batch of states to a batch of actions.

  Four fully connected layers (num_states -> 100 -> 50 -> 10 -> num_actions)
  with ReLU between them and a tanh on the output, so every action component
  lies in [-1, 1].
  """

  def __init__(self, num_states, num_actions):
    """Build the layers and initialise their parameters.

    num_states  -- size of the state vector fed to the network
    num_actions -- size of the action vector it produces
    """
    super(Actor, self).__init__()
    self.fc1 = nn.Linear(num_states, 100)
    self.fc2 = nn.Linear(100, 50)
    self.fc3 = nn.Linear(50, 10)
    self.fc4 = nn.Linear(10, num_actions)

    # Xavier-normal weights first, then standard-normal biases; the two passes
    # are kept separate so random numbers are drawn in a fixed order
    # (all weights, then all biases).
    layers = (self.fc1, self.fc2, self.fc3, self.fc4)
    for layer in layers:
      nn.init.xavier_normal(layer.weight)
    for layer in layers:
      nn.init.normal(layer.bias)

  def forward(self, x):
    """Return the action for each state row of `x`, squashed into [-1, 1]."""
    hidden = x
    for layer in (self.fc1, self.fc2, self.fc3):
      hidden = F.relu(layer(hidden))
    return F.tanh(self.fc4(hidden))

In [6]:
class Critic(nn.Module):
  """Action-value network Q(s, a): scores a batch of (state, action) pairs.

  The state and action are concatenated and passed through four fully
  connected layers (num_states + num_actions -> 100 -> 50 -> 10 -> 1) with
  ReLU between them and no activation on the output.
  """

  def __init__(self, num_states, num_actions):
    """Build the layers and initialise their parameters.

    num_states  -- size of the state vector
    num_actions -- size of the action vector
    """
    super(Critic, self).__init__()
    self.fc1 = nn.Linear(num_states + num_actions, 100)
    self.fc2 = nn.Linear(100, 50)
    self.fc3 = nn.Linear(50, 10)
    # A Q-value is one scalar per (state, action) pair, so the output width is
    # always 1. It used to be num_actions, which only gave the right shape
    # when the action space happened to be one-dimensional.
    self.fc4 = nn.Linear(10, 1)

    # == parameters initialization ==
    # Xavier-normal weights, then standard-normal biases.
    nn.init.xavier_normal(self.fc1.weight)
    nn.init.xavier_normal(self.fc2.weight)
    nn.init.xavier_normal(self.fc3.weight)
    nn.init.xavier_normal(self.fc4.weight)

    nn.init.normal(self.fc1.bias)
    nn.init.normal(self.fc2.bias)
    nn.init.normal(self.fc3.bias)
    nn.init.normal(self.fc4.bias)
    # ===============================

  def forward(self, states, actions):
    """Return Q(states, actions) with shape (batch, 1).

    states  -- tensor of shape (batch, num_states)
    actions -- tensor of shape (batch, num_actions)
    """
    x = torch.cat((states, actions), 1)  # join along the feature dimension
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    x = self.fc4(x)
    return x

In [7]:
# Sizes of the state and action vectors, taken from the first dimension of the
# environment's observation and action spaces.
num_states, num_actions = env.observation_space.shape[0], env.action_space.shape[0]
print('num_states = {0}, num_actions = {1}'.format(num_states, num_actions))

num_states = 3, num_actions = 1


In [8]:
# Two copies of each network: the "eval" copy is the one being trained, the
# "target" copy supplies the values used in the critic's training target.
# Construction order (eval actor, target actor, eval critic, target critic)
# is kept so random initialisation draws happen in the same sequence.
eval_actor, target_actor = Actor(num_states, num_actions), Actor(num_states, num_actions)
eval_critic, target_critic = Critic(num_states, num_actions), Critic(num_states, num_actions)

# Start every target network as an exact copy of its eval counterpart.
target_actor.load_state_dict(eval_actor.state_dict())
target_critic.load_state_dict(eval_critic.state_dict())

In [9]:
# -- learning --
gamma = 0.99      # discount applied to the bootstrapped value of the next state
tau = 1e-3        # soft-update rate: target <- tau * eval + (1 - tau) * target
batch_size = 100  # number of transitions drawn from replay memory per update

# -- exploration --
epsilon = 2.0     # initial scale of the Gaussian noise added to each action
decay = 0.99      # epsilon is multiplied by this factor after every episode

In [12]:
# Critic loss: mean squared error between predicted and target values.
criterion = nn.MSELoss()

# One RMSprop optimizer per trained network; the actor uses a learning rate
# ten times smaller than the critic's.
# (Alternative left by the author: torch.optim.Adam with the same learning rates.)
actor_optimizer = torch.optim.RMSprop(eval_actor.parameters(), lr=1e-4)
critic_optimizer = torch.optim.RMSprop(eval_critic.parameters(), lr=1e-3)

In [14]:
# Replay memory holding up to 100000 transitions; the oldest are dropped first.
replay_memory = Memory(capacity=100000)
# Best episode reward seen so far; starts far below any reachable score.
top_score = -1e9

In [15]:
# Train for 500 episodes. Every environment step
#   1. picks an action from the current policy and perturbs it with noise,
#   2. stores the resulting transition in replay memory, and
#   3. once the memory holds at least batch_size transitions, performs one critic
#      update, one actor update and a soft update of both target networks.
for i in range(500):
  current_state = env.reset() # first observation of the new episode
  done = False
  episode_reward = 0
  while not done:
    # Policy output for the current state, evaluated as a batch of one. volatile=True
    # (pre-0.4 PyTorch) marks the pass as inference-only: no gradients are needed here.
    action = eval_actor(Variable(torch.Tensor(current_state).unsqueeze_(0), volatile=True))
    # Exploration: add Gaussian noise scaled by epsilon (epsilon shrinks after every episode).
    action = torch.squeeze(action.data).numpy() + np.random.randn(1) * epsilon
    # Clip to the hard-coded range [-2, 2] -- presumably the environment's action
    # bounds; TODO confirm against env.action_space.
    action = np.maximum(-2.0, np.minimum(action, 2.0)) # this is a domain specific 'hack'

    next_state, reward, done, _ = env.step(action)
    episode_reward += reward
    # The final transition of an episode is stored with next_state=None so that the
    # update below adds no bootstrapped next-state value for it.
    # NOTE(review): if `done` is raised by an episode time limit rather than by a true
    # terminal state, this treats a truncated episode as terminal -- confirm intended.
    if done:
      replay_memory.add_event(Event(current_state.copy(), action, None, reward))
    else:
      replay_memory.add_event(Event(current_state.copy(), action, next_state.copy(), reward))
    # end if
    current_state = next_state

    # train
    if len(replay_memory.buffer) >= batch_size:
      # sample from replay memory
      mini_batch = replay_memory.sample(batch_size)
      # Transpose the list of Events into a single Event whose fields are tuples
      # of batch_size items each.
      mini_batch = Event(*zip(*mini_batch)) # do this for batch processing

      state_var = Variable(torch.Tensor(mini_batch.state))
      action_var = Variable(torch.FloatTensor(mini_batch.action))

      # Value the eval critic currently assigns to the sampled (state, action) pairs.
      estimated_value = eval_critic(state_var, action_var)

      # mask[k] is 1 where sampled transition k has a next state, 0 where it ended the episode.
      mask = torch.ByteTensor(tuple(map(lambda s: s is not None, mini_batch.next_state)))

      # Only the transitions that do have a next state are passed through the target networks.
      valid_next_states = Variable(torch.Tensor([
        next_state for next_state in mini_batch.next_state if next_state is not None]))

      # Target critic's value of the action the target actor would take in each next state.
      target_val = target_critic(valid_next_states, target_actor(valid_next_states))

      # Training target: reward + gamma * target value, where the second term stays 0
      # for transitions without a next state.
      targetted_value = Variable(torch.zeros(batch_size, 1))
      targetted_value[mask] = gamma * target_val
      targetted_value += Variable(torch.Tensor(mini_batch.reward).unsqueeze_(1))

      # gradient descent on the critic
      critic_optimizer.zero_grad()
      # The target is detached so no gradient flows back into the target networks.
      critic_loss = criterion(estimated_value, targetted_value.detach()) # minimize the mse difference
      critic_loss.backward()
      critic_optimizer.step()

      # gradient descent on the actor
      actor_optimizer.zero_grad()
      actor_loss = - eval_critic(state_var, eval_actor(state_var)).mean() # maximize the value of taking action from the policy given by the actor
      # backward() also accumulates gradients on eval_critic's parameters; only the actor
      # is stepped here, and critic_optimizer.zero_grad() clears them on the next update.
      actor_loss.backward()
      actor_optimizer.step()

#       print('critic_loss = {0}, actor_loss = {1}'.format(critic_loss.data[0], actor_loss.data[0]))

      # transfer the parameters from eval to target
      # Soft update: target <- tau * eval + (1 - tau) * target, parameter by parameter.
      for target_param, eval_param in zip(target_critic.parameters(), eval_critic.parameters()):
        target_param.data.copy_(tau * eval_param.data + (1 - tau) * target_param.data)

      for target_param, eval_param in zip(target_actor.parameters(), eval_actor.parameters()):
        target_param.data.copy_(tau * eval_param.data + (1 - tau) * target_param.data)
    # end if
  # end while
  print('episode {0} reward = {1} epsilon = {2}'.format(i, episode_reward, epsilon))
  top_score = max(top_score, episode_reward) # best total episode reward so far
  epsilon *= decay # shrink the exploration noise for the next episode
# end for
print('top_score = {0}'.format(top_score))

[2017-07-09 18:25:49,142] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/pendulum/openaigym.video.0.95548.video000000.mp4
[2017-07-09 18:25:53,168] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/pendulum/openaigym.video.0.95548.video000001.mp4


episode 0 reward = -1482.9895357534376 epsilon = 2.0
episode 1 reward = -999.5264833621925 epsilon = 1.98
episode 2 reward = -1054.7064778529025 epsilon = 1.9602
episode 3 reward = -1555.0298248701017 epsilon = 1.9405979999999998
episode 4 reward = -1506.1386135469716 epsilon = 1.92119202
episode 5 reward = -1386.3748872560438 epsilon = 1.9019800997999998
episode 6 reward = -1674.3671643724858 epsilon = 1.8829602988019998


[2017-07-09 18:26:03,026] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/pendulum/openaigym.video.0.95548.video000008.mp4


episode 7 reward = -1401.6516214026058 epsilon = 1.8641306958139798
episode 8 reward = -1295.2689222876625 epsilon = 1.84548938885584
episode 9 reward = -1790.4908382002147 epsilon = 1.8270344949672814
episode 10 reward = -1589.294724231132 epsilon = 1.8087641500176086
episode 11 reward = -1753.6689409536125 epsilon = 1.7906765085174325
episode 12 reward = -1747.776494316601 epsilon = 1.7727697434322582
episode 13 reward = -1723.1691296879435 epsilon = 1.7550420459979357
episode 14 reward = -1514.5198257911109 epsilon = 1.7374916255379562
episode 15 reward = -1331.4286795174214 epsilon = 1.7201167092825767
episode 16 reward = -1073.3334547447234 epsilon = 1.7029155421897508
episode 17 reward = -1763.1204539725031 epsilon = 1.6858863867678533
episode 18 reward = -1781.224380530356 epsilon = 1.6690275229001748
episode 19 reward = -1242.7950203275168 epsilon = 1.652337247671173
episode 20 reward = -1763.6858663119262 epsilon = 1.6358138751944613
episode 21 reward = -1266.073773112578 epsi

[2017-07-09 18:26:27,845] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/pendulum/openaigym.video.0.95548.video000027.mp4


episode 26 reward = -1758.1892216516035 epsilon = 1.5400862916103102
episode 27 reward = -1085.683065076302 epsilon = 1.524685428694207
episode 28 reward = -1585.181835041169 epsilon = 1.509438574407265
episode 29 reward = -1728.5205274326688 epsilon = 1.4943441886631923
episode 30 reward = -1138.0561235027947 epsilon = 1.4794007467765604
episode 31 reward = -1452.3664932465097 epsilon = 1.4646067393087947
episode 32 reward = -1152.7154113783924 epsilon = 1.4499606719157068
episode 33 reward = -1150.5253625006974 epsilon = 1.4354610651965496
episode 34 reward = -1160.92556411717 epsilon = 1.4211064545445842
episode 35 reward = -1000.3722060016518 epsilon = 1.4068953899991383
episode 36 reward = -1215.7063904381318 epsilon = 1.392826436099147
episode 37 reward = -875.5638397161497 epsilon = 1.3788981717381554
episode 38 reward = -987.8194190109926 epsilon = 1.365109190020774
episode 39 reward = -1107.314306201686 epsilon = 1.3514580981205662
episode 40 reward = -757.251267160954 epsilon

[2017-07-09 18:27:13,089] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/pendulum/openaigym.video.0.95548.video000064.mp4


episode 63 reward = -513.7579529625892 epsilon = 1.0618110859102263
episode 64 reward = -884.439966889101 epsilon = 1.051192975051124
episode 65 reward = -737.6229350904517 epsilon = 1.0406810453006128
episode 66 reward = -524.7473892618615 epsilon = 1.0302742348476066
episode 67 reward = -633.097206565298 epsilon = 1.0199714924991305
episode 68 reward = -626.721637522745 epsilon = 1.0097717775741393
episode 69 reward = -508.22727337956354 epsilon = 0.9996740597983979
episode 70 reward = -526.6642191686301 epsilon = 0.9896773192004139
episode 71 reward = -626.1520270189411 epsilon = 0.9797805460084098
episode 72 reward = -631.1109279719249 epsilon = 0.9699827405483257
episode 73 reward = -495.4945439705794 epsilon = 0.9602829131428424
episode 74 reward = -747.6435198080515 epsilon = 0.950680084011414
episode 75 reward = -504.7970655102228 epsilon = 0.9411732831712999
episode 76 reward = -406.45788516726077 epsilon = 0.9317615503395869
episode 77 reward = -496.286909581097 epsilon = 0.9

[2017-07-09 18:28:24,933] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/pendulum/openaigym.video.0.95548.video000125.mp4


episode 124 reward = -245.65968334156784 epsilon = 0.5751672187337281
episode 125 reward = -481.69260722716626 epsilon = 0.5694155465463908
episode 126 reward = -123.31414910311966 epsilon = 0.5637213910809269
episode 127 reward = -124.25506855832492 epsilon = 0.5580841771701176
episode 128 reward = -126.81451514546634 epsilon = 0.5525033353984165
episode 129 reward = -633.3362651636518 epsilon = 0.5469783020444323
episode 130 reward = -245.31385446992695 epsilon = 0.541508519023988
episode 131 reward = -123.63513994427589 epsilon = 0.5360934338337481
episode 132 reward = -385.23479261078364 epsilon = 0.5307324994954106
episode 133 reward = -1.6686817469031154 epsilon = 0.5254251745004564
episode 134 reward = -123.44259565603015 epsilon = 0.5201709227554518
episode 135 reward = -126.62581818172012 epsilon = 0.5149692135278974
episode 136 reward = -124.8155107411681 epsilon = 0.5098195213926184
episode 137 reward = -500.5304745994967 epsilon = 0.5047213261786923
episode 138 reward = -12

[2017-07-09 18:30:16,933] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/pendulum/openaigym.video.0.95548.video000216.mp4


episode 215 reward = -123.6416248685766 epsilon = 0.23046067742742682
episode 216 reward = -123.77391728915748 epsilon = 0.22815607065315255
episode 217 reward = -237.38425559009272 epsilon = 0.22587450994662103
episode 218 reward = -125.59638150409236 epsilon = 0.2236157648471548
episode 219 reward = -127.49884287980284 epsilon = 0.22137960719868327
episode 220 reward = -352.49714188550854 epsilon = 0.21916581112669645
episode 221 reward = -125.22582710323205 epsilon = 0.2169741530154295
episode 222 reward = -355.0095194221157 epsilon = 0.2148044114852752
episode 223 reward = -0.6252260131325024 epsilon = 0.21265636737042246
episode 224 reward = -604.2800714579759 epsilon = 0.21052980369671823
episode 225 reward = -123.43261146708127 epsilon = 0.20842450565975104
episode 226 reward = -491.1310016383341 epsilon = 0.20634026060315352
episode 227 reward = -122.42714747186866 epsilon = 0.20427685799712197
episode 228 reward = -119.93383839470025 epsilon = 0.20223408941715076
episode 229 r

episode 332 reward = -471.5907114127014 epsilon = 0.07110736771894845
episode 333 reward = -125.41438943578281 epsilon = 0.07039629404175896
episode 334 reward = -0.29882104221216926 epsilon = 0.06969233110134136
episode 335 reward = -126.35122512923421 epsilon = 0.06899540779032795
episode 336 reward = -239.05540779175575 epsilon = 0.06830545371242468
episode 337 reward = -122.19107919381301 epsilon = 0.06762239917530043
episode 338 reward = -127.30316172394991 epsilon = 0.06694617518354742
episode 339 reward = -122.90934893000092 epsilon = 0.06627671343171194
episode 340 reward = -235.8361169128758 epsilon = 0.06561394629739482
episode 341 reward = -472.2648476560204 epsilon = 0.06495780683442087


[2017-07-09 18:32:43,072] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/pendulum/openaigym.video.0.95548.video000343.mp4


episode 342 reward = -122.64493804262108 epsilon = 0.06430822876607667
episode 343 reward = -480.4631814214055 epsilon = 0.0636651464784159
episode 344 reward = -493.9431401839924 epsilon = 0.06302849501363174
episode 345 reward = -0.6741432296058197 epsilon = 0.06239821006349542
episode 346 reward = -364.75856013557336 epsilon = 0.061774227962860466
episode 347 reward = -246.57534091394314 epsilon = 0.06115648568323186
episode 348 reward = -351.78331008500317 epsilon = 0.06054492082639954
episode 349 reward = -364.23785778875185 epsilon = 0.059939471618135544
episode 350 reward = -121.44753368234063 epsilon = 0.05934007690195419
episode 351 reward = -560.7983553633284 epsilon = 0.05874667613293465
episode 352 reward = -123.23829827226972 epsilon = 0.0581592093716053
episode 353 reward = -246.7849714439242 epsilon = 0.057577617277889244
episode 354 reward = -506.5306591140679 epsilon = 0.05700184110511035
episode 355 reward = -348.92271101917015 epsilon = 0.05643182269405925
episode 35

episode 458 reward = -355.1181632962849 epsilon = 0.02004237212331432
episode 459 reward = -236.15131185577835 epsilon = 0.019841948402081176
episode 460 reward = -125.83639311414326 epsilon = 0.019643528918060364
episode 461 reward = -124.77359373371968 epsilon = 0.01944709362887976
episode 462 reward = -125.38934687064555 epsilon = 0.01925262269259096
episode 463 reward = -0.32806030683447013 epsilon = 0.01906009646566505
episode 464 reward = -124.9250151895693 epsilon = 0.018869495501008398
episode 465 reward = -244.6448264259286 epsilon = 0.018680800545998313
episode 466 reward = -124.74344100888918 epsilon = 0.01849399254053833
episode 467 reward = -583.2805196800344 epsilon = 0.018309052615132947
episode 468 reward = -120.36963501157805 epsilon = 0.018125962088981616
episode 469 reward = -370.20528827469616 epsilon = 0.0179447024680918
episode 470 reward = -124.76140485720842 epsilon = 0.01776525544341088
episode 471 reward = -240.8517460925309 epsilon = 0.01758760288897677
episo

In [None]:
# Shut down: ask the renderer to close (presumably dismissing any open viewer
# window -- confirm against the installed gym version), then close the environment.
env.render(close=True)
env.close()