A very simple example showing how to implement a Deep Q-Network (DQN) using PyTorch. It only requires gym and PyTorch to be installed: there are no external files or other libraries, and everything needed to run it is contained within this notebook.

-freddy chua

In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.init
import torch.nn.functional as F
from collections import namedtuple
from torch.autograd import Variable
import random

In [2]:
# build the CartPole-v0 environment; every later cell interacts with it through `env`
env = gym.make('CartPole-v0')

[2017-07-04 22:18:21,257] Making new env: CartPole-v0


In [39]:
# try implementing dqn

# the action reward value function can be represented by a mlp
class Mlp(nn.Module):
  """Two-layer perceptron: Linear -> ReLU -> Linear.

  Args:
    input_size: number of features in one input row.
    output_size: number of values produced per input row.
    hidden_size: width of the hidden layer (defaults to 10, the value that
      used to be hard-coded).
  """

  def __init__(self, input_size, output_size, hidden_size=10):
    super(Mlp, self).__init__() # this statement is always needed

    self.fc1 = nn.Linear(input_size, hidden_size) # matrix multiplication
    self.fc2 = nn.Linear(hidden_size, output_size) # matrix multiplication

    # == parameters initialization ==
    # Prefer the in-place initialisers (trailing underscore) when this PyTorch
    # build provides them, and fall back to the legacy un-suffixed names
    # otherwise, so the class neither warns on newer releases nor breaks on
    # older ones.
    xavier_normal = getattr(nn.init, 'xavier_normal_', None) or nn.init.xavier_normal
    normal = getattr(nn.init, 'normal_', None) or nn.init.normal

    # same order of random draws as before: both weights first, then both biases
    xavier_normal(self.fc1.weight)
    xavier_normal(self.fc2.weight)

    normal(self.fc1.bias)
    normal(self.fc2.bias)
    # ===============================

  def forward(self, x):
    """Return fc2(relu(fc1(x))); no activation is applied to the output."""
    hidden = F.relu(self.fc1(x))
    return self.fc2(hidden)

  # no backward function needed, awesome!
# end class

In [40]:
# one replay-memory record: a single transition observed in the environment
Event = namedtuple(
  typename='Event',
  field_names=('state', 'action', 'next_state', 'reward'),
)

class Memory(object):
  """Bounded store of events that recycles slots in insertion order once full."""

  def __init__(self, capacity):
    self.mem = []             # stored events, at most `capacity` of them
    self.idx = 0              # slot the next event will be written to
    self.capacity = capacity  # maximum number of events kept

  def add_event(self, event):
    """Store `event`, overwriting the slot at `idx` when the store is full."""
    slot = self.idx
    if len(self.mem) >= self.capacity:
      self.mem[slot] = event
    else:
      self.mem.append(event)
    self.idx = (slot + 1) % self.capacity

  def sample(self, batch_size):
    """Return `batch_size` distinct stored events picked by random.sample."""
    picked = random.sample(self.mem, batch_size)
    return picked

# end class

In [41]:
# length of one observation vector; used as the number of inputs of the Q-networks
input_size = env.observation_space.shape[0]
# number of discrete actions; used as the number of outputs (one value per action)
output_size = env.action_space.n
print('input_size = {0}, output_size = {1}'.format(input_size, output_size))

input_size = 4, output_size = 2


In [54]:
# create 2 Q-networks with the same architecture:
#   eval_Q   - the network that is trained and used to pick greedy actions
#   target_Q - a copy that is only refreshed periodically and supplies the training targets

eval_Q   = Mlp(input_size, output_size)
target_Q = Mlp(input_size, output_size)
target_Q.load_state_dict(eval_Q.state_dict()) # copy eval_Q's parameters so both networks start out identical

In [55]:
# hyper-parameters read (and, for epsilon, updated) by the training loop
epsilon = 1.0 # probability of taking a random action; multiplied by 0.9 on every target_Q sync
batch_size = 100 # events sampled per training step; training only starts once the memory holds this many
gamma = 0.9 # the parameter for discounting future rewards
C = 20 # number of training steps between copies of eval_Q into target_Q

In [56]:
# only eval_Q's parameters are handed to the optimizer; no learning rate or
# other option is passed, so RMSprop runs with the library defaults
optimizer = torch.optim.RMSprop(eval_Q.parameters()) # RMSprop for learning eval_Q parameters
criterion = nn.MSELoss() # mean squared error, similar to least squared error

In [57]:
# Train eval_Q on CartPole for 500 episodes: act epsilon-greedily, store every
# transition in the replay memory, and after each environment step fit eval_Q
# on one random mini-batch against targets computed with target_Q.
replay_memory = Memory(10000) # create a replay memory holding up to 10000 events
top_score = 0 # best episode reward seen so far
c = 0 # training steps since target_Q was last synchronised
for i in range(500): # one iteration per episode
#   print('episode: {0}'.format(i+1))
  current_state = env.reset() # an array of 4 values
  done = False
  episode_reward = 0
  while not done:
    # epsilon-greedy action selection
    if random.random() < epsilon:
      # perform random action to explore the search space
      action = env.action_space.sample()
    else:
      # choose action with highest value
      state_tensor = torch.Tensor(current_state) # convert current_state into a torch tensor
      state_tensor = state_tensor.unsqueeze_(0) # unsqueeze to allow for batch processing
      # convert to a autograd Variable for automatic backpropagation
      # NOTE(review): Variable / volatile look like the pre-0.4 PyTorch API - confirm the targeted version
      state_tensor = Variable(state_tensor, volatile=True) # volatile is True for inference only
      action_values = eval_Q(state_tensor) # forward

      _, action = torch.max(action_values, 1) # second return value is the index of the largest action value
      # the [0,0] index treats that index tensor as 2-D, i.e. it relies on
      # torch.max(x, 1) keeping the reduced dimension
      # NOTE(review): that depends on the PyTorch version - confirm
      action = action.data[0,0]
    # end if
    next_state, reward, done, _ = env.step(action)
    episode_reward += reward
    # store the transition; a step that ended the episode is recorded with
    # next_state=None so it can be recognised (and masked out) during training
    if done:
      replay_memory.add_event(Event(current_state.copy(), action, None, reward))
    else:
      replay_memory.add_event(Event(current_state.copy(), action, next_state.copy(), reward))
    # end if
    current_state = next_state

    # train (one optimisation step per environment step, once enough events are stored)
    if len(replay_memory.mem) >= batch_size:
      # sample from replay memory
      mini_batch = replay_memory.sample(batch_size)
      # transpose the list of Events into a single Event whose fields are tuples
      # (all states, all actions, all next_states, all rewards)
      mini_batch = Event(*zip(*mini_batch)) # do this for batch processing

      # calculate the estimated value
      estimated_value = eval_Q(Variable(torch.Tensor(mini_batch.state)))
      # select the value associated with the action taken
      estimated_value = estimated_value.gather(1, Variable(torch.LongTensor(mini_batch.action).unsqueeze_(1)))

      # calculate the actual value
      # mask is true for events that have a next_state (the episode did not end on that step)
      mask = torch.ByteTensor(tuple(map(lambda s: s is not None, mini_batch.next_state)))
      # target_Q is evaluated only on the next_states that exist
      # NOTE(review): if every sampled event ended its episode this list is empty - behaviour not verified
      target_val = target_Q(Variable(torch.Tensor([
        next_state for next_state in mini_batch.next_state if next_state is not None])))
      target_val, _ = torch.max(target_val, 1) # largest action value of each next_state

      # target = reward + gamma * max target_Q(next_state) where a next_state exists,
      # and just the reward otherwise (those rows keep their initial zero)
      targetted_value = Variable(torch.zeros(batch_size, 1))
      # NOTE(review): this assignment needs target_val to line up with the masked
      # (rows, 1) slice, which again depends on what shape torch.max returns - confirm
      targetted_value[mask] = gamma * target_val
      targetted_value += Variable(torch.Tensor(mini_batch.reward).unsqueeze_(1))

      # compute the loss between estimated value and actual value
      optimizer.zero_grad()
      # detach() keeps gradients from flowing through the target
      loss = criterion(estimated_value, targetted_value.detach())
      loss.backward()
      optimizer.step() # do a gradient descent on it

      # every C training steps: refresh target_Q from eval_Q and decay epsilon
      c += 1
      if c == C:
        c = 0
        target_Q.load_state_dict(eval_Q.state_dict())
        # NOTE(review): epsilon has no lower bound; the recorded run ends with it
        # around 1e-166, so random exploration effectively stops - confirm this is intended
        epsilon = epsilon * 0.9
      # end if
    # end if

  # end while
  print('episode {0} reward = {1}, epsilon = {2:3g}'.format(i, episode_reward, epsilon))
  top_score = max(top_score, episode_reward)
# end for
print('top_score = {0}'.format(top_score))

episode 0 reward = 13.0, epsilon =   1
episode 1 reward = 20.0, epsilon =   1
episode 2 reward = 12.0, epsilon =   1
episode 3 reward = 29.0, epsilon =   1
episode 4 reward = 14.0, epsilon =   1
episode 5 reward = 12.0, epsilon =   1
episode 6 reward = 14.0, epsilon =   1
episode 7 reward = 35.0, epsilon = 0.81
episode 8 reward = 12.0, epsilon = 0.729
episode 9 reward = 41.0, epsilon = 0.59049
episode 10 reward = 27.0, epsilon = 0.531441
episode 11 reward = 41.0, epsilon = 0.430467
episode 12 reward = 39.0, epsilon = 0.348678
episode 13 reward = 34.0, epsilon = 0.28243
episode 14 reward = 39.0, epsilon = 0.228768
episode 15 reward = 46.0, epsilon = 0.185302
episode 16 reward = 61.0, epsilon = 0.135085
episode 17 reward = 87.0, epsilon = 0.0886294
episode 18 reward = 141.0, epsilon = 0.0423912
episode 19 reward = 136.0, epsilon = 0.0202756
episode 20 reward = 130.0, epsilon = 0.00969774
episode 21 reward = 149.0, epsilon = 0.0046384
episode 22 reward = 120.0, epsilon = 0.00246503
episod

episode 170 reward = 95.0, epsilon = 7.06146e-58
episode 171 reward = 200.0, epsilon = 2.46218e-58
episode 172 reward = 197.0, epsilon = 8.58508e-59
episode 173 reward = 200.0, epsilon = 2.99343e-59
episode 174 reward = 145.0, epsilon = 1.43175e-59
episode 175 reward = 200.0, epsilon = 4.9922e-60
episode 176 reward = 169.0, epsilon = 1.93408e-60
episode 177 reward = 24.0, epsilon = 1.74067e-60
episode 178 reward = 156.0, epsilon = 7.49303e-61
episode 179 reward = 145.0, epsilon = 3.58389e-61
episode 180 reward = 169.0, epsilon = 1.38847e-61
episode 181 reward = 121.0, epsilon = 7.37892e-62
episode 182 reward = 105.0, epsilon = 4.35718e-62
episode 183 reward = 131.0, epsilon = 2.31558e-62
episode 184 reward = 145.0, epsilon = 9.96782e-63
episode 185 reward = 48.0, epsilon = 8.07394e-63
episode 186 reward = 186.0, epsilon = 3.12801e-63
episode 187 reward = 167.0, epsilon = 1.21185e-63
episode 188 reward = 139.0, epsilon = 5.79626e-64
episode 189 reward = 27.0, epsilon = 5.21664e-64
episo

episode 335 reward = 113.0, epsilon = 1.01102e-116
episode 336 reward = 100.0, epsilon = 5.97e-117
episode 337 reward = 113.0, epsilon = 3.1727e-117
episode 338 reward = 136.0, epsilon = 1.51749e-117
episode 339 reward = 122.0, epsilon = 8.06458e-118
episode 340 reward = 104.0, epsilon = 4.76205e-118
episode 341 reward = 101.0, epsilon = 2.81194e-118
episode 342 reward = 121.0, epsilon = 1.49438e-118
episode 343 reward = 118.0, epsilon = 7.94176e-119
episode 344 reward = 123.0, epsilon = 4.22058e-119
episode 345 reward = 58.0, epsilon = 3.0768e-119
episode 346 reward = 118.0, epsilon = 1.63514e-119
episode 347 reward = 183.0, epsilon = 6.33486e-120
episode 348 reward = 12.0, epsilon = 5.70137e-120
episode 349 reward = 147.0, epsilon = 2.72695e-120
episode 350 reward = 97.0, epsilon = 1.61024e-120
episode 351 reward = 47.0, epsilon = 1.30429e-120
episode 352 reward = 13.0, epsilon = 1.17386e-120
episode 353 reward = 19.0, epsilon = 1.05648e-120
episode 354 reward = 21.0, epsilon = 9.508

episode 498 reward = 143.0, epsilon = 3.86075e-166
episode 499 reward = 141.0, epsilon = 1.84658e-166
top_score = 200.0
