In [1]:
import gym
from gym import wrappers
import torch
import torch.nn as nn
import torch.nn.init
import torch.nn.functional as F
from collections import namedtuple
from torch.autograd import Variable
import random
import numpy as np
from collections import deque

In [2]:
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, 'cartpole', force=True)

[2017-11-27 23:03:49,908] Making new env: CartPole-v0
[2017-11-27 23:03:49,933] Clearing 4 monitor files from previous run (because force=True was provided)


In [3]:
class Policy(nn.Module):
  def __init__(self, input_size, output_size):
    super(Policy, self).__init__() # this statement is always needed
    
    self.fc1 = nn.Linear(input_size, 10) # matrix multiplication
    self.fc2 = nn.Linear(10, output_size) # matrix multiplication
    
    # == parameters initialization ==
    nn.init.xavier_normal(self.fc1.weight)
    nn.init.xavier_normal(self.fc2.weight)
    
    nn.init.normal(self.fc1.bias)
    nn.init.normal(self.fc2.bias)
    # =============================== 
    
  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return x

In [4]:
class ValueFunction(nn.Module):
  def __init__(self, input_size):
    super(ValueFunction, self).__init__()
    
    self.fc1 = nn.Linear(input_size, 10)
    self.fc2 = nn.Linear(10, 1) # a single value
    
    # == parameters initialization ==
    nn.init.xavier_normal(self.fc1.weight)
    nn.init.xavier_normal(self.fc2.weight)
    
    nn.init.normal(self.fc1.bias)
    nn.init.normal(self.fc2.bias)
    # ===============================
    
  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return x

In [5]:
class ActorCritic(nn.Module):
  def __init__(self, input_size, output_size):
    super(ActorCritic, self).__init__()
    self.actor = Policy(input_size, output_size)
    self.critic = ValueFunction(input_size)

  def forward(self, x):
    return self.critic(x), self.actor(x) # value, action

In [6]:
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
print('input_size = {0}, output_size = {1}'.format(input_size, output_size))

input_size = 4, output_size = 2


In [7]:
gamma = 0.99 # the parameter for discounting future rewards
lbda = 0.9 # the parameter for GAE (generalized advantage estimation)

In [8]:
model = ActorCritic(input_size, output_size)
optimizer = torch.optim.Adam(model.parameters())

In [9]:
# vanilla policy gradient
for i in range(2000):
  current_state = env.reset() # an array of 4 values
  done = False
  episode_reward = 0
  
  values = [] 
  logprobs = []
  rewards = []

  while not done:
     # forward propagation on policy using current_state
    value, action_real = model(Variable(torch.from_numpy(current_state).float().unsqueeze(0))) 
    # value and action_real are variables
    action_logprob = F.log_softmax(action_real) # returns a variable
    
    action_prob = F.softmax(action_real) # returns a variable
    action = action_prob.multinomial().data # returns a torch tensor
    
    logprob = action_logprob.gather(1, Variable(action))
    current_state, reward, done, _ = env.step(action.numpy()[0,0])
    
    values.append(value) # variable
    logprobs.append(logprob) # variable
    rewards.append(reward) # numpy
    
    episode_reward += reward
  
  R = 0
  value_loss = 0
  policy_loss = 0
  for j in reversed(range(len(rewards))):
    R = rewards[j] + gamma * R # numpy
    advantage = R - values[j]
    value_loss += advantage.pow(2)
    policy_loss -= logprobs[j] * advantage
  
  optimizer.zero_grad()
  value_loss.backward(retain_variables=True)
  policy_loss.backward()
  optimizer.step()
  
  print('{0}: episode_rewards = {1}, policy_loss = {2}, value_loss = {3}'.format(
    i, episode_reward, policy_loss.data[0,0], value_loss.data[0,0]))

[2017-11-27 23:04:08,315] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000000.mp4
[2017-11-27 23:04:09,408] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000001.mp4


0: episode_rewards = 11.0, policy_loss = 44.216556549072266, value_loss = 650.34375


[2017-11-27 23:04:10,053] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000008.mp4


1: episode_rewards = 28.0, policy_loss = 298.5724182128906, value_loss = 7316.267578125
2: episode_rewards = 21.0, policy_loss = 164.59884643554688, value_loss = 3428.46826171875
3: episode_rewards = 13.0, policy_loss = 62.25950241088867, value_loss = 991.8350219726562
4: episode_rewards = 12.0, policy_loss = 54.986114501953125, value_loss = 808.2779541015625
5: episode_rewards = 12.0, policy_loss = 68.74180603027344, value_loss = 777.2911987304688
6: episode_rewards = 11.0, policy_loss = 60.19776153564453, value_loss = 621.7948608398438
7: episode_rewards = 16.0, policy_loss = 98.62602996826172, value_loss = 1687.5443115234375




8: episode_rewards = 11.0, policy_loss = 40.42898178100586, value_loss = 642.636474609375
9: episode_rewards = 21.0, policy_loss = 166.7332000732422, value_loss = 3406.75439453125
10: episode_rewards = 15.0, policy_loss = 81.10751342773438, value_loss = 1416.95361328125
11: episode_rewards = 15.0, policy_loss = 86.94470977783203, value_loss = 1418.1427001953125
12: episode_rewards = 15.0, policy_loss = 85.4642105102539, value_loss = 1420.504638671875
13: episode_rewards = 15.0, policy_loss = 91.18727111816406, value_loss = 1425.382568359375
14: episode_rewards = 12.0, policy_loss = 55.28855895996094, value_loss = 799.579833984375
15: episode_rewards = 25.0, policy_loss = 244.47914123535156, value_loss = 5360.80419921875
16: episode_rewards = 18.0, policy_loss = 126.79525756835938, value_loss = 2277.943115234375


[2017-11-27 23:04:10,673] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000027.mp4


17: episode_rewards = 38.0, policy_loss = 494.18475341796875, value_loss = 16106.47265625
18: episode_rewards = 10.0, policy_loss = 37.61907958984375, value_loss = 503.0838317871094
19: episode_rewards = 11.0, policy_loss = 45.965843200683594, value_loss = 640.5760498046875
20: episode_rewards = 16.0, policy_loss = 89.91012573242188, value_loss = 1665.63818359375
21: episode_rewards = 18.0, policy_loss = 119.37548065185547, value_loss = 2261.0693359375
22: episode_rewards = 12.0, policy_loss = 53.292686462402344, value_loss = 793.2205810546875
23: episode_rewards = 15.0, policy_loss = 96.06649780273438, value_loss = 1393.4212646484375
24: episode_rewards = 11.0, policy_loss = 57.003074645996094, value_loss = 622.34619140625
25: episode_rewards = 14.0, policy_loss = 80.45267486572266, value_loss = 1177.7318115234375
26: episode_rewards = 32.0, policy_loss = 351.70574951171875, value_loss = 10222.328125




27: episode_rewards = 15.0, policy_loss = 86.24864196777344, value_loss = 1404.958984375
28: episode_rewards = 15.0, policy_loss = 88.44280242919922, value_loss = 1400.9266357421875
29: episode_rewards = 18.0, policy_loss = 130.00564575195312, value_loss = 2266.146728515625
30: episode_rewards = 19.0, policy_loss = 138.5735321044922, value_loss = 2602.57470703125
31: episode_rewards = 13.0, policy_loss = 63.26320266723633, value_loss = 967.2552490234375
32: episode_rewards = 9.0, policy_loss = 30.730144500732422, value_loss = 380.0207214355469
33: episode_rewards = 25.0, policy_loss = 237.62451171875, value_loss = 5322.65673828125
34: episode_rewards = 13.0, policy_loss = 61.89451217651367, value_loss = 960.7030029296875
35: episode_rewards = 21.0, policy_loss = 163.598876953125, value_loss = 3370.518310546875
36: episode_rewards = 13.0, policy_loss = 68.72775268554688, value_loss = 967.4465942382812
37: episode_rewards = 16.0, policy_loss = 97.26411437988281, value_loss = 1653.8050537

[2017-11-27 23:04:11,828] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000064.mp4


55: episode_rewards = 25.0, policy_loss = 222.7940673828125, value_loss = 5293.73193359375
56: episode_rewards = 12.0, policy_loss = 56.217525482177734, value_loss = 765.1885375976562
57: episode_rewards = 14.0, policy_loss = 83.7933349609375, value_loss = 1143.230224609375
58: episode_rewards = 23.0, policy_loss = 193.3460235595703, value_loss = 4242.73046875
59: episode_rewards = 11.0, policy_loss = 52.25811004638672, value_loss = 609.2244873046875
60: episode_rewards = 30.0, policy_loss = 317.876220703125, value_loss = 8565.314453125
61: episode_rewards = 10.0, policy_loss = 41.42536544799805, value_loss = 481.3644714355469
62: episode_rewards = 17.0, policy_loss = 112.01714324951172, value_loss = 1907.4481201171875
63: episode_rewards = 11.0, policy_loss = 49.32096862792969, value_loss = 607.84326171875




64: episode_rewards = 13.0, policy_loss = 72.66355895996094, value_loss = 941.2955322265625
65: episode_rewards = 25.0, policy_loss = 224.84681701660156, value_loss = 5274.5849609375
66: episode_rewards = 22.0, policy_loss = 182.44842529296875, value_loss = 3757.931640625
67: episode_rewards = 48.0, policy_loss = 738.0908203125, value_loss = 29244.296875
68: episode_rewards = 18.0, policy_loss = 123.21417236328125, value_loss = 2211.73193359375
69: episode_rewards = 13.0, policy_loss = 65.96382141113281, value_loss = 932.6232299804688
70: episode_rewards = 23.0, policy_loss = 196.18856811523438, value_loss = 4208.85693359375
71: episode_rewards = 13.0, policy_loss = 67.49333190917969, value_loss = 937.54052734375
72: episode_rewards = 9.0, policy_loss = 37.31221008300781, value_loss = 361.53778076171875
73: episode_rewards = 16.0, policy_loss = 98.19729614257812, value_loss = 1610.3502197265625
74: episode_rewards = 29.0, policy_loss = 293.452880859375, value_loss = 7803.53857421875
75

[2017-11-27 23:04:14,031] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000125.mp4


119: episode_rewards = 52.0, policy_loss = 833.5734252929688, value_loss = 35538.734375
120: episode_rewards = 45.0, policy_loss = 640.811767578125, value_loss = 24440.0546875
121: episode_rewards = 14.0, policy_loss = 77.96585845947266, value_loss = 1100.3280029296875
122: episode_rewards = 19.0, policy_loss = 137.51622009277344, value_loss = 2469.81005859375
123: episode_rewards = 17.0, policy_loss = 109.92646789550781, value_loss = 1853.939453125
124: episode_rewards = 21.0, policy_loss = 162.13111877441406, value_loss = 3260.1484375




125: episode_rewards = 16.0, policy_loss = 98.751953125, value_loss = 1572.20458984375
126: episode_rewards = 16.0, policy_loss = 98.66185760498047, value_loss = 1576.4686279296875
127: episode_rewards = 28.0, policy_loss = 270.43048095703125, value_loss = 7019.64892578125
128: episode_rewards = 14.0, policy_loss = 76.87844848632812, value_loss = 1107.720458984375
129: episode_rewards = 14.0, policy_loss = 75.80248260498047, value_loss = 1101.1683349609375
130: episode_rewards = 19.0, policy_loss = 132.5284881591797, value_loss = 2488.318359375
131: episode_rewards = 19.0, policy_loss = 135.30047607421875, value_loss = 2482.94091796875
132: episode_rewards = 29.0, policy_loss = 289.4839782714844, value_loss = 7664.759765625
133: episode_rewards = 17.0, policy_loss = 112.15398406982422, value_loss = 1839.5714111328125
134: episode_rewards = 15.0, policy_loss = 88.90082550048828, value_loss = 1311.5439453125
135: episode_rewards = 15.0, policy_loss = 91.98249816894531, value_loss = 1308.

[2017-11-27 23:04:17,073] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000216.mp4


209: episode_rewards = 17.0, policy_loss = 106.08619689941406, value_loss = 1770.837646484375
210: episode_rewards = 18.0, policy_loss = 120.36920928955078, value_loss = 2039.8350830078125
211: episode_rewards = 12.0, policy_loss = 63.849666595458984, value_loss = 659.4358520507812
212: episode_rewards = 18.0, policy_loss = 116.78791809082031, value_loss = 2034.849365234375
213: episode_rewards = 11.0, policy_loss = 49.28379440307617, value_loss = 538.8980712890625
214: episode_rewards = 26.0, policy_loss = 231.0035400390625, value_loss = 5574.16796875
215: episode_rewards = 21.0, policy_loss = 155.11587524414062, value_loss = 3137.178955078125




216: episode_rewards = 40.0, policy_loss = 497.62939453125, value_loss = 17593.796875
217: episode_rewards = 57.0, policy_loss = 945.3668212890625, value_loss = 44217.14453125
218: episode_rewards = 21.0, policy_loss = 154.9577178955078, value_loss = 3108.844970703125
219: episode_rewards = 26.0, policy_loss = 228.23179626464844, value_loss = 5556.5048828125
220: episode_rewards = 16.0, policy_loss = 95.03447723388672, value_loss = 1495.0439453125
221: episode_rewards = 26.0, policy_loss = 228.67918395996094, value_loss = 5509.52392578125
222: episode_rewards = 21.0, policy_loss = 153.35743713378906, value_loss = 3129.417724609375
223: episode_rewards = 20.0, policy_loss = 139.11251831054688, value_loss = 2734.933837890625
224: episode_rewards = 17.0, policy_loss = 104.7176513671875, value_loss = 1751.6944580078125
225: episode_rewards = 31.0, policy_loss = 313.08160400390625, value_loss = 8868.1142578125
226: episode_rewards = 10.0, policy_loss = 41.415096282958984, value_loss = 404.2

306: episode_rewards = 53.0, policy_loss = 823.3262329101562, value_loss = 35711.75
307: episode_rewards = 45.0, policy_loss = 594.600830078125, value_loss = 23358.310546875
308: episode_rewards = 28.0, policy_loss = 253.91128540039062, value_loss = 6516.7177734375
309: episode_rewards = 77.0, policy_loss = 1576.7615966796875, value_loss = 91965.875
310: episode_rewards = 23.0, policy_loss = 174.35679626464844, value_loss = 3825.171875
311: episode_rewards = 16.0, policy_loss = 98.76451873779297, value_loss = 1367.4127197265625
312: episode_rewards = 62.0, policy_loss = 1070.3935546875, value_loss = 53524.13671875
313: episode_rewards = 14.0, policy_loss = 69.63106536865234, value_loss = 938.92724609375
314: episode_rewards = 20.0, policy_loss = 138.63719177246094, value_loss = 2554.320556640625
315: episode_rewards = 28.0, policy_loss = 248.41342163085938, value_loss = 6510.31494140625
316: episode_rewards = 49.0, policy_loss = 694.5638427734375, value_loss = 29122.392578125
317: epis

[2017-11-27 23:04:22,737] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000343.mp4


341: episode_rewards = 46.0, policy_loss = 601.8141479492188, value_loss = 24353.27734375
342: episode_rewards = 36.0, policy_loss = 389.9230041503906, value_loss = 12686.634765625




343: episode_rewards = 38.0, policy_loss = 438.1608581542969, value_loss = 14612.0244140625
344: episode_rewards = 52.0, policy_loss = 764.287841796875, value_loss = 33593.671875
345: episode_rewards = 62.0, policy_loss = 1077.2249755859375, value_loss = 52790.21484375
346: episode_rewards = 16.0, policy_loss = 87.27363586425781, value_loss = 1333.8175048828125
347: episode_rewards = 47.0, policy_loss = 655.2194213867188, value_loss = 25701.140625
348: episode_rewards = 22.0, policy_loss = 156.07040405273438, value_loss = 3239.634765625
349: episode_rewards = 18.0, policy_loss = 119.74655151367188, value_loss = 1839.40625
350: episode_rewards = 37.0, policy_loss = 413.2478942871094, value_loss = 13621.74609375
351: episode_rewards = 49.0, policy_loss = 678.260986328125, value_loss = 28774.228515625
352: episode_rewards = 29.0, policy_loss = 264.3946228027344, value_loss = 6900.84228515625
353: episode_rewards = 27.0, policy_loss = 225.0675506591797, value_loss = 5707.36572265625
354: e

436: episode_rewards = 56.0, policy_loss = 849.531494140625, value_loss = 39909.02734375
437: episode_rewards = 34.0, policy_loss = 354.5043029785156, value_loss = 10445.255859375
438: episode_rewards = 42.0, policy_loss = 503.43316650390625, value_loss = 18586.806640625
439: episode_rewards = 21.0, policy_loss = 147.1937713623047, value_loss = 2696.1572265625
440: episode_rewards = 44.0, policy_loss = 536.603271484375, value_loss = 21040.990234375
441: episode_rewards = 52.0, policy_loss = 733.8985595703125, value_loss = 32520.744140625
442: episode_rewards = 20.0, policy_loss = 128.01742553710938, value_loss = 2356.491943359375
443: episode_rewards = 39.0, policy_loss = 427.223876953125, value_loss = 15053.974609375
444: episode_rewards = 22.0, policy_loss = 153.6121826171875, value_loss = 3043.9296875
445: episode_rewards = 56.0, policy_loss = 850.439453125, value_loss = 39570.85546875
446: episode_rewards = 26.0, policy_loss = 209.1680145263672, value_loss = 4734.4482421875
447: ep

[2017-11-27 23:04:30,451] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000512.mp4


508: episode_rewards = 53.0, policy_loss = 748.8604125976562, value_loss = 33394.35546875
509: episode_rewards = 11.0, policy_loss = 39.5349235534668, value_loss = 341.63909912109375
510: episode_rewards = 79.0, policy_loss = 1562.3243408203125, value_loss = 92948.9140625
511: episode_rewards = 27.0, policy_loss = 211.83370971679688, value_loss = 5288.83349609375




512: episode_rewards = 41.0, policy_loss = 464.9590759277344, value_loss = 16734.36328125
513: episode_rewards = 38.0, policy_loss = 419.6692199707031, value_loss = 13504.271484375
514: episode_rewards = 29.0, policy_loss = 228.746826171875, value_loss = 6417.48876953125
515: episode_rewards = 43.0, policy_loss = 491.3747863769531, value_loss = 19053.59375
516: episode_rewards = 11.0, policy_loss = 39.409263610839844, value_loss = 332.14190673828125
517: episode_rewards = 67.0, policy_loss = 1187.1192626953125, value_loss = 60850.08984375
518: episode_rewards = 28.0, policy_loss = 220.3985137939453, value_loss = 5657.56396484375
519: episode_rewards = 46.0, policy_loss = 574.6437377929688, value_loss = 22764.142578125
520: episode_rewards = 62.0, policy_loss = 987.9254150390625, value_loss = 49994.92578125
521: episode_rewards = 26.0, policy_loss = 187.7287139892578, value_loss = 4585.43212890625
522: episode_rewards = 33.0, policy_loss = 297.2423400878906, value_loss = 9072.728515625


606: episode_rewards = 58.0, policy_loss = 833.145751953125, value_loss = 40824.33203125
607: episode_rewards = 21.0, policy_loss = 130.98768615722656, value_loss = 2248.456787109375
608: episode_rewards = 56.0, policy_loss = 783.2951049804688, value_loss = 37311.0234375
609: episode_rewards = 70.0, policy_loss = 1184.9638671875, value_loss = 66359.3828125
610: episode_rewards = 16.0, policy_loss = 78.71940612792969, value_loss = 977.7740478515625
611: episode_rewards = 69.0, policy_loss = 1197.19873046875, value_loss = 63584.98828125
612: episode_rewards = 59.0, policy_loss = 864.4011840820312, value_loss = 42599.03515625
613: episode_rewards = 34.0, policy_loss = 298.4146423339844, value_loss = 9311.3740234375
614: episode_rewards = 62.0, policy_loss = 957.4479370117188, value_loss = 48242.61328125
615: episode_rewards = 60.0, policy_loss = 897.1814575195312, value_loss = 44347.9921875
616: episode_rewards = 42.0, policy_loss = 486.00146484375, value_loss = 16647.916015625
617: episo

700: episode_rewards = 32.0, policy_loss = 271.5933837890625, value_loss = 7317.021484375
701: episode_rewards = 47.0, policy_loss = 544.368408203125, value_loss = 21891.384765625
702: episode_rewards = 49.0, policy_loss = 578.7902221679688, value_loss = 24511.68359375
703: episode_rewards = 19.0, policy_loss = 99.107177734375, value_loss = 1518.087646484375
704: episode_rewards = 124.0, policy_loss = 3245.630615234375, value_loss = 259369.078125
705: episode_rewards = 32.0, policy_loss = 254.8053436279297, value_loss = 7222.4970703125
706: episode_rewards = 36.0, policy_loss = 339.6987609863281, value_loss = 10143.71484375
707: episode_rewards = 31.0, policy_loss = 239.6516571044922, value_loss = 6592.794921875
708: episode_rewards = 127.0, policy_loss = 3276.797607421875, value_loss = 274250.46875
709: episode_rewards = 29.0, policy_loss = 203.4124755859375, value_loss = 5366.72998046875
710: episode_rewards = 12.0, policy_loss = 42.212303161621094, value_loss = 329.67291259765625
71

[2017-11-27 23:04:42,296] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video000729.mp4


725: episode_rewards = 89.0, policy_loss = 1798.2196044921875, value_loss = 115813.1953125
726: episode_rewards = 28.0, policy_loss = 197.36737060546875, value_loss = 4541.3095703125
727: episode_rewards = 37.0, policy_loss = 347.49468994140625, value_loss = 10883.548828125
728: episode_rewards = 20.0, policy_loss = 104.02769470214844, value_loss = 1481.0845947265625




729: episode_rewards = 30.0, policy_loss = 223.3844757080078, value_loss = 5687.078125
730: episode_rewards = 16.0, policy_loss = 55.50937271118164, value_loss = 813.139892578125
731: episode_rewards = 41.0, policy_loss = 417.5216979980469, value_loss = 14315.2861328125
732: episode_rewards = 55.0, policy_loss = 785.9172973632812, value_loss = 32615.609375
733: episode_rewards = 28.0, policy_loss = 188.38101196289062, value_loss = 4652.61181640625
734: episode_rewards = 31.0, policy_loss = 229.23179626464844, value_loss = 6226.06689453125
735: episode_rewards = 88.0, policy_loss = 1748.03955078125, value_loss = 112991.34375
736: episode_rewards = 41.0, policy_loss = 440.6874084472656, value_loss = 13801.5947265625
737: episode_rewards = 73.0, policy_loss = 1250.917236328125, value_loss = 70058.6328125
738: episode_rewards = 28.0, policy_loss = 182.03945922851562, value_loss = 4654.0751953125
739: episode_rewards = 29.0, policy_loss = 217.23587036132812, value_loss = 5120.1025390625
740

823: episode_rewards = 44.0, policy_loss = 477.3830261230469, value_loss = 16268.0771484375
824: episode_rewards = 82.0, policy_loss = 1485.9241943359375, value_loss = 90254.796875
825: episode_rewards = 30.0, policy_loss = 200.26678466796875, value_loss = 5337.2958984375
826: episode_rewards = 64.0, policy_loss = 954.1470947265625, value_loss = 46756.2421875
827: episode_rewards = 62.0, policy_loss = 886.127197265625, value_loss = 43069.92578125
828: episode_rewards = 89.0, policy_loss = 1727.0499267578125, value_loss = 110903.5859375
829: episode_rewards = 56.0, policy_loss = 732.3971557617188, value_loss = 32153.396484375
830: episode_rewards = 21.0, policy_loss = 101.1202163696289, value_loss = 1668.6719970703125
831: episode_rewards = 56.0, policy_loss = 689.8947143554688, value_loss = 32822.109375
832: episode_rewards = 21.0, policy_loss = 95.4541244506836, value_loss = 1543.572021484375
833: episode_rewards = 57.0, policy_loss = 736.144287109375, value_loss = 33774.97265625
834:

916: episode_rewards = 39.0, policy_loss = 332.1866760253906, value_loss = 10134.4033203125
917: episode_rewards = 41.0, policy_loss = 353.9958801269531, value_loss = 12254.5166015625
918: episode_rewards = 33.0, policy_loss = 229.87318420410156, value_loss = 6296.951171875
919: episode_rewards = 37.0, policy_loss = 285.54229736328125, value_loss = 8928.4638671875
920: episode_rewards = 58.0, policy_loss = 711.4275512695312, value_loss = 32989.609375
921: episode_rewards = 96.0, policy_loss = 1960.4896240234375, value_loss = 125583.4921875
922: episode_rewards = 97.0, policy_loss = 1932.4112548828125, value_loss = 130020.953125
923: episode_rewards = 58.0, policy_loss = 726.578369140625, value_loss = 33180.7890625
924: episode_rewards = 37.0, policy_loss = 275.35894775390625, value_loss = 9015.986328125
925: episode_rewards = 14.0, policy_loss = 25.628267288208008, value_loss = 365.03216552734375
926: episode_rewards = 40.0, policy_loss = 321.34722900390625, value_loss = 11051.82226562

[2017-11-27 23:04:59,758] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.68446.video001000.mp4


996: episode_rewards = 74.0, policy_loss = 1084.1488037109375, value_loss = 60138.9453125
997: episode_rewards = 39.0, policy_loss = 302.07061767578125, value_loss = 9440.109375
998: episode_rewards = 24.0, policy_loss = 113.22563934326172, value_loss = 1781.8216552734375
999: episode_rewards = 26.0, policy_loss = 124.0686264038086, value_loss = 2425.11669921875




1000: episode_rewards = 42.0, policy_loss = 338.2337951660156, value_loss = 11852.255859375
1001: episode_rewards = 14.0, policy_loss = 14.212600708007812, value_loss = 323.1676025390625
1002: episode_rewards = 106.0, policy_loss = 2121.61376953125, value_loss = 154848.421875
1003: episode_rewards = 102.0, policy_loss = 1987.2374267578125, value_loss = 140460.0
1004: episode_rewards = 39.0, policy_loss = 287.4305725097656, value_loss = 9289.15625
1005: episode_rewards = 24.0, policy_loss = 99.23936462402344, value_loss = 1848.3311767578125
1006: episode_rewards = 20.0, policy_loss = 55.9886474609375, value_loss = 974.9815673828125
1007: episode_rewards = 53.0, policy_loss = 548.1070556640625, value_loss = 23365.234375
1008: episode_rewards = 43.0, policy_loss = 361.15081787109375, value_loss = 12471.42578125
1009: episode_rewards = 34.0, policy_loss = 234.97120666503906, value_loss = 5842.82373046875
1010: episode_rewards = 20.0, policy_loss = 50.75023651123047, value_loss = 990.221435

1095: episode_rewards = 47.0, policy_loss = 410.91802978515625, value_loss = 14403.33984375
1096: episode_rewards = 44.0, policy_loss = 335.499267578125, value_loss = 11881.1015625
1097: episode_rewards = 48.0, policy_loss = 398.9465026855469, value_loss = 15415.8115234375
1098: episode_rewards = 82.0, policy_loss = 1219.79541015625, value_loss = 74808.859375
1099: episode_rewards = 113.0, policy_loss = 2353.48828125, value_loss = 169784.25
1100: episode_rewards = 46.0, policy_loss = 385.3077392578125, value_loss = 13358.23828125
1101: episode_rewards = 36.0, policy_loss = 252.1839599609375, value_loss = 5936.80517578125
1102: episode_rewards = 47.0, policy_loss = 397.5440368652344, value_loss = 14984.8828125
1103: episode_rewards = 35.0, policy_loss = 183.484619140625, value_loss = 5496.2841796875
1104: episode_rewards = 51.0, policy_loss = 463.27008056640625, value_loss = 19081.0546875
1105: episode_rewards = 59.0, policy_loss = 645.796630859375, value_loss = 29555.78125
1106: episod

1188: episode_rewards = 70.0, policy_loss = 858.6215209960938, value_loss = 41970.609375
1189: episode_rewards = 81.0, policy_loss = 1218.7266845703125, value_loss = 64618.50390625
1190: episode_rewards = 84.0, policy_loss = 1300.601318359375, value_loss = 72780.1015625
1191: episode_rewards = 52.0, policy_loss = 430.0082702636719, value_loss = 16836.080078125
1192: episode_rewards = 77.0, policy_loss = 1103.7216796875, value_loss = 55225.37109375
1193: episode_rewards = 16.0, policy_loss = -18.32070541381836, value_loss = 537.5418701171875
1194: episode_rewards = 61.0, policy_loss = 593.8093872070312, value_loss = 28861.701171875
1195: episode_rewards = 164.0, policy_loss = 4098.841796875, value_loss = 384722.3125
1196: episode_rewards = 59.0, policy_loss = 604.5032958984375, value_loss = 25118.431640625
1197: episode_rewards = 51.0, policy_loss = 433.1414489746094, value_loss = 16858.365234375
1198: episode_rewards = 23.0, policy_loss = 31.120866775512695, value_loss = 963.1351928710

1281: episode_rewards = 77.0, policy_loss = 951.8681640625, value_loss = 52237.046875
1282: episode_rewards = 83.0, policy_loss = 1093.423095703125, value_loss = 64106.46484375
1283: episode_rewards = 60.0, policy_loss = 558.0509643554688, value_loss = 24892.3671875
1284: episode_rewards = 38.0, policy_loss = 161.73977661132812, value_loss = 4761.2138671875
1285: episode_rewards = 47.0, policy_loss = 280.74053955078125, value_loss = 10879.8466796875
1286: episode_rewards = 39.0, policy_loss = 202.54025268554688, value_loss = 6023.2490234375
1287: episode_rewards = 56.0, policy_loss = 430.0687255859375, value_loss = 19778.341796875
1288: episode_rewards = 95.0, policy_loss = 1532.3143310546875, value_loss = 94067.875
1289: episode_rewards = 58.0, policy_loss = 479.8298645019531, value_loss = 21376.1171875
1290: episode_rewards = 69.0, policy_loss = 737.4441528320312, value_loss = 37017.9453125
1291: episode_rewards = 72.0, policy_loss = 884.3441162109375, value_loss = 41607.27734375
129

1373: episode_rewards = 97.0, policy_loss = 1372.9014892578125, value_loss = 87970.203125
1374: episode_rewards = 137.0, policy_loss = 2739.2451171875, value_loss = 215974.46875
1375: episode_rewards = 78.0, policy_loss = 841.3056640625, value_loss = 45976.734375
1376: episode_rewards = 88.0, policy_loss = 1172.1522216796875, value_loss = 65213.625
1377: episode_rewards = 65.0, policy_loss = 537.1765747070312, value_loss = 26316.275390625
1378: episode_rewards = 107.0, policy_loss = 1670.4036865234375, value_loss = 113502.109375
1379: episode_rewards = 78.0, policy_loss = 847.9580078125, value_loss = 45947.078125
1380: episode_rewards = 68.0, policy_loss = 615.23583984375, value_loss = 29659.140625
1381: episode_rewards = 73.0, policy_loss = 767.015869140625, value_loss = 36616.78515625
1382: episode_rewards = 124.0, policy_loss = 2287.263671875, value_loss = 166381.84375
1383: episode_rewards = 159.0, policy_loss = 3487.666015625, value_loss = 307246.375
1384: episode_rewards = 124.0,

1467: episode_rewards = 176.0, policy_loss = 3817.246337890625, value_loss = 335465.875
1468: episode_rewards = 194.0, policy_loss = 4468.34716796875, value_loss = 420616.875
1469: episode_rewards = 152.0, policy_loss = 2902.714599609375, value_loss = 232863.828125
1470: episode_rewards = 200.0, policy_loss = 4732.52978515625, value_loss = 438680.0625
1471: episode_rewards = 200.0, policy_loss = 5046.55859375, value_loss = 416437.28125
1472: episode_rewards = 122.0, policy_loss = 1769.652587890625, value_loss = 126042.3984375
1473: episode_rewards = 94.0, policy_loss = 999.6367797851562, value_loss = 62185.01171875
1474: episode_rewards = 176.0, policy_loss = 3700.847412109375, value_loss = 320582.0625
1475: episode_rewards = 200.0, policy_loss = 4827.9482421875, value_loss = 438721.09375
1476: episode_rewards = 200.0, policy_loss = 4746.32080078125, value_loss = 429785.28125
1477: episode_rewards = 50.0, policy_loss = 66.2519302368164, value_loss = 9332.4580078125
1478: episode_reward

1560: episode_rewards = 197.0, policy_loss = 4263.71923828125, value_loss = 367148.5625
1561: episode_rewards = 200.0, policy_loss = 4002.35400390625, value_loss = 358555.71875
1562: episode_rewards = 45.0, policy_loss = -63.61740493774414, value_loss = 5282.55908203125
1563: episode_rewards = 200.0, policy_loss = 4533.7109375, value_loss = 372876.53125
1564: episode_rewards = 200.0, policy_loss = 4206.65576171875, value_loss = 349996.25
1565: episode_rewards = 200.0, policy_loss = 4320.39990234375, value_loss = 377340.8125
1566: episode_rewards = 200.0, policy_loss = 4178.75, value_loss = 339471.5625
1567: episode_rewards = 183.0, policy_loss = 3637.634033203125, value_loss = 306254.0
1568: episode_rewards = 200.0, policy_loss = 3922.526123046875, value_loss = 348733.71875
1569: episode_rewards = 105.0, policy_loss = 825.556884765625, value_loss = 78585.8125
1570: episode_rewards = 200.0, policy_loss = 4019.359375, value_loss = 351510.21875
1571: episode_rewards = 200.0, policy_loss =

1653: episode_rewards = 179.0, policy_loss = 3018.573486328125, value_loss = 251889.0625
1654: episode_rewards = 129.0, policy_loss = 1457.8017578125, value_loss = 103032.375
1655: episode_rewards = 133.0, policy_loss = 1636.1781005859375, value_loss = 106409.0625
1656: episode_rewards = 200.0, policy_loss = 3724.837158203125, value_loss = 321337.3125
1657: episode_rewards = 200.0, policy_loss = 3904.8544921875, value_loss = 324929.84375
1658: episode_rewards = 117.0, policy_loss = 1112.1968994140625, value_loss = 75732.625
1659: episode_rewards = 200.0, policy_loss = 3847.247802734375, value_loss = 320986.6875
1660: episode_rewards = 200.0, policy_loss = 3830.60302734375, value_loss = 291309.15625
1661: episode_rewards = 124.0, policy_loss = 1309.5904541015625, value_loss = 92160.5703125
1662: episode_rewards = 200.0, policy_loss = 3477.943603515625, value_loss = 287137.71875
1663: episode_rewards = 160.0, policy_loss = 2280.896728515625, value_loss = 189788.609375
1664: episode_rewar

1747: episode_rewards = 200.0, policy_loss = 3057.15087890625, value_loss = 267577.71875
1748: episode_rewards = 131.0, policy_loss = 957.1151123046875, value_loss = 84220.3203125
1749: episode_rewards = 170.0, policy_loss = 2406.108642578125, value_loss = 184939.359375
1750: episode_rewards = 200.0, policy_loss = 2918.9091796875, value_loss = 269667.34375
1751: episode_rewards = 66.0, policy_loss = -252.48184204101562, value_loss = 24140.52734375
1752: episode_rewards = 200.0, policy_loss = 3193.734375, value_loss = 281418.0625
1753: episode_rewards = 174.0, policy_loss = 2507.9892578125, value_loss = 199258.109375
1754: episode_rewards = 200.0, policy_loss = 2877.37548828125, value_loss = 257013.875
1755: episode_rewards = 200.0, policy_loss = 2658.893310546875, value_loss = 273701.59375
1756: episode_rewards = 200.0, policy_loss = 3463.37939453125, value_loss = 272518.1875
1757: episode_rewards = 200.0, policy_loss = 3121.762451171875, value_loss = 264457.375
1758: episode_rewards =

1840: episode_rewards = 200.0, policy_loss = 2930.0869140625, value_loss = 231313.75
1841: episode_rewards = 161.0, policy_loss = 1870.2764892578125, value_loss = 130757.875
1842: episode_rewards = 200.0, policy_loss = 2503.6259765625, value_loss = 209938.421875
1843: episode_rewards = 146.0, policy_loss = 1299.4935302734375, value_loss = 105215.8828125
1844: episode_rewards = 200.0, policy_loss = 2846.998779296875, value_loss = 224342.828125
1845: episode_rewards = 200.0, policy_loss = 2796.6220703125, value_loss = 238889.265625
1846: episode_rewards = 200.0, policy_loss = 2540.490478515625, value_loss = 225995.390625
1847: episode_rewards = 200.0, policy_loss = 2463.93359375, value_loss = 218327.390625
1848: episode_rewards = 164.0, policy_loss = 1830.203125, value_loss = 142117.171875
1849: episode_rewards = 164.0, policy_loss = 1832.0919189453125, value_loss = 140025.125
1850: episode_rewards = 200.0, policy_loss = 2622.5205078125, value_loss = 214697.234375
1851: episode_rewards =

1932: episode_rewards = 174.0, policy_loss = 1710.78173828125, value_loss = 142564.578125
1933: episode_rewards = 200.0, policy_loss = 2364.378662109375, value_loss = 200869.53125
1934: episode_rewards = 184.0, policy_loss = 1411.1220703125, value_loss = 166932.46875
1935: episode_rewards = 200.0, policy_loss = 2254.45166015625, value_loss = 196470.703125
1936: episode_rewards = 128.0, policy_loss = 526.2305908203125, value_loss = 60828.68359375
1937: episode_rewards = 196.0, policy_loss = 1792.469970703125, value_loss = 189780.1875
1938: episode_rewards = 195.0, policy_loss = 2387.105712890625, value_loss = 187296.4375
1939: episode_rewards = 200.0, policy_loss = 2091.43701171875, value_loss = 183030.921875
1940: episode_rewards = 46.0, policy_loss = -583.773193359375, value_loss = 19446.439453125
1941: episode_rewards = 200.0, policy_loss = 2370.7001953125, value_loss = 180984.84375
1942: episode_rewards = 200.0, policy_loss = 2176.4560546875, value_loss = 186872.015625
1943: episode

In [None]:
env.render(close=True)
env.close()

In [None]:
env = gym.make('Pendulum-v0')

In [None]:
input_size = env.observation_space.shape[0]
output_size = env.action_space.shape[0]
print('input_size = {0}, output_size = {1}'.format(input_size, output_size))

In [None]:
model = ActorCritic(input_size, output_size)
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# vanilla policy gradient
for i in range(2000):
  current_state = env.reset() # an array of 4 values
  done = False
  episode_reward = 0
  
  values = []
  logprobs = []
  rewards = []

  while not done:
     # forward propagation on policy using current_state
    value, action_real = model(Variable(torch.from_numpy(current_state).float().unsqueeze(0)))
    break
    # value and action_real are variables
#     action_logprob = F.log_softmax(action_real) # returns a variable
    
#     action_prob = F.softmax(action_real) # returns a variable
#     action = action_prob.multinomial().data # returns a torch tensor
    
#     logprob = action_logprob.gather(1, Variable(action))
#     current_state, reward, done, _ = env.step(action.numpy()[0,0])
    
#     values.append(value) # variable
#     logprobs.append(logprob) # variable
#     rewards.append(reward) # numpy
    
#     episode_reward += reward
  
#   R = 0
#   value_loss = 0
#   policy_loss = 0
#   for j in reversed(range(len(rewards))):
#     R = rewards[j] + gamma * R # numpy
#     advantage = R - values[j]
#     value_loss += advantage.pow(2)
#     policy_loss -= logprobs[j] * advantage
  
#   optimizer.zero_grad()
#   value_loss.backward(retain_variables=True)
#   policy_loss.backward()
#   optimizer.step()
  
#   print('{0}: episode_rewards = {1}, policy_loss = {2}, value_loss = {3}'.format(
#     i, episode_reward, policy_loss.data[0,0], value_loss.data[0,0]))

In [None]:
F.tanh(action_real)