In [1]:
import gym
from gym import wrappers
import torch
import torch.nn as nn
import torch.nn.init
import torch.nn.functional as F
from collections import namedtuple
from torch.autograd import Variable
import random
import numpy as np
from collections import deque

In [2]:
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, 'cartpole', force=True)

[2017-08-11 15:07:03,507] Making new env: CartPole-v0


In [3]:
class Policy(nn.Module):
  def __init__(self, input_size, output_size):
    super(Policy, self).__init__() # this statement is always needed
    
    self.fc1 = nn.Linear(input_size, 10) # matrix multiplication
    self.fc2 = nn.Linear(10, output_size) # matrix multiplication
    
    # == parameters initialization ==
    nn.init.xavier_normal(self.fc1.weight)
    nn.init.xavier_normal(self.fc2.weight)
    
    nn.init.normal(self.fc1.bias)
    nn.init.normal(self.fc2.bias)
    # =============================== 
    
  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return x

In [4]:
class ValueFunction(nn.Module):
  def __init__(self, input_size):
    super(ValueFunction, self).__init__()
    
    self.fc1 = nn.Linear(input_size, 10)
    self.fc2 = nn.Linear(10, 1) # a single value
    
    # == parameters initialization ==
    nn.init.xavier_normal(self.fc1.weight)
    nn.init.xavier_normal(self.fc2.weight)
    
    nn.init.normal(self.fc1.bias)
    nn.init.normal(self.fc2.bias)
    # ===============================
    
  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return x

In [5]:
class ActorCritic(nn.Module):
  def __init__(self, input_size, output_size):
    super(ActorCritic, self).__init__()
    self.actor = Policy(input_size, output_size)
    self.critic = ValueFunction(input_size)

  def forward(self, x):
    return self.critic(x), self.actor(x) # value, action

In [6]:
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
print('input_size = {0}, output_size = {1}'.format(input_size, output_size))

input_size = 4, output_size = 2


In [7]:
gamma = 0.99 # the parameter for discounting future rewards
lbda = 0.9 # the parameter for GAE (generalized advantage estimation)

In [8]:
model = ActorCritic(input_size, output_size)
optimizer = torch.optim.Adam(model.parameters())

In [10]:
# vanilla policy gradient
for i in range(2000):
  current_state = env.reset() # an array of 4 values
  done = False
  episode_reward = 0
  
  values = [] 
  logprobs = []
  rewards = []

  while not done:
     # forward propagation on policy using current_state
    value, action_real = model(Variable(torch.from_numpy(current_state).float().unsqueeze(0))) 
    # value and action_real are variables
    action_logprob = F.log_softmax(action_real) # returns a variable
    
    action_prob = F.softmax(action_real) # returns a variable
    action = action_prob.multinomial().data # returns a torch tensor
    
    logprob = action_logprob.gather(1, Variable(action))
    current_state, reward, done, _ = env.step(action.numpy()[0,0])
    
    values.append(value) # variable
    logprobs.append(logprob) # variable
    rewards.append(reward) # numpy
    
    episode_reward += reward
  
  R = 0
  value_loss = 0
  policy_loss = 0
  for j in reversed(range(len(rewards))):
    R = rewards[j] + gamma * R # numpy
    advantage = R - values[j]
    value_loss += advantage.pow(2)
    policy_loss -= logprobs[j] * advantage
  
  optimizer.zero_grad()
  value_loss.backward(retain_variables=True)
  policy_loss.backward()
  optimizer.step()
  
  print('{0}: episode_rewards = {1}, policy_loss = {2}, value_loss = {3}'.format(
    i, episode_reward, policy_loss.data[0,0], value_loss.data[0,0]))

[2017-08-11 15:07:56,226] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000001.mp4
[2017-08-11 15:07:56,379] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000008.mp4


0: episode_rewards = 9.0, policy_loss = 0.4540056884288788, value_loss = 202.0786590576172
1: episode_rewards = 9.0, policy_loss = 0.5151804089546204, value_loss = 203.80227661132812
2: episode_rewards = 11.0, policy_loss = 0.5992369055747986, value_loss = 357.1293640136719
3: episode_rewards = 9.0, policy_loss = 0.4409686326980591, value_loss = 198.54904174804688
4: episode_rewards = 9.0, policy_loss = 0.48016950488090515, value_loss = 200.05174255371094
5: episode_rewards = 9.0, policy_loss = 0.47633662819862366, value_loss = 197.91897583007812
6: episode_rewards = 8.0, policy_loss = 0.3755933940410614, value_loss = 141.96722412109375
7: episode_rewards = 10.0, policy_loss = 0.4205549955368042, value_loss = 261.80126953125
8: episode_rewards = 9.0, policy_loss = 0.4100188612937927, value_loss = 196.6254119873047
9: episode_rewards = 10.0, policy_loss = 0.42579180002212524, value_loss = 260.97613525390625
10: episode_rewards = 11.0, policy_loss = 0.5124562978744507, value_loss = 345.3

[2017-08-11 15:07:56,606] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000027.mp4


12: episode_rewards = 9.0, policy_loss = 0.35967376828193665, value_loss = 189.41232299804688
13: episode_rewards = 10.0, policy_loss = 0.39962002635002136, value_loss = 257.19232177734375
14: episode_rewards = 10.0, policy_loss = 0.38829341530799866, value_loss = 254.92788696289062
15: episode_rewards = 8.0, policy_loss = 0.28707417845726013, value_loss = 134.04171752929688
16: episode_rewards = 9.0, policy_loss = 0.33131369948387146, value_loss = 186.01638793945312
17: episode_rewards = 8.0, policy_loss = 0.31050679087638855, value_loss = 132.65701293945312
18: episode_rewards = 10.0, policy_loss = 0.377653568983078, value_loss = 250.53289794921875
19: episode_rewards = 10.0, policy_loss = 0.37164852023124695, value_loss = 251.1102294921875
20: episode_rewards = 9.0, policy_loss = 0.30034303665161133, value_loss = 183.07298278808594
21: episode_rewards = 10.0, policy_loss = 0.35380735993385315, value_loss = 249.25430297851562
22: episode_rewards = 9.0, policy_loss = 0.294263452291488

[2017-08-11 15:07:56,912] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000064.mp4



28: episode_rewards = 9.0, policy_loss = 0.30135947465896606, value_loss = 177.0072479248047
29: episode_rewards = 8.0, policy_loss = 0.2114933878183365, value_loss = 125.60525512695312
30: episode_rewards = 9.0, policy_loss = 0.26959824562072754, value_loss = 177.958984375
31: episode_rewards = 9.0, policy_loss = 0.2622501254081726, value_loss = 176.84683227539062
32: episode_rewards = 10.0, policy_loss = 0.28738367557525635, value_loss = 236.89840698242188
33: episode_rewards = 9.0, policy_loss = 0.2330101728439331, value_loss = 173.76068115234375
34: episode_rewards = 10.0, policy_loss = 0.298147976398468, value_loss = 236.4781494140625
35: episode_rewards = 9.0, policy_loss = 0.25410690903663635, value_loss = 172.66038513183594
36: episode_rewards = 9.0, policy_loss = 0.21273139119148254, value_loss = 171.84352111816406
37: episode_rewards = 8.0, policy_loss = 0.20066721737384796, value_loss = 123.21957397460938
38: episode_rewards = 10.0, policy_loss = 0.2633245587348938, value_l

[2017-08-11 15:07:57,327] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000125.mp4


102: episode_rewards = 9.0, policy_loss = 0.1615249514579773, value_loss = 142.93869018554688
103: episode_rewards = 13.0, policy_loss = 50.299652099609375, value_loss = 485.093994140625
104: episode_rewards = 10.0, policy_loss = 0.21195320785045624, value_loss = 198.5533447265625
105: episode_rewards = 9.0, policy_loss = 0.15885542333126068, value_loss = 142.76284790039062
106: episode_rewards = 11.0, policy_loss = 0.2233603298664093, value_loss = 264.0372619628906
107: episode_rewards = 10.0, policy_loss = 0.22328034043312073, value_loss = 194.66659545898438
108: episode_rewards = 9.0, policy_loss = 0.1716698408126831, value_loss = 142.4377899169922
109: episode_rewards = 9.0, policy_loss = 0.17298290133476257, value_loss = 139.60752868652344
110: episode_rewards = 10.0, policy_loss = 0.2058373987674713, value_loss = 193.838623046875
111: episode_rewards = 9.0, policy_loss = 0.1771107316017151, value_loss = 140.95672607421875
112: episode_rewards = 8.0, policy_loss = 0.16253238916397

[2017-08-11 15:07:57,921] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000216.mp4


198: episode_rewards = 9.0, policy_loss = 0.18562684953212738, value_loss = 111.95380401611328
199: episode_rewards = 9.0, policy_loss = 0.18265585601329803, value_loss = 111.78892517089844
200: episode_rewards = 9.0, policy_loss = 0.1503296196460724, value_loss = 108.98908996582031
201: episode_rewards = 9.0, policy_loss = 0.17541861534118652, value_loss = 109.03046417236328
202: episode_rewards = 10.0, policy_loss = 0.21345223486423492, value_loss = 154.6090545654297
203: episode_rewards = 9.0, policy_loss = 0.17549756169319153, value_loss = 109.6591796875
204: episode_rewards = 11.0, policy_loss = 0.2544634938240051, value_loss = 212.361572265625
205: episode_rewards = 9.0, policy_loss = 0.17249943315982819, value_loss = 108.81248474121094
206: episode_rewards = 10.0, policy_loss = 0.20456941425800323, value_loss = 152.26715087890625
207: episode_rewards = 10.0, policy_loss = 0.20303955674171448, value_loss = 153.03799438476562
208: episode_rewards = 10.0, policy_loss = 0.2255731523

291: episode_rewards = 8.0, policy_loss = 0.1101057380437851, value_loss = 56.327964782714844
292: episode_rewards = 10.0, policy_loss = 0.1705368161201477, value_loss = 123.33531951904297
293: episode_rewards = 9.0, policy_loss = 0.1218762993812561, value_loss = 84.5706787109375
294: episode_rewards = 9.0, policy_loss = 0.15145981311798096, value_loss = 84.57569885253906
295: episode_rewards = 10.0, policy_loss = 0.16924329102039337, value_loss = 122.306640625
296: episode_rewards = 9.0, policy_loss = 0.13977868854999542, value_loss = 84.16154479980469
297: episode_rewards = 13.0, policy_loss = 42.309913635253906, value_loss = 320.4197692871094
298: episode_rewards = 10.0, policy_loss = 0.17925119400024414, value_loss = 121.75326538085938
299: episode_rewards = 9.0, policy_loss = 0.12823215126991272, value_loss = 82.8240737915039
300: episode_rewards = 10.0, policy_loss = 0.1771213263273239, value_loss = 120.41238403320312
301: episode_rewards = 10.0, policy_loss = 0.15436775982379913

[2017-08-11 15:07:58,714] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000343.mp4


331: episode_rewards = 8.0, policy_loss = 0.09785627573728561, value_loss = 48.759117126464844
332: episode_rewards = 9.0, policy_loss = 0.1335420161485672, value_loss = 74.58706665039062
333: episode_rewards = 9.0, policy_loss = 0.13665851950645447, value_loss = 74.54362487792969
334: episode_rewards = 10.0, policy_loss = 0.17343175411224365, value_loss = 109.02214050292969
335: episode_rewards = 10.0, policy_loss = 0.1929684281349182, value_loss = 109.61558532714844
336: episode_rewards = 9.0, policy_loss = 0.15861740708351135, value_loss = 74.68222045898438
337: episode_rewards = 10.0, policy_loss = 0.1746133267879486, value_loss = 108.8531494140625
338: episode_rewards = 10.0, policy_loss = 0.15586714446544647, value_loss = 107.24420166015625
339: episode_rewards = 9.0, policy_loss = 0.1428074687719345, value_loss = 73.53034973144531
340: episode_rewards = 9.0, policy_loss = 0.14802271127700806, value_loss = 73.65005493164062
341: episode_rewards = 9.0, policy_loss = 0.144950121641

420: episode_rewards = 10.0, policy_loss = 0.1307719349861145, value_loss = 82.88251495361328
421: episode_rewards = 10.0, policy_loss = 22.032005310058594, value_loss = 85.9354248046875
422: episode_rewards = 11.0, policy_loss = 28.898014068603516, value_loss = 123.08302307128906
423: episode_rewards = 10.0, policy_loss = 0.12528520822525024, value_loss = 81.23474884033203
424: episode_rewards = 10.0, policy_loss = 0.13285917043685913, value_loss = 81.4316177368164
425: episode_rewards = 10.0, policy_loss = 0.15743470191955566, value_loss = 82.78907012939453
426: episode_rewards = 10.0, policy_loss = 0.15846925973892212, value_loss = 81.7868423461914
427: episode_rewards = 8.0, policy_loss = 0.08466783165931702, value_loss = 34.33065414428711
428: episode_rewards = 8.0, policy_loss = 0.07445204257965088, value_loss = 33.912147521972656
429: episode_rewards = 9.0, policy_loss = 0.10282236337661743, value_loss = 52.529422760009766
430: episode_rewards = 9.0, policy_loss = 0.109966188669

[2017-08-11 15:07:59,733] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000512.mp4


500: episode_rewards = 11.0, policy_loss = 0.178320974111557, value_loss = 94.68911743164062
501: episode_rewards = 9.0, policy_loss = 0.11044702678918839, value_loss = 39.6640625
502: episode_rewards = 9.0, policy_loss = 0.12112610042095184, value_loss = 40.02281188964844
503: episode_rewards = 9.0, policy_loss = 0.12463869154453278, value_loss = 39.83607864379883
504: episode_rewards = 10.0, policy_loss = 0.1525120735168457, value_loss = 62.42292785644531
505: episode_rewards = 10.0, policy_loss = 0.1326625943183899, value_loss = 61.59510803222656
506: episode_rewards = 9.0, policy_loss = 0.11285839974880219, value_loss = 38.58738327026367
507: episode_rewards = 9.0, policy_loss = 0.10628862679004669, value_loss = 38.80278778076172
508: episode_rewards = 9.0, policy_loss = 0.09652441740036011, value_loss = 38.07807922363281
509: episode_rewards = 9.0, policy_loss = 0.11269187927246094, value_loss = 38.28588104248047
510: episode_rewards = 9.0, policy_loss = 0.10993184894323349, value

591: episode_rewards = 10.0, policy_loss = 0.12857547402381897, value_loss = 43.28618240356445
592: episode_rewards = 10.0, policy_loss = 0.1174580454826355, value_loss = 42.37959289550781
593: episode_rewards = 10.0, policy_loss = 0.10533970594406128, value_loss = 40.517154693603516
594: episode_rewards = 8.0, policy_loss = 0.049192652106285095, value_loss = 15.572514533996582
595: episode_rewards = 11.0, policy_loss = 0.19144956767559052, value_loss = 72.576171875
596: episode_rewards = 9.0, policy_loss = 0.09525096416473389, value_loss = 25.186521530151367
597: episode_rewards = 10.0, policy_loss = 0.12050044536590576, value_loss = 41.765743255615234
598: episode_rewards = 9.0, policy_loss = 0.08391314744949341, value_loss = 24.672622680664062
599: episode_rewards = 9.0, policy_loss = 0.1033593937754631, value_loss = 25.281436920166016
600: episode_rewards = 9.0, policy_loss = 0.08817292749881744, value_loss = 24.13471221923828
601: episode_rewards = 9.0, policy_loss = 0.07571342587

[2017-08-11 15:08:01,015] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video000729.mp4


709: episode_rewards = 10.0, policy_loss = 11.820038795471191, value_loss = 26.94593620300293
710: episode_rewards = 9.0, policy_loss = 0.041131049394607544, value_loss = 13.043721199035645
711: episode_rewards = 8.0, policy_loss = 0.006825835444033146, value_loss = 10.080507278442383
712: episode_rewards = 9.0, policy_loss = 0.0543476939201355, value_loss = 13.2847900390625
713: episode_rewards = 10.0, policy_loss = 0.10250270366668701, value_loss = 23.370807647705078
714: episode_rewards = 8.0, policy_loss = 0.004435541108250618, value_loss = 10.00921630859375
715: episode_rewards = 9.0, policy_loss = 0.06208522617816925, value_loss = 13.187368392944336
716: episode_rewards = 9.0, policy_loss = 0.05311362445354462, value_loss = 12.30436897277832
717: episode_rewards = 10.0, policy_loss = 0.09364919364452362, value_loss = 22.147611618041992
718: episode_rewards = 9.0, policy_loss = 0.0513799712061882, value_loss = 12.316869735717773
719: episode_rewards = 10.0, policy_loss = 0.1108124

805: episode_rewards = 9.0, policy_loss = 0.04022090509533882, value_loss = 6.668889999389648
806: episode_rewards = 10.0, policy_loss = 0.09092971682548523, value_loss = 15.354763984680176
807: episode_rewards = 8.0, policy_loss = -0.017855968326330185, value_loss = 6.555120468139648
808: episode_rewards = 9.0, policy_loss = 0.03314086049795151, value_loss = 6.572776794433594
809: episode_rewards = 10.0, policy_loss = 0.08020272850990295, value_loss = 14.674578666687012
810: episode_rewards = 9.0, policy_loss = 0.04798110947012901, value_loss = 6.574707984924316
811: episode_rewards = 9.0, policy_loss = 0.038652338087558746, value_loss = 6.323940277099609
812: episode_rewards = 9.0, policy_loss = 0.036297574639320374, value_loss = 6.333554744720459
813: episode_rewards = 10.0, policy_loss = -0.944423258304596, value_loss = 16.601478576660156
814: episode_rewards = 10.0, policy_loss = 0.06900979578495026, value_loss = 12.947423934936523
815: episode_rewards = 9.0, policy_loss = 0.04613

917: episode_rewards = 8.0, policy_loss = -0.032510098069906235, value_loss = 4.880565166473389
918: episode_rewards = 10.0, policy_loss = 0.07347722351551056, value_loss = 8.443977355957031
919: episode_rewards = 9.0, policy_loss = 0.01811615377664566, value_loss = 2.4130477905273438
920: episode_rewards = 10.0, policy_loss = 0.06732339411973953, value_loss = 8.201104164123535
921: episode_rewards = 9.0, policy_loss = 0.028997106477618217, value_loss = 2.643036365509033
922: episode_rewards = 12.0, policy_loss = 14.126791954040527, value_loss = 39.33110046386719
923: episode_rewards = 9.0, policy_loss = 0.019100189208984375, value_loss = 2.3054168224334717
924: episode_rewards = 10.0, policy_loss = 0.0703703761100769, value_loss = 8.395344734191895
925: episode_rewards = 9.0, policy_loss = 0.02358013018965721, value_loss = 2.368974447250366
926: episode_rewards = 9.0, policy_loss = 0.017845746129751205, value_loss = 2.567793846130371
927: episode_rewards = 10.0, policy_loss = 0.072272

[2017-08-11 15:08:02,550] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video001000.mp4


998: episode_rewards = 10.0, policy_loss = 0.057327475398778915, value_loss = 4.449995040893555
999: episode_rewards = 10.0, policy_loss = 0.06811793893575668, value_loss = 5.572504043579102
1000: episode_rewards = 10.0, policy_loss = 0.0781734436750412, value_loss = 6.145407676696777
1001: episode_rewards = 9.0, policy_loss = -0.00815085880458355, value_loss = 1.912166953086853
1002: episode_rewards = 8.0, policy_loss = -0.0682196095585823, value_loss = 5.273784637451172
1003: episode_rewards = 8.0, policy_loss = -0.0636446475982666, value_loss = 5.276512145996094
1004: episode_rewards = 9.0, policy_loss = -0.004672822542488575, value_loss = 1.7582513093948364
1005: episode_rewards = 9.0, policy_loss = -0.0035434020683169365, value_loss = 1.605814814567566
1006: episode_rewards = 9.0, policy_loss = 0.0018870895728468895, value_loss = 1.6293941736221313
1007: episode_rewards = 10.0, policy_loss = 0.05738202482461929, value_loss = 4.223491668701172
1008: episode_rewards = 10.0, policy_l

1091: episode_rewards = 9.0, policy_loss = -0.04525022208690643, value_loss = 1.8879729509353638
1092: episode_rewards = 9.0, policy_loss = -0.04505270719528198, value_loss = 1.6695125102996826
1093: episode_rewards = 11.0, policy_loss = 0.13729648292064667, value_loss = 11.36888599395752
1094: episode_rewards = 10.0, policy_loss = 2.587430477142334, value_loss = 8.545705795288086
1095: episode_rewards = 10.0, policy_loss = 0.046737924218177795, value_loss = 2.0183048248291016
1096: episode_rewards = 10.0, policy_loss = 0.04810412973165512, value_loss = 2.138235569000244
1097: episode_rewards = 9.0, policy_loss = -0.04936470836400986, value_loss = 2.3565125465393066
1098: episode_rewards = 9.0, policy_loss = -0.030117092654109, value_loss = 1.242220163345337
1099: episode_rewards = 9.0, policy_loss = -0.05075039714574814, value_loss = 1.940625786781311
1100: episode_rewards = 11.0, policy_loss = 4.147678852081299, value_loss = 13.112318992614746
1101: episode_rewards = 9.0, policy_loss

1204: episode_rewards = 11.0, policy_loss = 5.686311721801758, value_loss = 7.085792541503906
1205: episode_rewards = 12.0, policy_loss = 7.067632675170898, value_loss = 18.97219467163086
1206: episode_rewards = 8.0, policy_loss = -0.33128514885902405, value_loss = 10.668074607849121
1207: episode_rewards = 10.0, policy_loss = 0.10404828935861588, value_loss = 1.94473397731781
1208: episode_rewards = 13.0, policy_loss = 13.311447143554688, value_loss = 28.33692169189453
1209: episode_rewards = 9.0, policy_loss = -0.09301367402076721, value_loss = 0.976841151714325
1210: episode_rewards = 10.0, policy_loss = 0.04504149407148361, value_loss = 0.8860636353492737
1211: episode_rewards = 9.0, policy_loss = -0.1695186346769333, value_loss = 2.4567513465881348
1212: episode_rewards = 10.0, policy_loss = 0.01222538948059082, value_loss = 0.6340938806533813
1213: episode_rewards = 10.0, policy_loss = 0.10260308533906937, value_loss = 1.766053318977356
1214: episode_rewards = 9.0, policy_loss = 

1321: episode_rewards = 9.0, policy_loss = -2.9396965503692627, value_loss = 7.705119609832764
1322: episode_rewards = 10.0, policy_loss = 0.7069692611694336, value_loss = 2.0763907432556152
1323: episode_rewards = 8.0, policy_loss = -1.7696824073791504, value_loss = 6.9148149490356445
1324: episode_rewards = 13.0, policy_loss = 7.947275638580322, value_loss = 50.21440887451172
1325: episode_rewards = 10.0, policy_loss = 0.03728456422686577, value_loss = 5.349637031555176
1326: episode_rewards = 8.0, policy_loss = -1.2517303228378296, value_loss = 7.90971040725708
1327: episode_rewards = 10.0, policy_loss = 0.6367296576499939, value_loss = 1.7967479228973389
1328: episode_rewards = 12.0, policy_loss = 5.456989765167236, value_loss = 17.259475708007812
1329: episode_rewards = 11.0, policy_loss = 2.511838912963867, value_loss = 10.915863037109375
1330: episode_rewards = 13.0, policy_loss = 11.380620956420898, value_loss = 58.59898376464844
1331: episode_rewards = 9.0, policy_loss = -1.72

1415: episode_rewards = 12.0, policy_loss = -4.676243305206299, value_loss = 75.18650817871094
1416: episode_rewards = 13.0, policy_loss = -11.171923637390137, value_loss = 115.50714874267578
1417: episode_rewards = 11.0, policy_loss = -12.75217342376709, value_loss = 73.8601303100586
1418: episode_rewards = 25.0, policy_loss = 61.893978118896484, value_loss = 1030.0321044921875
1419: episode_rewards = 21.0, policy_loss = 24.27030372619629, value_loss = 515.863037109375
1420: episode_rewards = 13.0, policy_loss = -4.334789276123047, value_loss = 46.50079345703125
1421: episode_rewards = 15.0, policy_loss = -7.5800909996032715, value_loss = 151.95907592773438
1422: episode_rewards = 22.0, policy_loss = 32.7385368347168, value_loss = 561.2296142578125
1423: episode_rewards = 19.0, policy_loss = 17.728660583496094, value_loss = 335.5420227050781
1424: episode_rewards = 21.0, policy_loss = 18.205242156982422, value_loss = 461.72509765625
1425: episode_rewards = 27.0, policy_loss = 61.57584

1507: episode_rewards = 157.0, policy_loss = 2905.837158203125, value_loss = 239158.40625
1508: episode_rewards = 157.0, policy_loss = 3033.677001953125, value_loss = 244622.96875
1509: episode_rewards = 129.0, policy_loss = 2205.07373046875, value_loss = 145756.421875
1510: episode_rewards = 200.0, policy_loss = 4952.796875, value_loss = 433243.15625
1511: episode_rewards = 35.0, policy_loss = -108.16517639160156, value_loss = 3629.47314453125
1512: episode_rewards = 166.0, policy_loss = 3282.666259765625, value_loss = 271840.34375
1513: episode_rewards = 23.0, policy_loss = -163.17848205566406, value_loss = 2840.414794921875
1514: episode_rewards = 104.0, policy_loss = 1315.236328125, value_loss = 77428.1953125
1515: episode_rewards = 200.0, policy_loss = 4717.39599609375, value_loss = 413982.1875
1516: episode_rewards = 157.0, policy_loss = 2830.8056640625, value_loss = 242601.9375
1517: episode_rewards = 147.0, policy_loss = 2700.991943359375, value_loss = 201206.703125
1518: episo

1601: episode_rewards = 200.0, policy_loss = 4085.4775390625, value_loss = 342065.65625
1602: episode_rewards = 200.0, policy_loss = 3853.398681640625, value_loss = 329018.59375
1603: episode_rewards = 122.0, policy_loss = 1203.7279052734375, value_loss = 92650.390625
1604: episode_rewards = 200.0, policy_loss = 3989.7890625, value_loss = 315369.875
1605: episode_rewards = 200.0, policy_loss = 3825.44482421875, value_loss = 328318.46875
1606: episode_rewards = 162.0, policy_loss = 2285.770751953125, value_loss = 192195.90625
1607: episode_rewards = 183.0, policy_loss = 2995.246826171875, value_loss = 256143.328125
1608: episode_rewards = 200.0, policy_loss = 3851.282470703125, value_loss = 326062.34375
1609: episode_rewards = 182.0, policy_loss = 3086.1943359375, value_loss = 266647.15625
1610: episode_rewards = 200.0, policy_loss = 3891.668212890625, value_loss = 316801.09375
1611: episode_rewards = 129.0, policy_loss = 1575.6568603515625, value_loss = 94647.3515625
1612: episode_rewa

1696: episode_rewards = 170.0, policy_loss = 1933.4473876953125, value_loss = 183480.078125
1697: episode_rewards = 149.0, policy_loss = 1589.3983154296875, value_loss = 123651.203125
1698: episode_rewards = 200.0, policy_loss = 2918.37744140625, value_loss = 272174.28125
1699: episode_rewards = 170.0, policy_loss = 2206.1494140625, value_loss = 183093.359375
1700: episode_rewards = 200.0, policy_loss = 3168.39599609375, value_loss = 257195.625
1701: episode_rewards = 200.0, policy_loss = 3257.19189453125, value_loss = 258817.78125
1702: episode_rewards = 189.0, policy_loss = 2666.194091796875, value_loss = 228897.890625
1703: episode_rewards = 187.0, policy_loss = 2647.990234375, value_loss = 227467.515625
1704: episode_rewards = 159.0, policy_loss = 1747.6080322265625, value_loss = 155466.078125
1705: episode_rewards = 200.0, policy_loss = 2888.162109375, value_loss = 264980.21875
1706: episode_rewards = 190.0, policy_loss = 2715.18798828125, value_loss = 240806.734375
1707: episode_

1790: episode_rewards = 200.0, policy_loss = 2861.952392578125, value_loss = 228184.96875
1791: episode_rewards = 200.0, policy_loss = 2796.263671875, value_loss = 201330.46875
1792: episode_rewards = 200.0, policy_loss = 2548.916015625, value_loss = 223180.53125
1793: episode_rewards = 200.0, policy_loss = 2348.494140625, value_loss = 211709.75
1794: episode_rewards = 200.0, policy_loss = 2547.34228515625, value_loss = 218190.203125
1795: episode_rewards = 200.0, policy_loss = 2478.6103515625, value_loss = 209455.03125
1796: episode_rewards = 105.0, policy_loss = 185.89710998535156, value_loss = 29286.986328125
1797: episode_rewards = 200.0, policy_loss = 2224.85888671875, value_loss = 208186.75
1798: episode_rewards = 200.0, policy_loss = 2601.0087890625, value_loss = 206835.046875
1799: episode_rewards = 200.0, policy_loss = 2384.2939453125, value_loss = 206695.15625
1800: episode_rewards = 200.0, policy_loss = 2552.56298828125, value_loss = 208312.375
1801: episode_rewards = 200.0,

1882: episode_rewards = 200.0, policy_loss = 2060.97998046875, value_loss = 169322.578125
1883: episode_rewards = 200.0, policy_loss = 2024.3671875, value_loss = 172820.296875
1884: episode_rewards = 172.0, policy_loss = 1234.513427734375, value_loss = 138708.96875
1885: episode_rewards = 200.0, policy_loss = 2035.096923828125, value_loss = 177234.359375
1886: episode_rewards = 200.0, policy_loss = 2083.822509765625, value_loss = 169474.125
1887: episode_rewards = 200.0, policy_loss = 2190.9970703125, value_loss = 171762.71875
1888: episode_rewards = 200.0, policy_loss = 2069.1796875, value_loss = 162745.6875
1889: episode_rewards = 200.0, policy_loss = 1762.640869140625, value_loss = 179456.90625
1890: episode_rewards = 200.0, policy_loss = 1797.278076171875, value_loss = 183658.265625
1891: episode_rewards = 200.0, policy_loss = 2250.953125, value_loss = 168037.671875
1892: episode_rewards = 200.0, policy_loss = 1950.8798828125, value_loss = 179444.421875
1893: episode_rewards = 200.

1975: episode_rewards = 174.0, policy_loss = 929.8939208984375, value_loss = 117288.015625
1976: episode_rewards = 200.0, policy_loss = 1517.2208251953125, value_loss = 156841.484375
1977: episode_rewards = 200.0, policy_loss = 1627.0682373046875, value_loss = 154239.921875
1978: episode_rewards = 200.0, policy_loss = 1706.16943359375, value_loss = 155471.78125
1979: episode_rewards = 200.0, policy_loss = 1461.419677734375, value_loss = 156414.078125
1980: episode_rewards = 191.0, policy_loss = 1272.302490234375, value_loss = 145794.015625
1981: episode_rewards = 200.0, policy_loss = 1481.617431640625, value_loss = 161632.640625
1982: episode_rewards = 200.0, policy_loss = 1618.44189453125, value_loss = 157397.0
1983: episode_rewards = 200.0, policy_loss = 1647.8310546875, value_loss = 160619.765625
1984: episode_rewards = 200.0, policy_loss = 1584.98828125, value_loss = 157966.125
1985: episode_rewards = 200.0, policy_loss = 1667.3214111328125, value_loss = 156353.734375
1986: episode

[2017-08-11 15:08:55,784] Starting new video recorder writing to /Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole/openaigym.video.0.85721.video002000.mp4


1998: episode_rewards = 200.0, policy_loss = 1496.383056640625, value_loss = 154714.140625
1999: episode_rewards = 200.0, policy_loss = 1553.1297607421875, value_loss = 151371.015625


In [11]:
env.render(close=True)
env.close()

[2017-08-11 15:08:57,476] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/fchua/Documents/torch_projects/pytorch_tutorials/pytorch-deep-rl/cartpole')


In [12]:
env = gym.make('Pendulum-v0')

[2017-08-11 15:30:57,078] Making new env: Pendulum-v0


In [21]:
input_size = env.observation_space.shape[0]
output_size = env.action_space.shape[0]
print('input_size = {0}, output_size = {1}'.format(input_size, output_size))

input_size = 3, output_size = 1


In [22]:
model = ActorCritic(input_size, output_size)
optimizer = torch.optim.Adam(model.parameters())

In [23]:
# vanilla policy gradient
for i in range(2000):
  current_state = env.reset() # an array of 4 values
  done = False
  episode_reward = 0
  
  values = []
  logprobs = []
  rewards = []

  while not done:
     # forward propagation on policy using current_state
    value, action_real = model(Variable(torch.from_numpy(current_state).float().unsqueeze(0)))
    break
    # value and action_real are variables
#     action_logprob = F.log_softmax(action_real) # returns a variable
    
#     action_prob = F.softmax(action_real) # returns a variable
#     action = action_prob.multinomial().data # returns a torch tensor
    
#     logprob = action_logprob.gather(1, Variable(action))
#     current_state, reward, done, _ = env.step(action.numpy()[0,0])
    
#     values.append(value) # variable
#     logprobs.append(logprob) # variable
#     rewards.append(reward) # numpy
    
#     episode_reward += reward
  
#   R = 0
#   value_loss = 0
#   policy_loss = 0
#   for j in reversed(range(len(rewards))):
#     R = rewards[j] + gamma * R # numpy
#     advantage = R - values[j]
#     value_loss += advantage.pow(2)
#     policy_loss -= logprobs[j] * advantage
  
#   optimizer.zero_grad()
#   value_loss.backward(retain_variables=True)
#   policy_loss.backward()
#   optimizer.step()
  
#   print('{0}: episode_rewards = {1}, policy_loss = {2}, value_loss = {3}'.format(
#     i, episode_reward, policy_loss.data[0,0], value_loss.data[0,0]))

In [25]:
F.tanh(action_real)

Variable containing:
 0.9017
[torch.FloatTensor of size 1x1]