# Solves the CartPole problem using policy gradients in TensorFlow

written October 2016 by Sam Greydanus

Inspired by https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5

In [1]:
%matplotlib notebook
import os

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [2]:
# --- network dimensions ---
n_obs = 4          # length of one observation vector
n_actions = 2      # number of discrete actions to choose from
h = 128            # width of the hidden layer

# --- optimisation settings ---
learning_rate = 0.01   # step size handed to the optimizer
decay = 0.9            # decay rate for the RMSProp gradient average
gamma = 0.9            # discount factor applied to rewards

In [3]:
# Weight matrices of the policy network, looked up by name.
# Each is drawn from a zero-mean truncated normal whose standard deviation
# is 1/sqrt(number of inputs to that layer).
tf_model = {}

with tf.variable_scope('layer_one', reuse=False):
    xavier_l1 = tf.truncated_normal_initializer(
        mean=0,
        stddev=1. / np.sqrt(n_obs),
        dtype=tf.float32,
    )
    # observation -> hidden
    tf_model['W1'] = tf.get_variable("W1", [n_obs, h], initializer=xavier_l1)

with tf.variable_scope('layer_two', reuse=False):
    xavier_l2 = tf.truncated_normal_initializer(
        mean=0,
        stddev=1. / np.sqrt(h),
        dtype=tf.float32,
    )
    # hidden -> one score per action
    tf_model['W2'] = tf.get_variable("W2", [h, n_actions], initializer=xavier_l2)

In [4]:
def tf_discount_rewards(tf_r): #tf_r ~ [game_steps,1]
    """Build the op that converts per-step rewards into discounted returns.

    The rewards are reversed along the first dimension, accumulated with
    ``tf.scan`` using ``running_total * gamma + reward``, and reversed back,
    so every row ends up holding its own reward plus the gamma-discounted
    rewards of all later rows.

    Args:
        tf_r: reward tensor with one row per game step.

    Returns:
        A tensor of discounted rewards with the same layout as ``tf_r``.
    """
    def _accumulate(running_total, reward):
        return running_total * gamma + reward

    flip_steps_only = [True, False]  # reverse dimension 0, leave dimension 1 alone
    rewards_backwards = tf.reverse(tf_r, flip_steps_only)
    returns_backwards = tf.scan(_accumulate, rewards_backwards)
    return tf.reverse(returns_backwards, flip_steps_only)

def tf_policy_forward(x): #x ~ [1,D]
    """Build the forward pass of the policy network.

    Applies ``W1`` followed by a ReLU, then ``W2`` followed by a softmax.

    Args:
        x: observation tensor with one observation per row.

    Returns:
        A tensor of action probabilities, one row per input row.
    """
    hidden = tf.nn.relu(tf.matmul(x, tf_model['W1']))
    logits = tf.matmul(hidden, tf_model['W2'])
    return tf.nn.softmax(logits)

def plt_dynamic(x, y, ax, colors=('b',)):
    """Plot ``y`` against ``x`` on ``ax`` and redraw the figure that owns it.

    Args:
        x: x-coordinates of the points to plot.
        y: y-coordinates of the points to plot.
        ax: axes to draw on; must provide ``plot`` and ``figure.canvas.draw``.
        colors: format strings passed to ``ax.plot``; the data is plotted
            once per entry.
    """
    for color in colors:
        ax.plot(x, y, color)
    # Redraw through the axes' own figure rather than a notebook-global `fig`,
    # so the helper works no matter which figure `ax` belongs to.
    ax.figure.canvas.draw()

In [5]:
# Create the environment and take the first observation.
env = gym.make("CartPole-v0")
observation = env.reset()

# Buffers for the episode in progress: observations, rewards, action labels.
xs = []
rs = []
ys = []

# Bookkeeping used by the training loop.
episode_number = 0
reward_sum = 0         # reward collected so far in the current episode
running_reward = 10    # usually starts around 10 for cartpole
total_steps = 500      # upper bound on episode_number in the training loop

[2016-10-17 12:18:03,911] Making new env: CartPole-v0
[2016-10-17 12:18:03,933] Creating monitor directory /tmp/gym-results
[2016-10-17 12:18:03,942] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31424.video000000.mp4


In [6]:
# Placeholders fed at run time; the first dimension is left open (None):
#   tf_x   - observations, n_obs columns
#   tf_y   - action labels, n_actions columns
#   tf_epr - per-step rewards, one column
tf_x = tf.placeholder(dtype=tf.float32, shape=[None, n_obs],name="tf_x")
tf_y = tf.placeholder(dtype=tf.float32, shape=[None, n_actions],name="tf_y")
tf_epr = tf.placeholder(dtype=tf.float32, shape=[None,1], name="tf_epr")

# Reward processing: discount the rewards, then standardize them by
# subtracting the mean and dividing by the standard deviation taken over
# dimension 0. The 1e-6 keeps the division finite when the variance is zero.
tf_discounted_epr = tf_discount_rewards(tf_epr)
tf_mean, tf_variance= tf.nn.moments(tf_discounted_epr, [0], shift=None, name="reward_moments")
tf_discounted_epr -= tf_mean
tf_discounted_epr /= tf.sqrt(tf_variance + 1e-6)

# Policy output, loss and training op.
# The loss is the L2 distance between the action labels and the predicted
# action probabilities. The standardized discounted rewards are handed to
# compute_gradients as grad_loss.
# NOTE(review): presumably this scales each step's gradient by its reward
# signal; that depends on how the installed TensorFlow version combines a
# [steps, 1] grad_loss with the l2_loss gradient - confirm before upgrading.
tf_aprob = tf_policy_forward(tf_x)
loss = tf.nn.l2_loss(tf_y-tf_aprob)
optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay)
tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=tf_discounted_epr)
train_op = optimizer.apply_gradients(tf_grads)

# Open a session and initialize every variable created so far.
sess = tf.InteractiveSession()
tf.initialize_all_variables().run()

In [7]:
# Training loop: play episodes with the current policy, update the network
# once per finished episode, and plot the running reward as training goes on.

# Running-reward level that ends training early and is reported as "solved".
# Used for the loop condition, the final check and the message so they agree.
# NOTE(review): the logged runs never show an episode reward above 200, so
# this level looks unreachable and the loop runs until the episode cap -
# confirm the intended value.
solved_threshold = 225

fig, ax = plt.subplots(1, 1)
ax.set_xlabel('X') ; ax.set_ylabel('Y')
ax.set_xlim(0, total_steps) ; ax.set_ylim(0, 200)
pxs, pys = [], []  # episode numbers and running rewards collected for the plot

# Single-argument print(...) produces the same output on Python 2 and 3.
print('episode {}: starting up...'.format(episode_number))
while episode_number <= total_steps and running_reward < solved_threshold:
    # if episode_number%25==0: env.render()

    # stochastically sample an action from the network's output probabilities
    x = observation
    feed = {tf_x: np.reshape(x, (1, -1))}
    aprob = sess.run(tf_aprob, feed)
    aprob = aprob[0, :]
    action = np.random.choice(n_actions, p=aprob)
    label = np.zeros_like(aprob)
    label[action] = 1  # one-hot record of the action actually taken

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    # record game history
    xs.append(x)
    ys.append(label)
    rs.append(reward)

    if done:
        # exponential moving average of the per-episode reward
        running_reward = running_reward * 0.99 + reward_sum * 0.01

        # stack the episode history into arrays with one row per step
        epx = np.vstack(xs)
        epr = np.vstack(rs)
        epy = np.vstack(ys)
        xs, rs, ys = [], [], []  # reset game history

        feed = {tf_x: epx, tf_epr: epr, tf_y: epy}
        sess.run(train_op, feed)  # parameter update

        # visualization
        pxs.append(episode_number)
        pys.append(running_reward)
        if episode_number % 25 == 0:
            # NOTE(review): '{:3f}' sets a minimum width of 3, not 3 decimals;
            # '{:.3f}' may have been intended. Left unchanged to keep output.
            print('ep: {}, reward: {}, mean reward: {:3f}'.format(
                episode_number, reward_sum, running_reward))
            plt_dynamic(pxs, pys, ax)

        # get ready for the next episode
        episode_number += 1
        observation = env.reset()
        reward_sum = 0

plt_dynamic(pxs, pys, ax)
if running_reward > solved_threshold:
    print("ep: {}: SOLVED! (running reward hit {} which is greater than {})".format(
        episode_number, running_reward, solved_threshold))

<IPython.core.display.Javascript object>

episode 0: starting up...


[2016-10-17 12:18:05,321] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31424.video000001.mp4


ep: 0, reward: 14.0, mean reward: 10.040000


[2016-10-17 12:18:06,327] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31424.video000008.mp4
[2016-10-17 12:18:07,365] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31424.video000027.mp4


ep: 25, reward: 73.0, mean reward: 16.524940


[2016-10-17 12:18:08,748] Ending episode 38 because it reached the timestep limit of 200.
[2016-10-17 12:18:09,615] Ending episode 50 because it reached the timestep limit of 200.
[2016-10-17 12:18:09,881] Ending episode 52 because it reached the timestep limit of 200.


ep: 50, reward: 116.0, mean reward: 33.782137


[2016-10-17 12:18:10,007] Ending episode 53 because it reached the timestep limit of 200.
[2016-10-17 12:18:10,170] Ending episode 55 because it reached the timestep limit of 200.
[2016-10-17 12:18:10,640] Ending episode 61 because it reached the timestep limit of 200.
[2016-10-17 12:18:10,872] Ending episode 64 because it reached the timestep limit of 200.
[2016-10-17 12:18:10,904] Starting new video recorder writing to /tmp/gym-results/openaigym.video.0.31424.video000064.mp4
[2016-10-17 12:18:13,509] Ending episode 66 because it reached the timestep limit of 200.
[2016-10-17 12:18:13,751] Ending episode 68 because it reached the timestep limit of 200.
[2016-10-17 12:18:13,958] Ending episode 70 because it reached the timestep limit of 200.
[2016-10-17 12:18:14,098] Ending episode 71 because it reached the timestep limit of 200.
[2016-10-17 12:18:14,226] Ending episode 72 because it reached the timestep limit of 200.
[2016-10-17 12:18:14,469] Ending episode 74 because it reached the t

ep: 75, reward: 102.0, mean reward: 61.625656


[2016-10-17 12:18:15,069] Ending episode 80 because it reached the timestep limit of 200.
[2016-10-17 12:18:15,635] Ending episode 85 because it reached the timestep limit of 200.
[2016-10-17 12:18:15,756] Ending episode 86 because it reached the timestep limit of 200.
[2016-10-17 12:18:15,973] Ending episode 88 because it reached the timestep limit of 200.
[2016-10-17 12:18:16,134] Ending episode 90 because it reached the timestep limit of 200.
[2016-10-17 12:18:16,743] Ending episode 101 because it reached the timestep limit of 200.


ep: 100, reward: 200.0, mean reward: 77.275739


[2016-10-17 12:18:17,114] Ending episode 105 because it reached the timestep limit of 200.
[2016-10-17 12:18:17,359] Ending episode 108 because it reached the timestep limit of 200.
[2016-10-17 12:18:17,550] Ending episode 110 because it reached the timestep limit of 200.
[2016-10-17 12:18:17,653] Ending episode 111 because it reached the timestep limit of 200.
[2016-10-17 12:18:17,867] Ending episode 113 because it reached the timestep limit of 200.
[2016-10-17 12:18:17,985] Ending episode 114 because it reached the timestep limit of 200.
[2016-10-17 12:18:18,189] Ending episode 116 because it reached the timestep limit of 200.
[2016-10-17 12:18:18,302] Ending episode 117 because it reached the timestep limit of 200.
[2016-10-17 12:18:18,744] Ending episode 121 because it reached the timestep limit of 200.
[2016-10-17 12:18:18,870] Ending episode 122 because it reached the timestep limit of 200.
[2016-10-17 12:18:18,986] Ending episode 123 because it reached the timestep limit of 200.

ep: 125, reward: 131.0, mean reward: 98.144202


[2016-10-17 12:18:21,704] Ending episode 127 because it reached the timestep limit of 200.
[2016-10-17 12:18:21,903] Ending episode 129 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,012] Ending episode 130 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,135] Ending episode 131 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,265] Ending episode 132 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,399] Ending episode 133 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,509] Ending episode 134 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,620] Ending episode 135 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,737] Ending episode 136 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,847] Ending episode 137 because it reached the timestep limit of 200.
[2016-10-17 12:18:22,944] Ending episode 138 because it reached the timestep limit of 200.

ep: 150, reward: 200.0, mean reward: 116.643226


[2016-10-17 12:18:24,545] Ending episode 153 because it reached the timestep limit of 200.
[2016-10-17 12:18:24,671] Ending episode 154 because it reached the timestep limit of 200.
[2016-10-17 12:18:24,794] Ending episode 155 because it reached the timestep limit of 200.
[2016-10-17 12:18:24,993] Ending episode 157 because it reached the timestep limit of 200.
[2016-10-17 12:18:25,111] Ending episode 158 because it reached the timestep limit of 200.
[2016-10-17 12:18:25,219] Ending episode 159 because it reached the timestep limit of 200.
[2016-10-17 12:18:25,336] Ending episode 160 because it reached the timestep limit of 200.
[2016-10-17 12:18:25,449] Ending episode 161 because it reached the timestep limit of 200.
[2016-10-17 12:18:25,556] Ending episode 162 because it reached the timestep limit of 200.
[2016-10-17 12:18:25,663] Ending episode 163 because it reached the timestep limit of 200.
[2016-10-17 12:18:25,765] Ending episode 164 because it reached the timestep limit of 200.

ep: 175, reward: 200.0, mean reward: 134.461088


[2016-10-17 12:18:27,505] Ending episode 178 because it reached the timestep limit of 200.
[2016-10-17 12:18:27,612] Ending episode 179 because it reached the timestep limit of 200.
[2016-10-17 12:18:27,734] Ending episode 180 because it reached the timestep limit of 200.
[2016-10-17 12:18:27,876] Ending episode 181 because it reached the timestep limit of 200.
[2016-10-17 12:18:28,091] Ending episode 183 because it reached the timestep limit of 200.
[2016-10-17 12:18:28,265] Ending episode 185 because it reached the timestep limit of 200.
[2016-10-17 12:18:28,389] Ending episode 186 because it reached the timestep limit of 200.
[2016-10-17 12:18:28,509] Ending episode 187 because it reached the timestep limit of 200.
[2016-10-17 12:18:28,624] Ending episode 188 because it reached the timestep limit of 200.
[2016-10-17 12:18:28,730] Ending episode 189 because it reached the timestep limit of 200.
[2016-10-17 12:18:28,844] Ending episode 190 because it reached the timestep limit of 200.

ep: 200, reward: 200.0, mean reward: 147.808051


[2016-10-17 12:18:30,489] Ending episode 203 because it reached the timestep limit of 200.
[2016-10-17 12:18:30,613] Ending episode 204 because it reached the timestep limit of 200.
[2016-10-17 12:18:30,722] Ending episode 205 because it reached the timestep limit of 200.
[2016-10-17 12:18:30,838] Ending episode 206 because it reached the timestep limit of 200.
[2016-10-17 12:18:31,000] Ending episode 208 because it reached the timestep limit of 200.
[2016-10-17 12:18:31,125] Ending episode 209 because it reached the timestep limit of 200.
[2016-10-17 12:18:31,240] Ending episode 210 because it reached the timestep limit of 200.
[2016-10-17 12:18:31,355] Ending episode 211 because it reached the timestep limit of 200.
[2016-10-17 12:18:31,462] Ending episode 212 because it reached the timestep limit of 200.
[2016-10-17 12:18:31,571] Ending episode 213 because it reached the timestep limit of 200.
[2016-10-17 12:18:31,671] Ending episode 214 because it reached the timestep limit of 200.

ep: 225, reward: 200.0, mean reward: 157.029076


[2016-10-17 12:18:37,287] Ending episode 234 because it reached the timestep limit of 200.
[2016-10-17 12:18:37,519] Ending episode 236 because it reached the timestep limit of 200.
[2016-10-17 12:18:37,731] Ending episode 238 because it reached the timestep limit of 200.
[2016-10-17 12:18:37,854] Ending episode 239 because it reached the timestep limit of 200.
[2016-10-17 12:18:37,982] Ending episode 240 because it reached the timestep limit of 200.
[2016-10-17 12:18:38,098] Ending episode 241 because it reached the timestep limit of 200.
[2016-10-17 12:18:38,219] Ending episode 242 because it reached the timestep limit of 200.
[2016-10-17 12:18:38,338] Ending episode 243 because it reached the timestep limit of 200.
[2016-10-17 12:18:38,459] Ending episode 244 because it reached the timestep limit of 200.
[2016-10-17 12:18:38,581] Ending episode 245 because it reached the timestep limit of 200.
[2016-10-17 12:18:38,706] Ending episode 246 because it reached the timestep limit of 200.

ep: 250, reward: 200.0, mean reward: 163.589072


[2016-10-17 12:18:39,632] Ending episode 253 because it reached the timestep limit of 200.
[2016-10-17 12:18:39,755] Ending episode 254 because it reached the timestep limit of 200.
[2016-10-17 12:18:39,880] Ending episode 255 because it reached the timestep limit of 200.
[2016-10-17 12:18:40,002] Ending episode 256 because it reached the timestep limit of 200.
[2016-10-17 12:18:40,121] Ending episode 257 because it reached the timestep limit of 200.
[2016-10-17 12:18:40,359] Ending episode 258 because it reached the timestep limit of 200.
[2016-10-17 12:18:40,485] Ending episode 259 because it reached the timestep limit of 200.
[2016-10-17 12:18:40,613] Ending episode 260 because it reached the timestep limit of 200.
[2016-10-17 12:18:40,732] Ending episode 261 because it reached the timestep limit of 200.
[2016-10-17 12:18:40,861] Ending episode 262 because it reached the timestep limit of 200.
[2016-10-17 12:18:40,991] Ending episode 263 because it reached the timestep limit of 200.

ep: 275, reward: 200.0, mean reward: 169.745041


[2016-10-17 12:18:42,943] Ending episode 279 because it reached the timestep limit of 200.
[2016-10-17 12:18:43,065] Ending episode 280 because it reached the timestep limit of 200.
[2016-10-17 12:18:43,185] Ending episode 281 because it reached the timestep limit of 200.
[2016-10-17 12:18:43,463] Ending episode 283 because it reached the timestep limit of 200.
[2016-10-17 12:18:43,594] Ending episode 284 because it reached the timestep limit of 200.
[2016-10-17 12:18:43,716] Ending episode 285 because it reached the timestep limit of 200.
[2016-10-17 12:18:43,851] Ending episode 286 because it reached the timestep limit of 200.
[2016-10-17 12:18:43,980] Ending episode 287 because it reached the timestep limit of 200.
[2016-10-17 12:18:44,108] Ending episode 288 because it reached the timestep limit of 200.
[2016-10-17 12:18:44,231] Ending episode 289 because it reached the timestep limit of 200.
[2016-10-17 12:18:44,363] Ending episode 290 because it reached the timestep limit of 200.

ep: 300, reward: 200.0, mean reward: 176.051112


[2016-10-17 12:18:46,047] Ending episode 303 because it reached the timestep limit of 200.
[2016-10-17 12:18:46,170] Ending episode 304 because it reached the timestep limit of 200.
[2016-10-17 12:18:46,296] Ending episode 305 because it reached the timestep limit of 200.
[2016-10-17 12:18:46,419] Ending episode 306 because it reached the timestep limit of 200.
[2016-10-17 12:18:46,549] Ending episode 307 because it reached the timestep limit of 200.
[2016-10-17 12:18:46,670] Ending episode 308 because it reached the timestep limit of 200.
[2016-10-17 12:18:46,789] Ending episode 309 because it reached the timestep limit of 200.
[2016-10-17 12:18:46,912] Ending episode 310 because it reached the timestep limit of 200.
[2016-10-17 12:18:47,036] Ending episode 311 because it reached the timestep limit of 200.
[2016-10-17 12:18:47,162] Ending episode 312 because it reached the timestep limit of 200.
[2016-10-17 12:18:47,284] Ending episode 313 because it reached the timestep limit of 200.

ep: 325, reward: 200.0, mean reward: 181.372043


[2016-10-17 12:18:49,192] Ending episode 328 because it reached the timestep limit of 200.
[2016-10-17 12:18:49,311] Ending episode 329 because it reached the timestep limit of 200.
[2016-10-17 12:18:49,433] Ending episode 330 because it reached the timestep limit of 200.
[2016-10-17 12:18:49,557] Ending episode 331 because it reached the timestep limit of 200.
[2016-10-17 12:18:49,683] Ending episode 332 because it reached the timestep limit of 200.
[2016-10-17 12:18:49,807] Ending episode 333 because it reached the timestep limit of 200.
[2016-10-17 12:18:49,931] Ending episode 334 because it reached the timestep limit of 200.
[2016-10-17 12:18:50,052] Ending episode 335 because it reached the timestep limit of 200.
[2016-10-17 12:18:50,178] Ending episode 336 because it reached the timestep limit of 200.
[2016-10-17 12:18:50,301] Ending episode 337 because it reached the timestep limit of 200.
[2016-10-17 12:18:50,431] Ending episode 338 because it reached the timestep limit of 200.

ep: 350, reward: 200.0, mean reward: 185.187344


[2016-10-17 12:18:55,688] Ending episode 353 because it reached the timestep limit of 200.
[2016-10-17 12:18:55,820] Ending episode 354 because it reached the timestep limit of 200.
[2016-10-17 12:18:55,957] Ending episode 355 because it reached the timestep limit of 200.
[2016-10-17 12:18:56,162] Ending episode 357 because it reached the timestep limit of 200.
[2016-10-17 12:18:56,368] Ending episode 359 because it reached the timestep limit of 200.
[2016-10-17 12:18:56,497] Ending episode 360 because it reached the timestep limit of 200.
[2016-10-17 12:18:56,609] Ending episode 361 because it reached the timestep limit of 200.
[2016-10-17 12:18:56,732] Ending episode 362 because it reached the timestep limit of 200.
[2016-10-17 12:18:56,855] Ending episode 363 because it reached the timestep limit of 200.
[2016-10-17 12:18:56,979] Ending episode 364 because it reached the timestep limit of 200.
[2016-10-17 12:18:57,114] Ending episode 365 because it reached the timestep limit of 200.

ep: 375, reward: 94.0, mean reward: 186.130176


[2016-10-17 12:18:58,739] Ending episode 378 because it reached the timestep limit of 200.
[2016-10-17 12:18:58,869] Ending episode 379 because it reached the timestep limit of 200.
[2016-10-17 12:18:59,001] Ending episode 380 because it reached the timestep limit of 200.
[2016-10-17 12:18:59,315] Ending episode 383 because it reached the timestep limit of 200.
[2016-10-17 12:18:59,510] Ending episode 385 because it reached the timestep limit of 200.
[2016-10-17 12:18:59,630] Ending episode 386 because it reached the timestep limit of 200.
[2016-10-17 12:18:59,825] Ending episode 388 because it reached the timestep limit of 200.
[2016-10-17 12:18:59,951] Ending episode 389 because it reached the timestep limit of 200.
[2016-10-17 12:19:00,066] Ending episode 390 because it reached the timestep limit of 200.
[2016-10-17 12:19:00,194] Ending episode 391 because it reached the timestep limit of 200.
[2016-10-17 12:19:00,408] Ending episode 393 because it reached the timestep limit of 200.

ep: 400, reward: 200.0, mean reward: 186.718380


[2016-10-17 12:19:01,753] Ending episode 403 because it reached the timestep limit of 200.
[2016-10-17 12:19:01,873] Ending episode 404 because it reached the timestep limit of 200.
[2016-10-17 12:19:01,997] Ending episode 405 because it reached the timestep limit of 200.
[2016-10-17 12:19:02,142] Ending episode 406 because it reached the timestep limit of 200.
[2016-10-17 12:19:02,276] Ending episode 407 because it reached the timestep limit of 200.
[2016-10-17 12:19:02,393] Ending episode 408 because it reached the timestep limit of 200.
[2016-10-17 12:19:02,522] Ending episode 409 because it reached the timestep limit of 200.
[2016-10-17 12:19:02,649] Ending episode 410 because it reached the timestep limit of 200.
[2016-10-17 12:19:02,768] Ending episode 411 because it reached the timestep limit of 200.
[2016-10-17 12:19:02,892] Ending episode 412 because it reached the timestep limit of 200.
[2016-10-17 12:19:03,017] Ending episode 413 because it reached the timestep limit of 200.

ep: 425, reward: 134.0, mean reward: 189.009272


[2016-10-17 12:19:04,880] Ending episode 428 because it reached the timestep limit of 200.
[2016-10-17 12:19:05,001] Ending episode 429 because it reached the timestep limit of 200.
[2016-10-17 12:19:05,130] Ending episode 430 because it reached the timestep limit of 200.
[2016-10-17 12:19:05,256] Ending episode 431 because it reached the timestep limit of 200.
[2016-10-17 12:19:05,382] Ending episode 432 because it reached the timestep limit of 200.
[2016-10-17 12:19:05,596] Ending episode 434 because it reached the timestep limit of 200.
[2016-10-17 12:19:05,718] Ending episode 435 because it reached the timestep limit of 200.
[2016-10-17 12:19:05,963] Ending episode 437 because it reached the timestep limit of 200.
[2016-10-17 12:19:06,093] Ending episode 438 because it reached the timestep limit of 200.
[2016-10-17 12:19:06,218] Ending episode 439 because it reached the timestep limit of 200.
[2016-10-17 12:19:06,349] Ending episode 440 because it reached the timestep limit of 200.

ep: 450, reward: 200.0, mean reward: 189.794273


[2016-10-17 12:19:08,024] Ending episode 453 because it reached the timestep limit of 200.
[2016-10-17 12:19:08,236] Ending episode 455 because it reached the timestep limit of 200.
[2016-10-17 12:19:08,360] Ending episode 456 because it reached the timestep limit of 200.
[2016-10-17 12:19:08,483] Ending episode 457 because it reached the timestep limit of 200.
[2016-10-17 12:19:08,607] Ending episode 458 because it reached the timestep limit of 200.
[2016-10-17 12:19:08,728] Ending episode 459 because it reached the timestep limit of 200.
[2016-10-17 12:19:08,857] Ending episode 460 because it reached the timestep limit of 200.
[2016-10-17 12:19:08,978] Ending episode 461 because it reached the timestep limit of 200.
[2016-10-17 12:19:09,315] Ending episode 464 because it reached the timestep limit of 200.
[2016-10-17 12:19:09,442] Ending episode 465 because it reached the timestep limit of 200.
[2016-10-17 12:19:09,728] Ending episode 468 because it reached the timestep limit of 200.

ep: 475, reward: 197.0, mean reward: 188.057199


[2016-10-17 12:19:10,896] Ending episode 478 because it reached the timestep limit of 200.
[2016-10-17 12:19:11,226] Ending episode 482 because it reached the timestep limit of 200.
[2016-10-17 12:19:11,547] Ending episode 485 because it reached the timestep limit of 200.
[2016-10-17 12:19:11,674] Ending episode 486 because it reached the timestep limit of 200.
[2016-10-17 12:19:11,985] Ending episode 489 because it reached the timestep limit of 200.
[2016-10-17 12:19:12,102] Ending episode 490 because it reached the timestep limit of 200.
[2016-10-17 12:19:12,233] Ending episode 491 because it reached the timestep limit of 200.
[2016-10-17 12:19:12,366] Ending episode 492 because it reached the timestep limit of 200.
[2016-10-17 12:19:12,492] Ending episode 493 because it reached the timestep limit of 200.
[2016-10-17 12:19:12,618] Ending episode 494 because it reached the timestep limit of 200.
[2016-10-17 12:19:12,742] Ending episode 495 because it reached the timestep limit of 200.

ep: 500, reward: 200.0, mean reward: 186.509912


[2016-10-17 12:19:13,711] Finished writing results. You can upload them to the scoreboard via gym.upload('/tmp/gym-results')


In [9]:
# Upload the recorded results to the OpenAI Gym scoreboard.
# The API key is read from the OPENAI_GYM_API_KEY environment variable so it
# never lives in the notebook; a missing variable fails loudly with KeyError.
# NOTE: a key used to be hardcoded on this line - treat it as leaked and
# revoke it.
gym.upload("/tmp/gym-results", api_key=os.environ["OPENAI_GYM_API_KEY"])

[2016-10-17 12:19:18,834] [CartPole-v0] Uploading 502 episodes of training data
[2016-10-17 12:19:20,642] [CartPole-v0] Uploading videos of 8 training episodes (66106 bytes)
[2016-10-17 12:19:21,027] [CartPole-v0] Creating evaluation object from /tmp/gym-results with learning curve and training video
[2016-10-17 12:19:21,228] 
****************************************************
You successfully uploaded your evaluation on CartPole-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_hpkg5wFHQ5WUjJnnea0wkw

****************************************************


<Evaluation evaluation id=eval_hpkg5wFHQ5WUjJnnea0wkw at 0x11b15f618> JSON: {
  "created": 1476721161, 
  "env": "CartPole-v0", 
  "id": "eval_hpkg5wFHQ5WUjJnnea0wkw", 
  "object": "evaluation"
}