In [1]:
%config IPCompleter.greedy=True
%matplotlib inline

In [2]:
import pandas as pd, numpy as np, os, sys, cv2
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Apply a larger default font size to matplotlib's 'font' rc group.
font = dict(size=18)
matplotlib.rc('font', **font)

In [3]:
import gym

# Build the CartPole-v0 environment and reset it to obtain the first observation.
env = gym.make('CartPole-v0')
obs = env.reset()
# obs holds four values, in this order:
#   [0] horizontal position
#   [1] velocity               (positive = towards the right)
#   [2] angle of the pole      (positive = towards the right)
#   [3] angular velocity       (positive = towards the right)
print(obs)

[0.04096331 0.02844352 0.03862428 0.03755599]


In [4]:
# Draw the current state of the environment using render()'s default mode.
env.render()

True

In [5]:
# Request the rendered frame as an RGB array (mode='rgb_array') and print its dimensions.
img = env.render(mode = 'rgb_array')
print(img.shape)

(400, 600, 3)


In [6]:
# Inspect the set of valid actions. Meaning of the action values:
#   0 = accelerate left, 1 = accelerate right
env.action_space

Discrete(2)

In [7]:
# Take a single step with action 1 (accelerate right) and show everything step() returns.
action = 1
obs, reward, done, info = env.step(action)
for label, value in zip(('obs', 'reward', 'done', 'info'), (obs, reward, done, info)):
    print(label, value)
# Render the state reached after the step.
env.render()

obs [ 0.04153218  0.22299092  0.0393754  -0.24269475]
reward 1.0
done False
info {}


True

In [8]:
def basic_policy(obs):
    """Choose an action from the pole angle alone.

    Reads the angle from ``obs[2]``. Returns 0 (accelerate left) when the
    angle is negative, and 1 (accelerate right) otherwise.
    """
    pole_angle = obs[2]
    if pole_angle < 0:
        return 0
    return 1

In [9]:
# Evaluate the hard-coded policy: play many episodes and record each episode's total reward.
N_EPISODES = 500               # how many episodes to play
MAX_STEPS_PER_EPISODE = 1000   # upper bound on the number of steps in one episode

totals = []
for episode in range(N_EPISODES):
    episode_rewards = 0
    obs = env.reset()
    for step in range(MAX_STEPS_PER_EPISODE):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        # In this env the reward is 1.0 for every step, so the episode total
        # equals the number of steps taken before the episode ended.
        # info is for debugging only; never use it for training (that would be cheating).
        episode_rewards += reward
        if done:
            # The episode is over (e.g. the pole tilted too far); move on to the next one.
            break
    totals.append(episode_rewards)

In [10]:
# Summary of the per-episode totals, in order: mean, standard deviation, minimum, maximum.
print(*(stat(totals) for stat in (np.mean, np.std, np.min, np.max)))

42.774 8.673115011343963 24.0 68.0


In [11]:
import tensorflow as tf

# Layer sizes of the policy network.
n_inputs = 4   # width of the input placeholder (the env's observation has four entries)
n_hidden = 4   # units in the hidden layer
n_outputs = 1  # a single output; later cells use it as the probability of going left
# Weight initializer passed as kernel_initializer to the dense layers.
initializer = tf.contrib.layers.variance_scaling_initializer()

In [12]:
# Policy network: input -> one ELU hidden layer -> a single logit -> sigmoid.
# X accepts a batch of any size with n_inputs features per row.
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
# No activation on this layer: the raw logits are passed to the loss later on.
logits = tf.layers.dense(hidden, n_outputs, kernel_initializer=initializer)
# Sigmoid of the logit; the following cell uses this value as the probability of going left.
outputs = tf.nn.sigmoid(logits)

In [14]:
# Two columns per row: [P(left), P(right)], so that an action can be sampled from them.
p_left_and_right = tf.concat(axis=1, values=[outputs, 1-outputs])
# tf.multinomial takes log-probabilities; draw one action index per row
# (index 0 = left, index 1 = right, matching the column order above).
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)
# NOTE(review): this initializer op is built before the optimizer in the next cell.
# It covers the variables that exist at this point, so variables created afterwards
# would be left uninitialized -- confirm that init is rebuilt (or this line moved)
# once the whole graph, including the training op, has been constructed.
init = tf.global_variables_initializer()
# Target probability of going left implied by the sampled action.
y = 1.0 - tf.to_float(action) # 1.0 for left and 0.0 for right

In [None]:
learning_rate = 0.01
# Loss that treats the sampled action as the target: y is the target probability
# of going left, compared against the network's raw logit.
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
# compute_gradients exposes the gradients as (gradient, variable) pairs instead of
# applying them right away -- presumably so later code can process them first; confirm.
grads_and_vars = optimizer.compute_gradients(cross_entropy)
# Keep only the gradient tensors from the pairs.
gradients = [grad for grad, var in grads_and_vars]