In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import sys

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Reset the default TensorFlow graph and re-seed TensorFlow and NumPy.

    seed: value handed to both tf.set_random_seed and np.random.seed.

    NOTE(review): `tf` is not imported in this cell; presumably
    `import tensorflow as tf` runs before the first call -- confirm.
    """
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# To plot pretty figures and animations
%matplotlib nbagg
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied to every figure
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rl"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to PROJECT_ROOT_DIR/images/CHAPTER_ID/<fig_id>.png.
    The target directory is created when it is missing, so saving also works
    when the images folder has not been set up by hand.

    fig_id: file name without the ".png" extension.
    tight_layout: when true, plt.tight_layout() is called before saving.
    """
    images_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # os.makedirs(..., exist_ok=True) is Python 3 only and this notebook
    # also targets Python 2, hence the explicit check.
    if not os.path.isdir(images_dir):
        os.makedirs(images_dir)
    path = os.path.join(images_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)


In [2]:
import gym
# Create the 'MsPacman-v0' gym environment; reset() hands back the first observation
env = gym.make('MsPacman-v0')
obs = env.reset()

In [3]:
obs.shape

(210, 160, 3)

In [4]:
img = env.render(mode="rgb_array")

In [7]:
# Display the rendered frame `img` without axes and save it through save_fig
# (which writes images/rl/MsPacman.png under PROJECT_ROOT_DIR)
plt.figure(figsize=(5,4))
plt.imshow(img)
plt.axis("off")
save_fig("MsPacman")
plt.show()

<IPython.core.display.Javascript object>

Saving figure MsPacman


In [6]:
(img == obs).all()

True

In [8]:
def plot_environment(env, figsize=(5,4)):
    """Render `env` in "rgb_array" mode and show the image in a new figure.

    env: environment object exposing render(mode="rgb_array").
    figsize: (width, height) forwarded to plt.figure.
    """
    # Drop the current figure first, or else nbagg sometimes plots in the previous cell
    plt.close()
    plt.figure(figsize=figsize)
    plt.imshow(env.render(mode="rgb_array"))
    plt.axis("off")
    plt.show()

In [9]:
env.action_space

Discrete(9)

In [10]:
env.reset()
# Scripted moves as (action id, number of steps): 3 = left, 8 = lower-left
for action_id, n_repeats in ((3, 110), (8, 40)):
    for step in range(n_repeats):
        env.step(action_id)

In [11]:
plot_environment(env)

<IPython.core.display.Javascript object>

In [12]:
obs, reward, done, info = env.step(0)

In [13]:
obs.shape

(210, 160, 3)

In [20]:
reward

0.0

In [14]:
done

False

In [15]:
info

{'ale.lives': 3}

In [16]:
# Play one episode with random actions and collect every rendered frame in `frames`
frames = []

n_max_steps = 1000    # upper bound on the number of environment steps
n_change_steps = 10   # a new random action is drawn every this many steps

obs = env.reset()
for step in range(n_max_steps):
    # grab the frame before acting, so frames[0] is the state right after reset()
    img = env.render(mode="rgb_array")
    frames.append(img)
    # step 0 satisfies this test, so `action` is always bound before env.step uses it
    if step % n_change_steps == 0:
        action = env.action_space.sample() # play randomly
    obs, reward, done, info = env.step(action)
    if done:
        break

In [17]:
def update_scene(num, frames, patch):
    """Animation callback: put frame number `num` into the artist `patch`.

    num: index into `frames`.
    frames: indexable collection of images.
    patch: object with a set_data method (the artist being animated).

    Returns a one-element tuple containing `patch`.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib FuncAnimation that plays `frames` in a new figure.

    frames: sequence of images; frames[0] is drawn as the initial picture.
    repeat, interval: forwarded unchanged to animation.FuncAnimation.

    Returns the FuncAnimation object.
    """
    plt.close()  # or else nbagg sometimes plots in the previous cell
    fig = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    return animation.FuncAnimation(
        fig,
        update_scene,
        fargs=(frames, image_artist),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )

In [18]:
video = plot_animation(frames)
plt.show()

<IPython.core.display.Javascript object>

In [19]:
env.close()

# A simple environment: the Cart-Pole

In [34]:
env = gym.make("CartPole-v0")

In [35]:
obs = env.reset()

In [36]:
obs

array([ 0.00054529,  0.01670092, -0.00316285, -0.01649338])

In [37]:
from PIL import Image, ImageDraw

# Probe pyglet's GL layer to decide which cart-pole renderer to use
try:
    from pyglet.gl import gl_info
except Exception:
    # probably no X server available, let's use our own rendering function
    openai_cart_pole_rendering = False
else:
    # no problem, let's use OpenAI gym's rendering function
    openai_cart_pole_rendering = True

def render_cart_pole(env, obs):
    """Return a picture of the cart-pole.

    When the module-level flag openai_cart_pole_rendering is true, the result
    of env.render(mode="rgb_array") is returned. Otherwise a 600x400 RGB image
    is drawn with PIL from `obs` and returned as a NumPy array.

    env: the environment (only used by the gym rendering path).
    obs: 4-element observation, unpacked as (position, velocity, angle,
         angular velocity); only position and angle are used for drawing.
    """
    if openai_cart_pole_rendering:
        # use OpenAI gym's rendering function
        return env.render(mode="rgb_array")

    # rendering for the cart pole environment (in case OpenAI gym can't do it)
    canvas_w = 600
    canvas_h = 400
    cart_width = canvas_w // 12
    cart_height = canvas_h // 15
    pole_length = canvas_h // 3.5      # floor division by a float: result is a float
    pole_thickness = canvas_w // 80 + 1
    x_width = 2
    background = (255, 255, 255)
    cart_color = 0x000000  # Blue Green Red
    pole_color = 0x669acc  # Blue Green Red

    # all four values are unpacked so a wrongly sized obs still raises
    cart_pos, _cart_vel, pole_ang, _pole_ang_vel = obs
    canvas = Image.new('RGB', (canvas_w, canvas_h), background)
    pen = ImageDraw.Draw(canvas)

    cart_x = cart_pos * canvas_w // x_width + canvas_w // x_width
    cart_y = canvas_h * 95 // 100
    pole_tip_x = cart_x + pole_length * np.sin(pole_ang)
    pole_tip_y = cart_y - cart_height // 2 - pole_length * np.cos(pole_ang)

    # ground line across the full width
    pen.line((0, cart_y, canvas_w, cart_y), fill=0)
    # draw cart
    pen.rectangle(
        (cart_x - cart_width // 2, cart_y - cart_height // 2,
         cart_x + cart_width // 2, cart_y + cart_height // 2),
        fill=cart_color)
    # draw pole
    pen.line(
        (cart_x, cart_y - cart_height // 2, pole_tip_x, pole_tip_y),
        fill=pole_color, width=pole_thickness)
    return np.array(canvas)

def plot_cart_pole(env, obs):
    """Show the cart-pole image produced by render_cart_pole(env, obs), without axes."""
    # Drop the current figure first, or else nbagg sometimes plots in the previous cell
    plt.close()
    plt.imshow(render_cart_pole(env, obs))
    plt.axis("off")
    plt.show()

In [38]:
plot_cart_pole(env, obs)

<IPython.core.display.Javascript object>

In [39]:
env.action_space

Discrete(2)

In [40]:
obs = env.reset()
# Apply action 0 over and over until the environment reports the episode is done
done = False
while not done:
    obs, reward, done, info = env.step(0)

In [41]:
# Draw the cart-pole for the current `obs` and save the figure via save_fig
plt.close()  # or else nbagg sometimes plots in the previous cell
img = render_cart_pole(env, obs)
plt.imshow(img)
plt.axis("off")
save_fig("cart_pole_plot")

<IPython.core.display.Javascript object>

Saving figure cart_pole_plot


In [42]:
img.shape

(400, 600, 3)

In [43]:
obs = env.reset()
# Apply action 1 over and over until the environment reports the episode is done
done = False
while not done:
    obs, reward, done, info = env.step(1)

In [44]:
plot_cart_pole(env, obs)

<IPython.core.display.Javascript object>

In [47]:
# Run one episode with a hard-coded policy and keep every rendered frame in `frames`
frames = []

n_max_steps = 1000  # upper bound on the number of environment steps

obs = env.reset()
for step in range(n_max_steps):
    # render the state before acting, so frames[0] shows the state right after reset()
    img = render_cart_pole(env, obs)
    frames.append(img)

    # hard-coded policy: action 0 when the pole angle is negative, action 1 otherwise
    position, velocity, angle, angular_velocity = obs
    action = 0 if angle < 0 else 1

    obs, reward, done, info = env.step(action)
    if done:
        break

In [48]:
video = plot_animation(frames)
plt.show()

<IPython.core.display.Javascript object>

In [53]:
def basic_policy(obs):
    """Choose an action from the third observation entry (the angle).

    Returns 0 when obs[2] is negative and 1 otherwise.
    """
    pole_angle = obs[2]
    if pole_angle < 0:
        return 0
    return 1

# Evaluate basic_policy: play 500 episodes of at most 1000 steps each and
# store every episode's summed reward in `total`
total = []
for episode in range(500):
    rewards = 0  # running sum of the rewards received in this episode
    obs = env.reset()
    for step in range(1000):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        rewards += reward
        if done:
            break
    total.append(rewards)


In [54]:
print('mean: {}  std: {}  min: {} max: {}'.format(np.mean(total) , np.std(total), np.min(total), np.max(total)))

mean: 42.756  std: 8.908449023258763  min: 24.0 max: 72.0
