In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# In Google Colab, uncomment this:
# !wget https://bit.ly/2FMJP5K -O setup.py && bash setup.py


 # OpenAI Gym #

 We're going to spend the next several weeks learning algorithms that solve decision processes, so we need some interesting decision problems to test those algorithms on.

That's where OpenAI Gym comes into play. It's a Python library that wraps many classical decision problems, including robot control, video games and board games.

So here's how it works:

In [3]:
import gym
from gym import wrappers
from time import time

# Build the MountainCar task and wrap it in a Monitor that is pointed at a
# fresh, timestamp-named directory under ./videos/.
env = wrappers.Monitor(gym.make("MountainCar-v0"), './videos/' + str(time()))

# Uncomment to draw the initial frame:
# plt.imshow(env.render('rgb_array')) 
# plt.show()

# Show what the agent observes and which actions it may take.
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Box(2,)
Action space: Discrete(3)


## Gym interface ##

The three main methods of an environment are

* **reset()** - reset the environment to its initial state and return the first observation
* **render()** - show current environment state (a more colorful version :) )
* **step(a)** - commit action **a** and return (new observation, reward, is done, info)
    * *new observation* - an observation right after committing the action **a**
    * *reward* - a number representing your reward for committing action **a**
    * *is done* - True if the MDP has just finished, False if still in progress
    * *info* - some auxiliary stuff about what just happened. Ignore it for now.


## Play with it

Below is the code that drives the car to the right.

However, it doesn't reach the flag at the far right due to gravity.

**Your task** is to fix it. Find a strategy that reaches the flag.

You're not required to build any sophisticated algorithms for now, feel free to hard-code :)

Hint: your action at each step should depend either on t or on s.

In [4]:
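# NOTE: unused draft of a generic evaluation loop, kept commented out for reference only.
# It calls a `print_stats` helper that is not defined in the cells shown here.
# def test(args, env, agent):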
#     if args.record:
#         if 'env' in vars(args):
#             env = wrappers.Monitor(env, './videos/' + args.env + str(time()) + '/')
#         else:
#             env = wrappers.Monitor(env, './videos/' + str(time()) + '/')
#     test_rewards = []
#     test_start = time()
#     test_steps = 0
#     for iteration in range(1, 1 + args.n_test_iter):
#         state = env.reset()
#         iter_rewards = 0.0
#         done = False
#         while not done:
#             test_steps += 1
#             action, _ = agent.forward(state)
#             state, reward, done, _ = env.step(action)
#             iter_rewards += reward
#         test_rewards.append(iter_rewards)
#     print_stats('Test', test_rewards, args.n_test_iter,
#                 time() - test_start, test_steps, 0, agent)
#     return test_rewards

In [5]:
# Cap the episode length. Please don't change this.
# NOTE: `env` is the environment built in the earlier cell; the line below wraps
# it once more, so re-running this cell stacks another TimeLimit on top.
# NOTE(review): gym.make("MountainCar-v0") may already apply its own step limit —
# confirm which limit actually ends the episode.
TIME_LIMIT = 250
env = gym.wrappers.TimeLimit(env, max_episode_steps=TIME_LIMIT + 1)

obs0 = env.reset()
print("initial observation code:", obs0)
# Note: in MountainCar, observation is just two numbers: car position and velocity

print("taking action 2 (right)")
new_obs, reward, is_done, _ = env.step(2)
# Note: as you can see, the car has moved to the right slightly


print("new observation code:", new_obs)
print("reward:", reward)
print("is game over?:", is_done)

actions = {'left': 0, 'stop': 1, 'right': 2}

# Position the car must exceed to count as a success; same threshold the
# notebook's final check asserts on.
FLAG_POSITION = 0.47

# prepare "display"
#% matplotlib inline
from IPython.display import clear_output

peak_cnt = 0           # number of turning points (velocity sign changes) seen so far
prev_coord = obs0[0]   # not used in this cell; kept in case a later cell reads it

# Continue from the state reached by the demo step above, so that the demo step
# plus at most TIME_LIMIT loop steps fit exactly into the TIME_LIMIT + 1 budget.
s, done = new_obs, is_done
prev_vel = s[1]

for t in range(TIME_LIMIT):
    # Never step an episode that has already finished.
    if done:
        break

    # Policy: push in the direction the car is already moving to build up
    # momentum; left of x = -1 always push right.
    if s[1] > 0 or s[0] < -1:
        action = actions['right']
    else:
        action = actions['left']

    # Exactly one environment step per loop iteration.
    s, r, done, _ = env.step(action)

    # A sign flip in velocity means the car has just turned around.
    if prev_vel * s[1] < 0:
        peak_cnt += 1
        print("--- reached peak")
    prev_vel = s[1]

    # draw game image on display
    # clear_output(True)
    # plt.imshow(env.render('rgb_array'))

# `done` also becomes True when the step limit cuts the episode short, so judge
# success by the final position rather than by `done` alone.
if s[0] > FLAG_POSITION:
    print("Well done!")
else:
    print("Time limit exceeded. Try again.")


initial observation code: [-0.42701333  0.        ]
taking action 2 (right)


new observation code: [-4.26727625e-01  2.85703169e-04]
reward: -1.0
is game over?: False


--- reached peak


--- reached peak


Well done!


In [6]:
# Final check: the last observed car position must lie beyond x = 0.47.
# The message reports where the car actually ended up when the check fails.
assert s[0] > 0.47, "car finished at x={:.3f}; expected x > 0.47".format(float(s[0]))
print("You solved it!")

You solved it!
