In [None]:
# Environment setup: refresh the apt package index, install the xvfb and
# x11-utils system packages, then install version-pinned Python packages.
# All command output is redirected to /dev/null, so a failed install will not
# be visible in this cell's output.
!sudo apt-get update >& /dev/null
!apt-get install -y xvfb x11-utils >& /dev/null
!pip install gym==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.* >& /dev/null

In [None]:
# Hyperparameters read as globals by the functions and the training loop below.
n_states = 36      # bins per observation dimension (discretisation grid and Q-table size)
iter_max = 10000   # number of training episodes run by the training loop
initial_lr = 1.0   # learning rate at the start of training
min_lr = 0.003     # floor below which the decayed learning rate never drops
gamma = 1          # discount factor used in the Q-update target and in exec_episodes
eps_max = 10000    # upper bound on the number of steps taken in a single episode
eps = 0.02         # random-action (exploration) probability; the training loop's threshold has this same value

In [None]:
def exec_episodes(env, policy=None, render=False):
  """Run one episode in `env` and return its discounted return.

  Args:
    env: environment exposing `reset()`, `step(action)`, `render()` and
      `action_space.sample()`.
    policy: optional table indexed as `policy[pos][vel]` that yields the action
      for a discretised state. When None, actions are sampled from
      `env.action_space`.
    render: when truthy, `env.render()` is called before every step.

  Returns:
    The sum over steps of `gamma ** t * reward`, where `t` is the 0-based step
    index. The episode stops when `env.step` reports done or after `eps_max`
    steps, whichever comes first (`gamma` and `eps_max` are notebook globals).
  """
  observation = env.reset()
  discounted_return = 0
  for t in range(eps_max):
    if render:
      env.render()
    if policy is not None:
      # Look the action up in the table using the discretised observation.
      pos_bin, vel_bin = get_state_vals(env, observation)
      chosen = policy[pos_bin][vel_bin]
    else:
      chosen = env.action_space.sample()
    observation, reward, done, _ = env.step(chosen)
    discounted_return += gamma ** t * reward
    if done:
      break
  return discounted_return

In [None]:
!mkdir ./video

In [None]:
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

# Create and start a non-visible (visible=0) virtual display of 1368x768.
# Presumably this gives env.render() / the video recorder an X display on a
# headless machine (xvfb is installed in the setup cell) — confirm.
# NOTE: this binds the notebook-level name `display` to the virtual display
# object; IPython's display module is imported as `ipythondisplay` above and
# that alias is what show_video uses.
display = Display(visible=0, size=(1368, 768))
display.start()

In [None]:
def show_video():
  """Embed one recorded episode video inline in the notebook output.

  Searches the `video` directory (relative to the current working directory)
  for `*.mp4` files. If any exist, the first path returned by `glob` is read,
  base64-encoded and displayed as an HTML `<video>` element. Otherwise the
  message "Could not find video" is printed.

  Returns:
    None.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return
  # glob does not guarantee any ordering, so "first" is simply whichever file
  # it happens to list first, not necessarily the earliest or latest episode.
  mp4 = mp4list[0]
  # Read-only binary mode inside a context manager: the handle is always
  # closed, and no write permission on the file is required.
  with open(mp4, 'rb') as video_file:
    video = video_file.read()
  encoded = base64.b64encode(video)
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay loop controls style="height: 256px;"> <source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''.format(encoded.decode('ascii'))))

In [None]:
import gym
from gym.wrappers import Monitor

def wrap_env(env):
  """Return `env` wrapped in a gym `Monitor` that records into './video'.

  The monitoring wrapper records the environment's output and saves it as mp4
  files in that directory. `force=True` is forwarded to `Monitor`; in gym this
  presumably lets it replace recordings left by an earlier run — confirm
  against the installed gym version.
  """
  return Monitor(env, './video', force=True)

# Build the MountainCar-v0 environment and wrap it with the recording monitor.
env = wrap_env(gym.make('MountainCar-v0'))
# Seed the environment. NOTE(review): NumPy's global RNG, which the training
# loop uses for action selection, is not seeded in the visible cells, so runs
# are not expected to be reproducible.
env.seed(16)

# Show the action and observation spaces of the environment.
print('Action space for Mountain car env: '+str(env.action_space))
print('Observation space for Mountain car env: '+str(env.observation_space))

In [None]:
def get_state_vals(env, obs, n_bins=None):
  """Map a continuous observation to discrete (position, velocity) bin indices.

  Each of the first two observation components is placed on a uniform grid of
  `n_bins` cells spanning `env.observation_space.low` to
  `env.observation_space.high`.

  Args:
    env: environment exposing `observation_space.low` and
      `observation_space.high` as indexable arrays.
    obs: observation; `obs[0]` is treated as position and `obs[1]` as velocity.
    n_bins: number of bins per dimension. Defaults to the notebook-level
      `n_states` when omitted, which is the original behaviour.

  Returns:
    Tuple `(pos, vel)` of Python ints, each in `[0, n_bins - 1]`.
  """
  if n_bins is None:
    n_bins = n_states
  env_low = env.observation_space.low
  env_high = env.observation_space.high
  env_dx = (env_high - env_low)/n_bins
  pos = int((obs[0] - env_low[0])/env_dx[0])
  vel = int((obs[1] - env_low[1])/env_dx[1])
  # A value sitting exactly on the upper bound divides out to n_bins, which is
  # one past the last valid index of a table with n_bins cells per dimension.
  # Clamp both ends so the result can always be used as an index.
  last_bin = n_bins - 1
  return min(max(pos, 0), last_bin), min(max(vel, 0), last_bin)

In [None]:
import numpy as np

# Q-value table indexed as [position bin][velocity bin][action], all zeros at start.
# The action dimension is hard-coded to 3 — presumably env.action_space.n for
# MountainCar-v0; confirm if the environment is changed.
q_table = np.zeros((n_states, n_states, 3))

In [None]:
def eval_policy(env, n_episodes=100):
  """Estimate the mean return of the greedy policy derived from `q_table`.

  The greedy policy selects, for every (position bin, velocity bin) cell of the
  notebook-level `q_table`, the action with the largest Q-value (argmax over
  axis 2). That policy is then run for `n_episodes` episodes without rendering.

  Args:
    env: environment passed through to `exec_episodes`.
    n_episodes: number of evaluation episodes to average over. Defaults to 100,
      the value that was previously hard-coded.

  Returns:
    The mean of the per-episode returns reported by `exec_episodes`.
  """
  greedy_policy = np.argmax(q_table, axis=2)
  episode_returns = [exec_episodes(env, greedy_policy, False) for _ in range(n_episodes)]
  return np.mean(episode_returns)

In [None]:
# Tabular Q-learning: run `iter_max` episodes, updating the global `q_table`
# in place after every environment step.
for i in range(iter_max):
  obs = env.reset()
  # Learning rate: starts at initial_lr, is multiplied by 0.85 every 100
  # episodes, and is never allowed to drop below min_lr.
  eta = max(min_lr, initial_lr*(0.85**(i//100)))
  for j in range(eps_max):
    pos, vel = get_state_vals(env, obs)
    # With probability `eps` (the config constant, previously a repeated
    # literal) pick a uniformly random action; otherwise sample an action from
    # the softmax of the current state's Q-values.
    if np.random.uniform(0, 1) < eps:
      action = np.random.choice(env.action_space.n)
    else:
      logits = q_table[pos][vel]
      # Subtracting the max before exponentiating leaves the softmax
      # probabilities mathematically unchanged but keeps the largest term at
      # exp(0) = 1, so the normaliser can never underflow to 0 (which would
      # produce NaN probabilities and make np.random.choice raise).
      logits_exp = np.exp(logits - np.max(logits))
      probs = logits_exp / np.sum(logits_exp)
      action = np.random.choice(env.action_space.n, p=probs)
    obs, reward, done, _ = env.step(action)

    # Q-learning update toward reward + gamma * (best Q-value of the next state).
    pos_, vel_ = get_state_vals(env, obs)
    q_table[pos][vel][action] = q_table[pos][vel][action] +\
      eta * (reward+gamma*np.max(q_table[pos_][vel_])-q_table[pos][vel][action])
    if done:
      break
  # Progress message once every 1000 episodes (reported 1-based).
  if i % 1000 == 0:
    print('Iteration: %d has been completed.'%(i+1))

In [None]:
# Display one of the mp4 recordings found in the video directory, if any exist.
show_video()

In [None]:
# Mean return of the greedy policy derived from q_table, averaged over the
# evaluation episodes; shown as this cell's output.
eval_policy(env)

In [None]:
# Archive the recorded videos into a zip file and hand it to the browser as a download.
# NOTE(review): the absolute /content/... paths and the google.colab import
# assume this notebook is running in Google Colab.
!zip -r /content/file.zip /content/video/

from google.colab import files
files.download("/content/file.zip")