# RL interface

## Goal:

- Familiarize yourself with the programming interface

In [None]:
import gym
import chula_rl as rl
import os
import numpy as np
import random
from collections import deque, defaultdict

In [None]:
def make_env():
    """Build the Gridworld environment used throughout this notebook.

    The base environment is a Gridworld of shape (4, 3) with the start at
    (2, 0), the goal at (1, 2), and a reward of -1 for every move. It is
    wrapped so that episodes are clipped (``n_max_length=10``) and then
    wrapped again with ``EpisodeSummary``.

    Returns:
        The fully wrapped environment.
    """
    grid = rl.env.Gridworld(
        shape=(4, 3),
        start=(2, 0),
        goal=(1, 2),
        move_reward=-1,
    )
    # Wrapper order matters: the clipping wrapper sits inside the summary one.
    clipped = rl.env.wrapper.ClipEpisodeLength(grid, n_max_length=10)
    return rl.env.wrapper.EpisodeSummary(clipped)

## Here is our gridworld environment

In [None]:
# Build a fresh environment, reset it to its start state, and draw the grid.
env = make_env()
env.reset()
env.render()
# Legend for the numbers that appear in the rendered map.
print('Map: -1 = trap, 1 = goal, 2 = current')

You start at (2, 0); the goal is at (1, 2), where there is a reward of 5.

There is no extra penalty for hitting a wall; you just bounce off it.

### Possible actions:

In [None]:
# Show what kind of action space the environment exposes ...
print('kind of action space:', env.action_space)
# ... and how many distinct actions it contains (its `n` attribute).
print('number of possible actions:', env.action_space.n)

So an action is simply an integer from 0 to 3.

Here:
- 0 = up
- 1 = left
- 2 = down
- 3 = right

## Running in the environment

In [None]:
s = env.reset()
env.render()
print('current state:', s)
# Take action 0 (up) three times in a row. On the third attempt you won't
# move anywhere, but you still receive the -1 move reward.
for action in (0, 0, 0):
    s, r, done, info = env.step(action)
    env.render()
    print('current state:', s, '| reward:', r, '| is done:', done)

A reward of -1 for every move encourages you to reach the goal quickly.

### If we reach a trap we get a negative reward; if we reach the goal we get a positive reward

In [None]:
s = env.reset()
env.render()
print('current state:', s)
# Sequence of actions that passes through the trap before reaching the goal.
through_trap = (
    3,  # right
    0,  # up, fallen into the pit (-5 reward)
    3,  # right, reached the goal state, receiving (5 reward)
)
for action in through_trap:
    s, r, done, info = env.step(action)
    env.render()
    print('current state:', s, '| reward:', r, '| is done:', done)

### An optimal path

In [None]:
s = env.reset()
env.render()
print('current state:', s)
# Sequence of actions that goes around the trap straight to the goal.
optimal_path = (
    3,  # right
    3,  # right
    0,  # up, reached the goal state, receiving reward of 5 (also -1)
)
for action in optimal_path:
    s, r, done, info = env.step(action)
    env.render()
    print('current state:', s, '| reward:', r, '| is done:', done)

# Random agent

`action_space` has a `sample()` method for uniformly sampling an action.

In [None]:
# Draw one random action. Being the cell's last expression, its value is
# what the notebook displays.
env.action_space.sample()

Run many steps until termination (either reaching the goal, or being clipped)

In [None]:
s = env.reset()
done = False
# Keep taking random actions until the environment reports the episode is over.
while not done:
    a = env.action_space.sample()
    s, r, done, info = env.step(a)
print(info)  # summary of the episode

# Questions

Create a new cell below each question to type in the answer.

## Q1: Why do we want to clip the episode length?

Describe here ...

## Q2: What is the average episode length without clipping?
Hint: conduct an experiment to verify.