# Starting by going through Gym tutorial.

We're looking to understand the agent system through Gym to later be able to implement custom environments to interact with.

In [11]:
import gym
import numpy as np
from IPython.display import clear_output

## Rendering an agent taking random action.

In [2]:
# Basic setup
env = gym.make('MountainCar-v0')
env.reset()

for _ in range(1000):
    env.render()
    env.step(env.action_space.sample()) # take a random action
env.close()

## `env.step()` returns:
- `observation` object ("game state")
- reward (float)
- done (bool)
- info (dict)

In [6]:
# Try and complete challenge with random actions.
env = gym.make('MountainCar-v0')

for i_episode in range(20):
    observation = env.reset()
    for t in range(1,100+1):
        env.render()
        
        clear_output(wait=True) # clear output and only show last observation
        print(observation)
        action = env.action_space.sample() # choose action
        observation, reward, done, info = env.step(action) # take step
        
        if done:
            print(f'Episode complete after {t} timesteps.')
            break
env.close()

[-0.57012457  0.00546369]


## Underbelly of `env`

In [7]:
env = gym.make('MountainCar-v0')
print(f'Action Space:\n\t{env.action_space}')
print(f'Observation Space:\n\t{env.observation_space}')

Action Space:
	Discrete(3)
Observation Space:
	Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)


In [8]:
print(f'Observation Space Bounds')
print(f'High\t', env.observation_space.high)
print(f'Low\t', env.observation_space.low)

Observation Space Bounds
High	 [0.6  0.07]
Low	 [-1.2  -0.07]


## Controlling the action

In [9]:
env = gym.make('MountainCar-v0')
env.reset()

done = False
while not done:
    action = 2 # always choose to move the car to the right
    observation, reward, done, info = env.step(action)
    env.render()
env.close()

# Training an agent

In [57]:
env = gym.make('MountainCar-v0')
env.reset()

# Create a q-table.
obs_size = [20, 20] # somewhat arbitrary
win_size = (env.observation_space.high - env.observation_space.low) / obs_size # create discrete vars from continuous vars

q_table = np.random.uniform(low=-2, high=0, size=(obs_size + [env.action_space.n])) # all rewards are -1 unless you reach the goal
q_table.shape

(20, 20, 3)

In [58]:
def get_discrete_state(state):
    discrete_state = (state - env.observation_space.low) / win_size
    return (round(discrete_state[0]), round(discrete_state[1]))

test_state = get_discrete_state(env.reset())
print('Discrete State:\t', test_state)
print('Q-Table Entry:\t', q_table[test_state])
print('Max Q-value at State:\t', np.argmax(q_table[test_state]))

Discrete State:	 (9, 10)
Q-Table Entry:	 [-0.17913958 -1.76265802 -0.72261911]
Max Q-value at State:	 0


In [59]:
learning_rate = 0.1
discount = 0.95 # measure of how we value future reward vs current reward
episodes = 5_000
show_every = 500

In [60]:
for episode in range(episodes):
    if episode % show_every == 0:
        clear_output(wait=True)
        print(episode)
        render = True
    else:
        clear_output(wait=True)
        print(episode)
        render = False
    discrete_state = get_discrete_state(env.reset())

    done = False
    while not done:
        action = np.argmax(q_table[discrete_state])
        observation, reward, done, info = env.step(action)
        new_discrete_state = get_discrete_state(observation)
        if render:
            env.render()

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q_val = q_table[discrete_state + (action,)]

            # Calculating new q_values
            new_q = (1 - learning_rate) * current_q_val + learning_rate * (reward + discount * max_future_q)
            
            q_table[discrete_state + (action,)] = new_q
        elif observation[0] >= env.goal_position:
            q_table[discrete_state + (action,)] = 0 # Reward for completion.

        discrete_state = new_discrete_state

    env.close()

4999
