In [1]:
import gym
import numpy as np

from IPython.display import clear_output

In [2]:
# Build the MountainCar-v0 environment; every later cell shares this one instance.
env = gym.make("MountainCar-v0")

In [3]:
# Show the bounds of the continuous observation space; the discretisation
# cell below derives its bin widths from these low/high arrays.
print(f"Observation Space Low : {env.observation_space.low}")
print(f"Observation Space High: {env.observation_space.high}")

# The action space's `n` later sizes the last axis of the Q-table.
print(f"\nAction Space: {env.action_space}")

Observation Space Low : [-1.2  -0.07]
Observation Space High: [0.6  0.07]

Action Space: Discrete(3)


In [4]:
# Number of bins per observation dimension. Kept as a plain list because
# QLearning builds the Q-table shape with list concatenation.
discrete_os_shape = [18, 14]

# Width of a single bin along each dimension: observation range / bin count.
discrete_os_win_size = (
    np.abs(env.observation_space.high - env.observation_space.low)
    / np.asarray(discrete_os_shape)
)

print(f"Discrete OS Window Size: {discrete_os_win_size}")

Discrete OS Window Size: [0.1  0.01]


In [5]:
def getDiscreteState(state):
    """Map a continuous observation to a tuple of integer bin indices.

    The offset of ``state`` from ``env.observation_space.low`` is divided by
    ``discrete_os_win_size`` and truncated to an integer, giving one index per
    observation dimension.

    An observation lying exactly on the upper bound divides out to
    ``discrete_os_shape[i]``, which is one past the last valid bin and would
    raise ``IndexError`` when used to index the Q-table. The indices are
    therefore clamped to ``[0, discrete_os_shape[i] - 1]``; observations
    strictly inside the bounds map to the same bins as before.

    Args:
        state: Observation array with one entry per observation dimension.

    Returns:
        Tuple of integer bin indices, one per observation dimension.
    """
    scaled = (state - env.observation_space.low) / discrete_os_win_size

    # Highest legal index along each axis of the discretised observation grid.
    last_bin = np.asarray(discrete_os_shape) - 1

    return tuple(np.clip(scaled.astype(int), 0, last_bin))

In [6]:
def getEpsilonGreedyPolicy(Q, epsilon, nA):
    """Build an epsilon-greedy policy function backed by ``Q``.

    Args:
        Q: Indexable mapping from a state to its per-action values. It is
            captured by reference, so later updates to ``Q`` are seen by the
            returned policy.
        epsilon: Probability mass spread uniformly over all ``nA`` actions.
        nA: Number of actions.

    Returns:
        A function ``policy(state)`` returning a length-``nA`` array of action
        probabilities: every action gets ``epsilon / nA`` and the action with
        the highest value in ``Q[state]`` gets an extra ``1 - epsilon``.
    """
    def policy(state):
        # Start from the uniform exploration share for every action.
        probabilities = np.full(nA, epsilon / nA)
        # Give the remaining mass to the greedy action (first index on ties).
        probabilities[np.argmax(Q[state])] += 1 - epsilon
        return probabilities

    return policy

In [7]:
def QLearning(env, num_episodes, discount=1.0, alpha=0.1, epsilon=0.1):
    """Run tabular Q-learning on ``env`` over a discretised observation grid.

    Actions are sampled from an epsilon-greedy policy over the current
    Q-table, while the update bootstraps from the greedy value of the next
    state. Each episode runs for at most 200 steps. Progress is printed after
    every episode, replacing the previous cell output.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple ``(observation, reward, done, info)``, ``close()`` and
            ``action_space.n``.
        num_episodes: Number of episodes to run.
        discount: Discount factor applied to the next state's greedy value.
        alpha: Learning rate for the temporal-difference update.
        epsilon: Exploration mass passed to the epsilon-greedy policy.

    Returns:
        A pair ``(Q, episode_finished)``. ``Q`` is the learned table of shape
        ``discrete_os_shape + [n_actions]``. ``episode_finished`` lists
        ``[episode_index, total_reward]`` for every episode that reported
        ``done`` with a total reward above -200.

    Note:
        ``env.close()`` is called before returning.
    """
    n_actions = env.action_space.n
    Q = np.zeros(discrete_os_shape + [n_actions])
    policy = getEpsilonGreedyPolicy(Q, epsilon, n_actions)

    # The candidate actions never change, so build the list once.
    action_pos = list(range(n_actions))

    episode_rewards = np.zeros(num_episodes)
    episode_finished = []

    for i_episode in range(num_episodes):
        discrete_state = getDiscreteState(env.reset())

        for _ in range(200):
            # To watch training, render here on selected episodes, e.g.:
            # if (i_episode + 1) % 5000 == 0:
            #     env.render()

            action = np.random.choice(action_pos, p=policy(discrete_state))

            next_state, reward, done, _ = env.step(action)
            discrete_next_state = getDiscreteState(next_state)

            episode_rewards[i_episode] += reward

            # Off-policy target: bootstrap from the best next-state value.
            td_target = reward + discount * Q[discrete_next_state].max()

            # Indexing with the state tuple yields a view into Q, so the
            # in-place update below writes straight into the table.
            action_values = Q[discrete_state]
            action_values[action] += alpha * (td_target - action_values[action])

            if done:
                if episode_rewards[i_episode] > -200:
                    episode_finished.append([i_episode, episode_rewards[i_episode]])
                break

            discrete_state = discrete_next_state

        clear_output(True)
        print(f"[{i_episode + 1:>5}/{num_episodes:>5}] Episode Total Reward: {episode_rewards[i_episode]}")

    env.close()

    return Q, episode_finished

In [8]:
# Train for 10000 episodes with a small learning rate and almost no
# exploration; the discount factor stays at QLearning's default of 1.0.
Q, episode_finished = QLearning(env, 10000, alpha=0.05, epsilon=0.0005)

[10000/10000] Episode Total Reward: -112.0


In [9]:
# Report the last five recorded episodes, using 1-based episode numbers.
print("Episodes Finished:")
for episode_index, total_reward in episode_finished[-5:]:
    print(f"[{episode_index + 1:>5}] Total Reward: {total_reward}")

Episodes Finished:
[ 9996] Total Reward: -110.0
[ 9997] Total Reward: -161.0
[ 9998] Total Reward: -160.0
[ 9999] Total Reward: -112.0
[10000] Total Reward: -112.0


In [10]:
def playEnvOnce(env, Q):
    """Render one episode in ``env`` while acting greedily on ``Q``.

    At each of at most 200 steps the environment is rendered and the action
    with the highest value in ``Q`` for the current discretised state is
    taken. The loop ends early when the environment reports ``done``, and the
    environment is closed afterwards.

    Args:
        env: Environment exposing ``reset()``, ``render()``, ``step(action)``
            returning a 4-tuple, and ``close()``.
        Q: Q-table indexed by the tuple returned from ``getDiscreteState``.
    """
    current = getDiscreteState(env.reset())

    for _ in range(200):
        env.render()

        # Purely greedy: no exploration during playback.
        greedy_action = np.argmax(Q[current])

        observation, _reward, finished, _info = env.step(greedy_action)
        upcoming = getDiscreteState(observation)

        if finished:
            break

        current = upcoming

    env.close()

In [11]:
# Render a single greedy episode using the Q-table learned above.
playEnvOnce(env, Q)