In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt

def get_epsilon_greedy_action(q_values, epsilon, action_n):
    """Sample an action index from an epsilon-greedy distribution.

    Args:
        q_values: Action values of the current state, one entry per action.
        epsilon: Exploration weight; it is spread evenly over all actions.
        action_n: Number of available actions.

    Returns:
        An action index drawn with ``np.random.choice``. Each action has
        probability ``epsilon / action_n``; the first maximiser of
        ``q_values`` additionally receives ``1 - epsilon``.
    """
    greedy_action = np.argmax(q_values)
    uniform_share = epsilon / action_n
    probabilities = np.ones(action_n) * uniform_share
    probabilities[greedy_action] += 1 - epsilon
    candidates = np.arange(action_n)
    return np.random.choice(candidates, p=probabilities)



In [2]:
# Select matplotlib's interactive notebook backend for the figures plotted below.
%matplotlib notebook

In [3]:
# Global MountainCar-v0 environment passed to the helpers and training functions below.
env = gym.make('MountainCar-v0')
def desc_state(state):
    """Discretise a continuous (position, velocity) observation into grid bins.

    Each component is scaled (position by 1.2, velocity by 0.07), rounded to
    one decimal, shifted by +1 and multiplied by 10, so a component scaled
    into [-1, 1] lands on an integer bin in 0..20.

    Args:
        state: Sequence whose first two entries are position and velocity.

    Returns:
        An ``int32`` numpy array ``[position_bin, velocity_bin]``.
    """
    position_scaled = np.round(state[0] / 1.2, 1)
    velocity_scaled = np.round(state[1] / 0.07, 1)
    shifted = [(position_scaled + 1) * 10, (velocity_scaled + 1) * 10]
    # Round again before truncating so values like 10.999999 become 11, not 10.
    return np.round(shifted, 1).astype(np.int32)

def env_reset(env):
    """Reset ``env`` and return the initial observation as a flat state index.

    The observation is discretised with ``desc_state`` and the two bins are
    combined as ``21 * position_bin + velocity_bin``.

    NOTE(review): assumes ``env.reset()`` returns the bare observation
    (not an ``(obs, info)`` tuple) — confirm against the installed gym version.
    """
    position_bin, velocity_bin = desc_state(env.reset())
    return 21 * position_bin + velocity_bin

def env_action(env,action):
    """Apply ``action`` to ``env`` and return ``(state_index, reward, done)``.

    The ``done`` flag returned by ``env.step`` is discarded: the transition is
    reported as terminal only when the raw position exceeds 0.5. The
    observation is then discretised with ``desc_state`` and flattened as
    ``21 * position_bin + velocity_bin``.

    Returns:
        Tuple of the flat state index, the reward from ``env.step`` unchanged,
        and the position-based ``done`` flag.
    """
    observation, reward, _, _ = env.step(action)
    # Termination is decided from the raw position, before discretisation.
    done = True if observation[0] > 0.5 else False
    position_bin, velocity_bin = desc_state(observation)
    flat_state = 21 * position_bin + velocity_bin
    return flat_state, reward, done

In [111]:
def SARSA(env, episode_n, gamma=0.9, trajectory_len=500, alpha=0.5):
    """Train a tabular SARSA agent on the discretised environment.

    The Q-table has ``21 * 21`` states and 3 actions and starts at zero.
    Exploration is epsilon-greedy with ``epsilon = 1 / (episode + 1)``.

    Args:
        env: Environment driven through ``env_reset`` / ``env_action``.
        episode_n: Number of training episodes.
        gamma: Discount factor applied to the next state-action value.
        trajectory_len: Maximum number of steps per episode.
        alpha: Learning rate of the TD update.

    Returns:
        Tuple ``(total_rewards, cumulative_steps)``:
        ``total_rewards[i]`` is the undiscounted reward sum of episode ``i``;
        ``cumulative_steps[i]`` is the number of environment steps taken in
        episodes ``0..i`` inclusive.
    """
    total_rewards = np.zeros(episode_n)
    steps_per_episode = np.zeros(episode_n)

    state_n = 21*21
    action_n = 3
    qfunction = np.zeros((state_n, action_n))

    for episode in range(episode_n):
        epsilon = 1 / (episode + 1)

        state = env_reset(env)
        action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)
        for _ in range(trajectory_len):
            next_state, reward, done = env_action(env,action)
            next_action = get_epsilon_greedy_action(qfunction[next_state], epsilon, action_n)

            # A terminal transition has no future return. The terminal bin is
            # shared with non-terminal visits, so its Q-values are not
            # guaranteed to be zero and must not be bootstrapped from.
            next_value = 0.0 if done else qfunction[next_state][next_action]
            td_target = reward + gamma * next_value
            qfunction[state][action] += alpha * (td_target - qfunction[state][action])

            state = next_state
            action = next_action

            steps_per_episode[episode] += 1
            total_rewards[episode] += reward

            if done:
                break

    return total_rewards, np.cumsum(steps_per_episode)

In [112]:
# Train SARSA for 2000 episodes of at most 1000 steps each, undiscounted (gamma=1).
total_rewards,episedes_total = SARSA(env, episode_n=2000, trajectory_len=1000, gamma=1, alpha=0.8)

# x-axis: cumulative environment steps; y-axis: total reward of each episode.
plt.plot(episedes_total,total_rewards)
plt.show()

<IPython.core.display.Javascript object>

In [4]:
def QLearning(env,episode_n, trajectory_len, noisy_episode_n, gamma=0.9, t_max=500, alpha=0.5):
    """Train a tabular Q-learning agent on the discretised environment.

    The Q-table has ``21 * 21`` states and 3 actions and starts at zero.
    The behaviour policy is epsilon-greedy with ``epsilon = 0.1`` for the
    first ``noisy_episode_n`` episodes and fully greedy (``epsilon = 0``)
    afterwards. Progress is printed every 100 episodes.

    Args:
        env: Environment driven through ``env_reset`` / ``env_action``.
        episode_n: Number of training episodes.
        trajectory_len: Maximum number of steps per episode.
        noisy_episode_n: Number of initial episodes that use exploration.
        gamma: Discount factor applied to the greedy next-state value.
        t_max: Unused; kept only so existing callers keep working. The
            episode length is bounded by ``trajectory_len``.
        alpha: Learning rate of the TD update.

    Returns:
        Tuple ``(total_rewards, cumulative_steps, qfunction)``:
        ``total_rewards[i]`` is the undiscounted reward sum of episode ``i``;
        ``cumulative_steps[i]`` is the number of environment steps taken in
        episodes ``0..i`` inclusive; ``qfunction`` is the learned Q-table of
        shape ``(441, 3)``.
    """
    total_rewards = np.zeros(episode_n)
    steps_per_episode = np.zeros(episode_n)

    state_n = 21*21
    action_n = 3
    qfunction = np.zeros((state_n, action_n))

    for episode in range(episode_n):
        epsilon = 0.1
        if episode >= noisy_episode_n:
            epsilon = 0

        state = env_reset(env)
        action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)
        for _ in range(trajectory_len):
            next_state, reward, done = env_action(env,action)

            # Off-policy target: greedy value of the next state. On a terminal
            # transition there is no future return; the terminal bin is shared
            # with non-terminal visits, so it must not be bootstrapped from.
            next_value = 0.0 if done else max(qfunction[next_state])
            qfunction[state][action] += alpha * (reward + gamma * next_value - qfunction[state][action])

            # The behaviour action is chosen after the update, from the refreshed table.
            state = next_state
            action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)

            steps_per_episode[episode] += 1
            total_rewards[episode] += reward

            if done:
                break

        if episode % 100 == 0:
            print(episode/episode_n,total_rewards[episode], epsilon)

    return total_rewards, np.cumsum(steps_per_episode), qfunction

In [None]:
# Train Q-learning: 10000 episodes of at most 500 steps, exploring during the first 8000.
# NOTE: QLearning's body never reads t_max, so t_max=1000 has no effect here.
total_rewards,episedes_total,q  = QLearning(env, episode_n=10000, trajectory_len=500, noisy_episode_n=8000, t_max=1000, gamma=0.99, alpha=0.5)


0.0 -500.0 0.1
0.01 -500.0 0.1
0.02 -353.0 0.1
0.03 -205.0 0.1
0.04 -367.0 0.1
0.05 -393.0 0.1
0.06 -315.0 0.1
0.07 -500.0 0.1
0.08 -396.0 0.1
0.09 -378.0 0.1


In [306]:
# Plot a 100-episode moving average of the episode rewards against cumulative steps.
# NOTE: mode='same' keeps the output length but the window only partially overlaps
# the data at both ends, so roughly the first and last 50 points are distorted.
plt.plot(episedes_total,np.convolve(total_rewards,np.ones(100)/100,mode='same'))
plt.show()

<IPython.core.display.Javascript object>

In [295]:
# Plot the learned Q-value of each of the three actions over the flat state index.
plt.plot(q[:,0])
plt.plot(q[:,1])
plt.plot(q[:,2])

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x2161a63eaa0>]

In [302]:
# Roll out one rendered episode acting greedily (epsilon = 0) on the Q-table `q`.
state = env_reset(env)
action = get_epsilon_greedy_action(q[state], epsilon = 0, action_n = 3)
# NOTE: this rebinds `total_rewards` (also used for the per-episode reward array
# returned by the training cells) to a scalar running sum for this rollout.
total_rewards = 0
# Stop after at most 600 steps if the goal is never reached.
for _ in range(600):
    next_state, reward, done = env_action(env,action)
    env.render()
    
    next_action = get_epsilon_greedy_action(q[next_state], epsilon = 0, action_n = 3)
    state = next_state
    action = next_action
    total_rewards += reward
    if done:
        break

In [303]:
# Show the undiscounted reward sum accumulated by the greedy rollout.
total_rewards

-283.0