In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt

def get_epsilon_greedy_action(q_values, epsilon, action_n):
    """Sample an action index from the epsilon-greedy distribution over ``q_values``.

    Every action gets probability ``epsilon / action_n``; the first arg-max
    entry of ``q_values`` additionally receives ``1 - epsilon``.

    Args:
        q_values: Sequence of ``action_n`` action values for a single state.
        epsilon: Exploration rate; ``0`` always yields the arg-max action.
        action_n: Number of available actions.

    Returns:
        The sampled action index, drawn with ``np.random.choice``.
    """
    greedy_action = np.argmax(q_values)

    # Uniform exploration mass first, then the greedy bonus on top.
    probabilities = np.ones(action_n) * (epsilon / action_n)
    probabilities[greedy_action] += 1 - epsilon

    candidates = np.arange(action_n)
    return np.random.choice(candidates, p=probabilities)



In [2]:
# Select matplotlib's interactive "notebook" backend for the figures drawn below.
%matplotlib notebook

In [3]:
# Single environment instance shared by the training and evaluation cells.
env = gym.make('MountainCar-v0')
def desc_state(state):
    """Discretise a two-component observation into a pair of integer bins.

    Each component is divided by its scale (1.2 for ``state[0]``, 0.07 for
    ``state[1]``), rounded to one decimal, shifted by +1 and multiplied by 10.
    A component inside ``[-scale, scale]`` therefore lands in a bin 0..20.
    For MountainCar-v0 the two components are presumably position and
    velocity -- confirm against the environment's documentation.

    Args:
        state: Indexable observation whose first two entries are used.

    Returns:
        ``np.ndarray`` of dtype ``int32`` holding the two bin indices.
    """
    scales = (1.2, 0.07)
    shifted = []
    for component, scale in zip((state[0], state[1]), scales):
        normalised = np.round(component / scale, 1)
        shifted.append((normalised + 1) * 10)
    # Round again before the integer cast so that float noise such as
    # 0.9999999999999998 becomes 1 instead of being truncated to 0.
    return np.array(np.round(shifted, 1), dtype=np.int32)

def env_reset(env):
    """Reset ``env`` and return the initial state as a flat Q-table row index.

    Supports both the classic Gym API, where ``reset()`` returns the bare
    observation, and Gym >= 0.26 / Gymnasium, where it returns an
    ``(observation, info)`` tuple.

    Args:
        env: Environment exposing ``reset()``.

    Returns:
        ``21 * bin0 + bin1`` where ``(bin0, bin1)`` is ``desc_state`` applied
        to the initial observation.
    """
    result = env.reset()
    # Newer API: (observation, info). Classic API: the observation array itself.
    observation = result[0] if isinstance(result, tuple) else result
    bins = desc_state(observation)
    return 21 * bins[0] + bins[1]

def env_action(env,action):
    """Advance ``env`` by one step and return the discretised transition.

    Works with both step signatures: the classic 4-tuple
    ``(obs, reward, done, info)`` and the 5-tuple
    ``(obs, reward, terminated, truncated, info)`` of Gym >= 0.26 / Gymnasium.
    Only the observation and the reward are read from the result.

    The termination flag reported by the environment is deliberately ignored:
    ``done`` is True only once the first observation component exceeds 0.5,
    so the caller's own trajectory length decides when an unsuccessful
    episode stops.

    Args:
        env: Environment exposing ``step(action)``.
        action: Action forwarded to ``env.step``.

    Returns:
        ``(state, reward, done)`` where ``state`` is the flat index
        ``21 * bin0 + bin1`` obtained from ``desc_state``.
    """
    result = env.step(action)
    observation, reward = result[0], result[1]
    done = bool(observation[0] > 0.5)
    bins = desc_state(observation)
    state = 21 * bins[0] + bins[1]
    return state, reward, done

In [4]:
def SARSA(env, episode_n, gamma=0.9, trajectory_len=500, alpha=0.5):
    """Train a tabular SARSA agent on the discretised environment.

    The Q-table has ``21 * 21`` rows (flat indices produced by ``env_reset`` /
    ``env_action``) and 3 action columns. Exploration is epsilon-greedy with
    ``epsilon = 1 / (episode + 1)``.

    Args:
        env: Environment passed through to ``env_reset`` and ``env_action``.
        episode_n: Number of training episodes.
        gamma: Discount factor.
        trajectory_len: Maximum number of steps per episode.
        alpha: Learning rate.

    Returns:
        Tuple ``(total_rewards, cumulative_steps)``; both are float arrays of
        length ``episode_n``. ``total_rewards[i]`` is the undiscounted reward
        sum of episode ``i``. ``cumulative_steps[i]`` is the number of
        environment steps taken up to and including episode ``i``.
    """
    state_n = 21 * 21
    action_n = 3
    qfunction = np.zeros((state_n, action_n))

    total_rewards = np.zeros(episode_n)
    cumulative_steps = np.zeros(episode_n)
    steps_so_far = 0

    for episode in range(episode_n):
        epsilon = 1 / (episode + 1)

        state = env_reset(env)
        action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)
        for _ in range(trajectory_len):
            next_state, reward, done = env_action(env, action)
            next_action = get_epsilon_greedy_action(qfunction[next_state], epsilon, action_n)

            # Do not bootstrap from a terminal transition. The discretised bin
            # of a terminal observation also covers non-terminal positions, so
            # its Q-values are not guaranteed to be zero.
            next_q = 0.0 if done else qfunction[next_state][next_action]
            td_error = reward + gamma * next_q - qfunction[state][action]
            qfunction[state][action] += alpha * td_error

            state, action = next_state, next_action

            steps_so_far += 1
            total_rewards[episode] += reward

            if done:
                break

        cumulative_steps[episode] = steps_so_far

    return total_rewards, cumulative_steps

In [5]:
# Train SARSA: 2000 episodes of at most 1000 steps each.
total_rewards, episedes_total = SARSA(
    env,
    episode_n=2000,
    gamma=0.9,
    trajectory_len=1000,
    alpha=0.8,
)

In [6]:
# Learning curve: reward sum of each episode against the cumulative number of
# environment steps. An explicit figure keeps this plot from being drawn into
# whichever interactive figure happens to be current.
fig, ax = plt.subplots()
ax.plot(episedes_total, total_rewards)
ax.set_xlabel("Cumulative environment steps")
ax.set_ylabel("Total reward per episode")
ax.set_title("Training curve")
plt.show()

<IPython.core.display.Javascript object>

In [4]:
def QLearning(env,episode_n, trajectory_len, noisy_episode_n, gamma=0.9, t_max=500, alpha=0.5):
    """Train a tabular Q-learning agent on the discretised environment.

    The Q-table has ``21 * 21`` rows (flat indices produced by ``env_reset`` /
    ``env_action``) and 3 action columns. The behaviour policy is
    epsilon-greedy with ``epsilon = 0.9 / (episode / 300 + 1) + 0.1`` for the
    first ``noisy_episode_n`` episodes and fully greedy afterwards. Every
    100th episode a progress line is printed: fraction of episodes done, that
    episode's reward sum, and the current epsilon.

    Args:
        env: Environment passed through to ``env_reset`` and ``env_action``.
        episode_n: Number of training episodes.
        trajectory_len: Maximum number of steps per episode.
        noisy_episode_n: Episode index from which exploration is switched off.
        gamma: Discount factor.
        t_max: Unused; kept only so existing calls that pass it keep working.
        alpha: Learning rate.

    Returns:
        Tuple ``(total_rewards, cumulative_steps, qfunction)``. The first two
        are float arrays of length ``episode_n`` (reward sum per episode, and
        environment steps taken up to and including each episode);
        ``qfunction`` is the learned ``(441, 3)`` Q-table.
    """
    state_n = 21 * 21
    action_n = 3
    qfunction = np.zeros((state_n, action_n))

    total_rewards = np.zeros(episode_n)
    cumulative_steps = np.zeros(episode_n)
    steps_so_far = 0

    for episode in range(episode_n):
        if episode >= noisy_episode_n:
            epsilon = 0
        else:
            epsilon = 0.9 / (episode/300 + 1)+0.1

        state = env_reset(env)
        action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)
        for _ in range(trajectory_len):
            next_state, reward, done = env_action(env, action)

            # Off-policy target: best next value, taken directly instead of
            # sampling a one-hot distribution. No bootstrapping on the
            # terminal transition, because the terminal observation's bin is
            # shared with non-terminal positions and may hold non-zero values.
            best_next_q = 0.0 if done else max(qfunction[next_state])
            td_error = reward + gamma * best_next_q - qfunction[state][action]
            qfunction[state][action] += alpha * td_error

            # The behaviour action is chosen after the update, from the fresh table.
            state = next_state
            action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)

            steps_so_far += 1
            total_rewards[episode] += reward

            if done:
                break

        if episode % 100 == 0:
            print(episode/episode_n,total_rewards[episode], epsilon)

        cumulative_steps[episode] = steps_so_far

    return total_rewards, cumulative_steps, qfunction

In [None]:
# Train Q-learning: 100000 episodes of at most 500 steps, exploring during the
# first 80000. NOTE: ``t_max`` is accepted by QLearning but has no effect there.
total_rewards, episedes_total, q = QLearning(
    env,
    episode_n=100000,
    trajectory_len=500,
    noisy_episode_n=80000,
    gamma=1,
    t_max=1000,
    alpha=0.7,
)


0.0 -500.0 1.0
0.001 -500.0 0.775
0.002 -500.0 0.64
0.003 -500.0 0.55
0.004 -500.0 0.48571428571428577
0.005 -500.0 0.4375
0.006 -500.0 0.4
0.007 -500.0 0.37
0.008 -495.0 0.34545454545454546
0.009 -500.0 0.325
0.01 -399.0 0.3076923076923077
0.011 -500.0 0.2928571428571429
0.012 -500.0 0.28
0.013 -500.0 0.26875000000000004
0.014 -500.0 0.2588235294117647
0.015 -500.0 0.25
0.016 -500.0 0.24210526315789474
0.017 -500.0 0.23500000000000001
0.018 -500.0 0.2285714285714286
0.019 -500.0 0.22272727272727275
0.02 -500.0 0.2173913043478261
0.021 -271.0 0.21250000000000002
0.022 -369.0 0.20800000000000002
0.023 -368.0 0.20384615384615384
0.024 -460.0 0.2
0.025 -500.0 0.19642857142857142
0.026 -500.0 0.19310344827586207
0.027 -500.0 0.19
0.028 -398.0 0.1870967741935484
0.029 -419.0 0.184375
0.03 -212.0 0.18181818181818182
0.031 -500.0 0.17941176470588235
0.032 -391.0 0.17714285714285716
0.033 -500.0 0.175
0.034 -416.0 0.17297297297297298
0.035 -500.0 0.17105263157894737
0.036 -278.0 0.169230769230

In [306]:
# Moving average of the per-episode reward sums (up to 100 episodes wide).
# mode='valid' keeps only full windows; mode='same' would zero-pad the ends and
# pull the first and last ~50 points of the curve toward 0.
window = min(100, len(total_rewards))
smoothed_rewards = np.convolve(total_rewards, np.ones(window) / window, mode='valid')

fig, ax = plt.subplots()
# Each averaged value is plotted at the step count of the last episode in its window.
ax.plot(episedes_total[window - 1:], smoothed_rewards)
ax.set_xlabel("Cumulative environment steps")
ax.set_ylabel(f"Total reward per episode ({window}-episode moving average)")
ax.set_title("Smoothed training curve")
plt.show()

<IPython.core.display.Javascript object>

In [295]:
# One curve per action column of the learned Q-table, drawn in its own figure
# so it does not land on a previously active interactive plot.
fig, ax = plt.subplots()
for action_index in range(q.shape[1]):
    ax.plot(q[:, action_index], label=f"action {action_index}")
ax.set_xlabel("Discretised state index (21 * bin0 + bin1)")
ax.set_ylabel("Q-value")
ax.set_title("Learned Q-table")
ax.legend()
plt.show()

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x2161a63eaa0>]

In [302]:
# Greedy roll-out of the learned Q-table (epsilon = 0), rendered frame by frame
# and capped at 600 steps.
# NOTE: this rebinds ``total_rewards`` (the per-episode array returned by
# training) to a scalar; re-run the training cell before re-plotting curves.
state = env_reset(env)
action = get_epsilon_greedy_action(q[state], epsilon=0, action_n=3)
total_rewards = 0
for _ in range(600):
    next_state, reward, done = env_action(env, action)
    env.render()
    total_rewards += reward

    next_action = get_epsilon_greedy_action(q[next_state], epsilon=0, action_n=3)
    state, action = next_state, next_action
    if done:
        break

In [303]:
# Display the undiscounted reward sum accumulated by the greedy roll-out cell.
total_rewards

-283.0