In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt

def get_epsilon_greedy_action(q_values, epsilon, action_n):
    """Sample an action index from the epsilon-greedy distribution over ``q_values``.

    Every action gets probability ``epsilon / action_n``; the action with the
    largest Q-value (the first one on ties, as chosen by ``np.argmax``)
    additionally gets ``1 - epsilon``. The draw uses NumPy's global RNG.
    """
    probabilities = np.full(action_n, epsilon / action_n)
    greedy = np.argmax(q_values)
    probabilities[greedy] = probabilities[greedy] + (1 - epsilon)
    candidates = np.arange(action_n)
    return np.random.choice(candidates, p=probabilities)



In [2]:
%matplotlib notebook

In [3]:
# Shared environment instance used by every training and rollout cell below.
env = gym.make("MountainCar-v0")
def desc_state(state):
    """Discretize the first two components of ``state`` into integer grid coordinates.

    Component 0 is divided by 0.6 and component 1 by 0.035; each quotient is
    rounded to one decimal, shifted by 2 and multiplied by 10. The result is
    returned as a length-2 ``int32`` array.
    """
    first = (np.round(state[0] / 0.6, 1) + 2) * 10
    second = (np.round(state[1] / 0.035, 1) + 2) * 10
    # Round once more so floating-point residue (e.g. 22.999...) is not
    # truncated to the wrong cell by the integer cast.
    return np.array(np.round([first, second], 1), dtype=np.int32)

def env_reset(env):
    """Reset ``env`` and return the flat index of the discretized start state.

    The observation is discretized with ``desc_state`` and the two grid
    coordinates are combined as ``41 * first + second``.

    Works with both reset conventions: older gym returns the observation
    alone, newer gym/gymnasium return an ``(observation, info)`` tuple.
    """
    result = env.reset()
    observation = result[0] if isinstance(result, tuple) else result
    cell = desc_state(observation)
    return 41 * cell[0] + cell[1]

def env_action(env,action):
    """Apply ``action`` to ``env`` and return ``(state_index, reward, done)``.

    ``state_index`` is the flat index ``41 * first + second`` of the
    discretized observation (see ``desc_state``).

    The termination flag reported by ``env.step`` is deliberately ignored:
    ``done`` is True only when the first observation component exceeds 0.5.
    Presumably this lets episodes run past gym's built-in step limit --
    confirm before changing.

    The step result is read by position, so both the 4-tuple
    ``(obs, reward, done, info)`` and the 5-tuple
    ``(obs, reward, terminated, truncated, info)`` conventions are accepted.
    """
    step_result = env.step(action)
    observation, reward = step_result[0], step_result[1]
    done = bool(observation[0] > 0.5)
    cell = desc_state(observation)
    return 41 * cell[0] + cell[1], reward, done

def build_map(policy, states=None, actions=None):
    """Render a 41x41 greedy-action image from a tabular policy / Q-table.

    Parameters
    ----------
    policy : array-like, indexable by flat state ``i * 41 + j``
        Row ``i * 41 + j`` holds the per-action scores of grid cell ``(i, j)``.
        The channel with the highest score is set to 1 for that cell.
    states : sequence of int, optional
        Flat state indices to highlight. Each listed cell is painted white
        (all channels 1).
    actions : sequence, optional
        When given together with ``states``, the cells of the zipped
        ``(state, action)`` pairs are painted black (all channels 0) instead.
        The action values themselves are not used.

    Returns
    -------
    numpy.ndarray of shape (41, 41, 3)
        Image whose row axis is the second grid coordinate ``j`` in reversed
        order (largest ``j`` in row 0) and whose column axis is the first
        grid coordinate ``i``.
    """
    grid_n = 41
    # None instead of mutable list defaults, which are shared across calls.
    states = [] if states is None else states
    actions = [] if actions is None else actions

    res = np.zeros((grid_n, grid_n, 3))
    table = np.asarray(policy)[:grid_n * grid_n].reshape(grid_n, grid_n, -1)
    best = np.argmax(table, axis=2)
    rows, cols = np.indices((grid_n, grid_n))
    res[rows, cols, best] = 1

    for state in states:
        i, j = divmod(state, grid_n)
        res[i, j, :] = 1

    # zip() yields nothing when either sequence is empty, so no length guard is needed.
    for state, _ in zip(states, actions):
        i, j = divmod(state, grid_n)
        res[i, j, :] = 0

    # Put the first coordinate on the x-axis and flip rows so larger j is higher up.
    return res.transpose((1, 0, 2))[::-1, :, :]

In [11]:
def SARSA(env, episode_n, gamma=0.9, trajectory_len=500, alpha=0.5, q=np.array([])):
    """Tabular SARSA on the 41x41 discretized state grid with 3 actions.

    Parameters
    ----------
    env : environment passed through to ``env_reset`` / ``env_action``.
    episode_n : int
        Number of training episodes.
    gamma : float
        Discount factor.
    trajectory_len : int
        Maximum number of steps per episode.
    alpha : float
        Learning rate; divided by 3 once, at episode ``4 * episode_n // 5``.
    q : numpy.ndarray, optional
        Initial Q-table. It is copied, never modified. An empty array (the
        default) or ``None`` starts from zeros.

    Exploration schedule: epsilon starts at 1 and drops by ``2 / episode_n``
    at the start of every episode, is reset to 0.5 at episode
    ``episode_n // 2``, and never goes below 0.001.

    Returns
    -------
    (total_rewards, cumulative_steps, qfunction)
        ``total_rewards[k]`` is the undiscounted reward of episode ``k``.
        ``cumulative_steps[k]`` is the number of environment steps taken up to
        and including episode ``k``. ``qfunction`` is the learned table of
        shape ``(41 * 41, 3)``.
    """
    total_rewards = np.zeros(episode_n)
    # Slot 0 stays 0 so slot k + 1 can hold the running total after episode k.
    steps_total = np.zeros(episode_n + 1)

    state_n = 41 * 41
    action_n = 3
    if q is not None and q.shape[0]:
        qfunction = q.copy()
    else:
        qfunction = np.zeros((state_n, action_n))

    epsilon = 1
    for episode in range(episode_n):
        epsilon -= 2 / episode_n
        if episode == episode_n // 2:
            epsilon = 0.5
        if epsilon <= 0.001:
            epsilon = 0.001
        if episode == 4 * episode_n // 5:
            alpha /= 3

        state = env_reset(env)
        action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)
        for _ in range(trajectory_len):
            next_state, reward, done = env_action(env, action)
            next_action = get_epsilon_greedy_action(qfunction[next_state], epsilon, action_n)

            # A terminal transition has no future return. The grid cell holding the
            # goal also holds non-terminal positions, so its Q-values are not
            # guaranteed to be zero and must not be bootstrapped from.
            if done:
                target = reward
            else:
                target = reward + gamma * qfunction[next_state][next_action]
            qfunction[state][action] += alpha * (target - qfunction[state][action])

            state = next_state
            action = next_action

            steps_total[episode + 1] += 1
            total_rewards[episode] += reward

            if done:
                break
        if episode % 100 == 0:
            print(episode/episode_n,total_rewards[episode], epsilon)

        steps_total[episode + 1] += steps_total[episode]

    return total_rewards, steps_total[1:], qfunction

In [12]:
# Train SARSA: 20000 episodes of at most 1000 steps each.
total_rewards_s, episedes_total_s, q_s = SARSA(
    env,
    episode_n=20000,
    trajectory_len=1000,
    gamma=0.99,
    alpha=0.2,
)


0.0 -1000.0 0.9999
0.005 -1000.0 0.9899000000000011
0.01 -1000.0 0.9799000000000022
0.015 -1000.0 0.9699000000000033
0.02 -1000.0 0.9599000000000044
0.025 -1000.0 0.9499000000000055
0.03 -1000.0 0.9399000000000066
0.035 -1000.0 0.9299000000000077
0.04 -1000.0 0.9199000000000088
0.045 -1000.0 0.9099000000000099
0.05 -1000.0 0.899900000000011
0.055 -1000.0 0.8899000000000121
0.06 -1000.0 0.8799000000000132
0.065 -813.0 0.8699000000000143
0.07 -906.0 0.8599000000000154
0.075 -773.0 0.8499000000000165
0.08 -1000.0 0.8399000000000176
0.085 -1000.0 0.8299000000000187
0.09 -1000.0 0.8199000000000198
0.095 -1000.0 0.8099000000000209
0.1 -702.0 0.799900000000022
0.105 -735.0 0.7899000000000231
0.11 -442.0 0.7799000000000242
0.115 -835.0 0.7699000000000253
0.12 -522.0 0.7599000000000264
0.125 -414.0 0.7499000000000275
0.13 -352.0 0.7399000000000286
0.135 -829.0 0.7299000000000297
0.14 -431.0 0.7199000000000308
0.145 -706.0 0.709900000000032
0.15 -990.0 0.699900000000033
0.155 -722.0 0.6899000000

In [81]:
# SARSA learning curve: per-episode reward against cumulative environment steps.
plt.plot(episedes_total_s, total_rewards_s)
plt.xlabel('number of steps')
plt.ylabel('reward')
plt.title('SARSA')
plt.show()

<IPython.core.display.Javascript object>

In [7]:
def QLearning(env,episode_n, trajectory_len, noisy_episode_n, gamma=0.9, alpha_step_max=500, alpha=0.5, q = np.array([])):
    """Tabular Q-learning on the 41x41 discretized state grid with 3 actions.

    Parameters
    ----------
    env : environment passed through to ``env_reset`` / ``env_action``.
    episode_n : int
        Number of training episodes.
    trajectory_len : int
        Maximum number of steps per episode.
    noisy_episode_n : unused
        Accepted only so existing callers keep working.
    gamma : float
        Discount factor.
    alpha_step_max : unused
        Accepted only so existing callers keep working.
    alpha : float
        Learning rate; divided by 3 once, at episode ``4 * episode_n // 5``.
    q : numpy.ndarray, optional
        Initial Q-table. It is copied, never modified. An empty array (the
        default) or ``None`` starts from zeros.

    Exploration schedule: epsilon starts at 1 and drops by ``2 / episode_n``
    at the start of every episode, is reset to 0.5 at episode
    ``episode_n // 2``, and never goes below 0.001.

    Returns
    -------
    (total_rewards, cumulative_steps, qfunction)
        ``total_rewards[k]`` is the undiscounted reward of episode ``k``.
        ``cumulative_steps[k]`` is the number of environment steps taken up to
        and including episode ``k``. ``qfunction`` is the learned table of
        shape ``(41 * 41, 3)``.
    """
    total_rewards = np.zeros(episode_n)
    # Slot 0 stays 0 so slot k + 1 can hold the running total after episode k.
    steps_total = np.zeros(episode_n + 1)

    state_n = 41 * 41
    action_n = 3
    if q is not None and q.shape[0]:
        qfunction = q.copy()
    else:
        qfunction = np.zeros((state_n, action_n))

    epsilon = 1
    for episode in range(episode_n):
        epsilon -= 2 / episode_n
        if episode == episode_n // 2:
            epsilon = 0.5
        if epsilon <= 0.001:
            epsilon = 0.001
        if episode == 4 * episode_n // 5:
            alpha /= 3

        state = env_reset(env)
        action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)
        for _ in range(trajectory_len):
            next_state, reward, done = env_action(env, action)

            # Off-policy target: the best next-state value, read directly rather than
            # through a sampling call. A terminal transition has no future return; the
            # grid cell holding the goal also holds non-terminal positions, so its
            # Q-values are not guaranteed to be zero and must not be bootstrapped from.
            if done:
                target = reward
            else:
                target = reward + gamma * np.max(qfunction[next_state])
            qfunction[state][action] += alpha * (target - qfunction[state][action])

            steps_total[episode + 1] += 1
            total_rewards[episode] += reward

            if done:
                break

            # Behaviour policy: epsilon-greedy on the freshly updated table.
            state = next_state
            action = get_epsilon_greedy_action(qfunction[state], epsilon, action_n)

        if episode % 100 == 0:
            print(episode/episode_n,total_rewards[episode], epsilon)

        steps_total[episode + 1] += steps_total[episode]

    return total_rewards, steps_total[1:], qfunction

In [8]:
# Train Q-learning: 20000 episodes of at most 1000 steps each.
total_rewards_q, episedes_total_q, q_q = QLearning(
    env,
    episode_n=20000,
    trajectory_len=1000,
    noisy_episode_n=7000,
    alpha_step_max=8000,
    gamma=0.99,
    alpha=0.2,
)


0.0 -1000.0 0.9999
0.005 -1000.0 0.9899000000000011
0.01 -1000.0 0.9799000000000022
0.015 -1000.0 0.9699000000000033
0.02 -1000.0 0.9599000000000044
0.025 -1000.0 0.9499000000000055
0.03 -1000.0 0.9399000000000066
0.035 -1000.0 0.9299000000000077
0.04 -1000.0 0.9199000000000088
0.045 -1000.0 0.9099000000000099
0.05 -1000.0 0.899900000000011
0.055 -1000.0 0.8899000000000121
0.06 -1000.0 0.8799000000000132
0.065 -1000.0 0.8699000000000143
0.07 -1000.0 0.8599000000000154
0.075 -1000.0 0.8499000000000165
0.08 -1000.0 0.8399000000000176
0.085 -848.0 0.8299000000000187
0.09 -1000.0 0.8199000000000198
0.095 -798.0 0.8099000000000209
0.1 -624.0 0.799900000000022
0.105 -452.0 0.7899000000000231
0.11 -1000.0 0.7799000000000242
0.115 -1000.0 0.7699000000000253
0.12 -398.0 0.7599000000000264
0.125 -412.0 0.7499000000000275
0.13 -411.0 0.7399000000000286
0.135 -562.0 0.7299000000000297
0.14 -427.0 0.7199000000000308
0.145 -401.0 0.709900000000032
0.15 -329.0 0.699900000000033
0.155 -400.0 0.6899000

In [15]:
# Q-learning reward per episode, optionally smoothed with a moving average.
SMOOTH_WINDOW = 1  # window width in episodes; 1 leaves the curve unsmoothed
smoothing_kernel = np.ones(SMOOTH_WINDOW) / SMOOTH_WINDOW
plt.plot(np.convolve(total_rewards_q, smoothing_kernel, mode='same'))
plt.xlabel('episode')
plt.ylabel('reward')
plt.title('Q Learning')
plt.grid(True)
plt.show()

<IPython.core.display.Javascript object>

In [76]:
# Greedy (epsilon = 0) rollout of the SARSA table for at most 600 steps,
# recording the visited states, chosen actions and the total reward.
state = env_reset(env)
action = get_epsilon_greedy_action(q_s[state], epsilon=0, action_n=3)
states, actions = [state], [action]
total_rewardss = 0
for _ in range(600):
    next_state, reward, done = env_action(env, action)
    env.render()

    next_action = get_epsilon_greedy_action(q_s[next_state], epsilon=0, action_n=3)
    state, action = next_state, next_action
    states.append(state)
    actions.append(action)
    total_rewardss += reward
    if done:
        break

In [77]:
# Total (undiscounted) reward accumulated by the greedy rollout cell.
total_rewardss

-98.0

In [78]:
# Greedy-action maps over the discretized state grid.
# Left: SARSA table. The original cell passed `q`, which no cell in this notebook
# defines. NOTE(review): `q_s` is assumed to be the intended table -- confirm.
# Right: Q-learning table with the recorded rollout cells blacked out.
# NOTE(review): `states`/`actions` come from the rollout that follows `q_s`,
# not `q_q` -- confirm this overlay is intended.
#
# build_map puts the largest second-coordinate index in row 0, and imshow
# (default origin='upper') draws row 0 at the top of `extent`. The top must
# therefore be +0.07 on both panels; the left panel previously had it inverted.
plt.subplot(121)
res = build_map(q_s)
plt.imshow(res,extent=[-1.2,1.2,-0.07,0.07], aspect='auto')
plt.xlim((-1.2,0.6))
plt.subplot(122)
res = build_map(q_q,states,actions)
plt.imshow(res,extent=[-1.2,1.2,-0.07,0.07], aspect='auto')
plt.xlim((-1.2,0.6))

<IPython.core.display.Javascript object>

(-1.2, 0.6)

In [79]:
# Save the current figure to disk.
plt.savefig(
    'car_s_good.png',
    bbox_inches='tight',
    dpi=300,
    facecolor='white',
)

In [93]:
def MonteCarlo(env, episode_n, trajectory_len=500, gamma=0.99):
    """Every-visit Monte Carlo control on the 41x41 discretized grid with 3 actions.

    Two Q-tables are kept: ``qfunction_fill`` receives the incremental-mean
    updates, and ``qfunction_act`` is the table the epsilon-greedy behaviour
    policy reads.

    Schedule (fractions of ``episode_n``):
    * epsilon decays linearly as ``1 - 2 * episode / episode_n``; non-positive
      values become 0.01, and after 5/6 of training epsilon is 0.
    * Within [3/8, 4/8] the acting table is frozen and epsilon is fixed at 0.1.
      At the first episode of that window the estimate table and its visit
      counters are reset, so the frozen policy is evaluated from scratch.
    * Outside the window the acting table is refreshed from the estimates at
      the start of every episode.

    Parameters
    ----------
    env : environment passed through to ``env_reset`` / ``env_action``.
    episode_n : int
        Number of episodes.
    trajectory_len : int
        Maximum number of steps per episode.
    gamma : float
        Discount factor used for the returns.

    Returns
    -------
    (total_rewards, cumulative_steps, qfunction_act)
        Two lists of length ``episode_n`` and the acting Q-table. The acting
        table is the one synced at the start of the last episode, so it does
        not include that episode's own update.
    """
    total_rewards = [0]
    steps_total = [0]
    state_n = 41 * 41
    action_n = 3
    qfunction_act = np.zeros((state_n, action_n))
    qfunction_fill = np.zeros((state_n, action_n))
    counter = np.zeros((state_n, action_n))

    freeze_start = episode_n * 3 / 8
    freeze_end = episode_n * 4 / 8
    # First episode inside the freeze window. An integer is used so the reset also
    # fires when episode_n is not a multiple of 8 (float equality never matched then).
    reset_episode = int(np.ceil(freeze_start))

    for episode in range(episode_n):
        in_freeze = freeze_start <= episode <= freeze_end

        epsilon = 1 - 2 * episode / episode_n
        if in_freeze:
            epsilon = 0.1
        if epsilon <= 0:
            epsilon = 0.01
        if episode > episode_n * 5 / 6:
            epsilon = 0

        if not in_freeze:
            qfunction_act = qfunction_fill.copy()
        if episode == reset_episode:
            qfunction_fill = np.zeros((state_n, action_n))
            # The counters must restart with the estimates; otherwise the incremental
            # mean below divides new returns by stale visit counts.
            counter = np.zeros((state_n, action_n))

        trajectory = {'states': [], 'actions': [], 'rewards': []}

        state = env_reset(env)
        for _ in range(trajectory_len):
            trajectory['states'].append(state)

            action = get_epsilon_greedy_action(qfunction_act[state], epsilon, action_n)
            trajectory['actions'].append(action)

            state, reward, done = env_action(env, action)
            trajectory['rewards'].append(reward)

            if done:
                break

        real_trajectory_len = len(trajectory['rewards'])

        total_rewards.append(sum(trajectory['rewards']))
        steps_total.append(steps_total[-1] + real_trajectory_len)
        if episode % 100 == 0:
            print(episode/episode_n,total_rewards[-1])

        # Discounted return from each step, built backwards; returns[T] = 0.
        returns = np.zeros(real_trajectory_len + 1)
        for t in range(real_trajectory_len - 1, -1, -1):
            returns[t] = trajectory['rewards'][t] + gamma * returns[t + 1]

        # Every-visit incremental mean of the observed returns.
        for t in range(real_trajectory_len):
            state = trajectory['states'][t]
            action = trajectory['actions'][t]
            qfunction_fill[state][action] += (returns[t] - qfunction_fill[state][action]) / (1 + counter[state][action])
            counter[state][action] += 1

    return total_rewards[1:], steps_total[1:], qfunction_act
    

In [94]:
# Train Monte Carlo control: 60000 episodes of at most 1000 steps each.
total_rewards_monte, episodes_monte, q_m = MonteCarlo(
    env,
    episode_n=60000,
    trajectory_len=1000,
    gamma=0.9999,
)

0.0 -1000.0
0.0016666666666666668 -1000.0
0.0033333333333333335 -1000.0
0.005 -1000.0
0.006666666666666667 -1000.0
0.008333333333333333 -1000.0
0.01 -1000.0
0.011666666666666667 -1000.0
0.013333333333333334 -1000.0
0.015 -1000.0
0.016666666666666666 -1000.0
0.018333333333333333 -1000.0
0.02 -1000.0
0.021666666666666667 -1000.0
0.023333333333333334 -1000.0
0.025 -1000.0
0.02666666666666667 -1000.0
0.028333333333333332 -1000.0
0.03 -1000.0
0.03166666666666667 -1000.0
0.03333333333333333 -1000.0
0.035 -1000.0
0.03666666666666667 -1000.0
0.03833333333333333 -1000.0
0.04 -1000.0
0.041666666666666664 -1000.0
0.043333333333333335 -1000.0
0.045 -1000.0
0.04666666666666667 -1000.0
0.04833333333333333 -1000.0
0.05 -1000.0
0.051666666666666666 -1000.0
0.05333333333333334 -1000.0
0.055 -1000.0
0.056666666666666664 -1000.0
0.058333333333333334 -1000.0
0.06 -1000.0
0.06166666666666667 -1000.0
0.06333333333333334 -1000.0
0.065 -1000.0
0.06666666666666667 -1000.0
0.06833333333333333 -1000.0
0.07 -1000

KeyboardInterrupt: 

In [89]:
# Monte Carlo reward per episode.
# NOTE(review): the recorded MonteCarlo run ended in KeyboardInterrupt, so
# `total_rewards_monte` may come from an earlier execution -- re-run training first.
plt.plot(total_rewards_monte)
plt.xlabel('episode')
plt.ylabel('reward')
plt.title('Monte Carlo')
plt.show()

<IPython.core.display.Javascript object>

In [96]:
# SARSA vs. Q-learning: per-episode reward against cumulative environment steps.
fig, ax = plt.subplots()
ax.plot(episedes_total_s, total_rewards_s, label='SARSA')
ax.plot(episedes_total_q, total_rewards_q, label='Q Learning')
ax.set_xlabel('number of steps')
ax.set_ylabel('reward')
ax.legend()
ax.grid(True)

<IPython.core.display.Javascript object>

In [97]:
# Save through the explicit figure handle created by the comparison-plot cell.
# plt.savefig targets whatever figure is "current", which can be a different or
# empty figure depending on the backend and on cell execution order.
fig.savefig('fig4_2.png', dpi=300, bbox_inches='tight', facecolor='white')