In [1]:
import sys
import gym
import numpy as np

In [2]:
# Create the CliffWalking-v0 environment via gym's registry; the cells below
# read this module-level `env` for spaces, resets and steps.
env = gym.make('CliffWalking-v0')

In [3]:
# Inspect the action space (the recorded run displayed Discrete(4)).
env.action_space

Discrete(4)

In [4]:
# Inspect the observation space (the recorded run displayed Discrete(48)).
env.observation_space

Discrete(48)

In [5]:
# Draw one sample from the observation space (the recorded run returned 44).
env.observation_space.sample()

44

In [6]:
# Random-policy baseline: run three episodes in which every action is drawn
# from env.action_space.sample(), printing the start state and then the total
# reward accumulated by the time the environment reports `done`.

for i in range(3):
    state = env.reset()
    print(state)
    total_reward = 0
    done = False
    # Keep stepping until the environment signals termination.
    while not done:
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        total_reward += reward
    print(total_reward)
    print("done\n")

36
-96007
done

36
-58430
done

36
-174913
done



In [7]:
def max_index(q_state):
    """Return the index of the largest entry in ``q_state``.

    Parameters
    ----------
    q_state : sequence of numbers
        Action values for a single state (list, tuple or 1-D array).

    Returns
    -------
    int
        Index of the first occurrence of the maximum value, or 0 when
        ``q_state`` is empty.
    """
    best_index = 0
    best_value = None
    for i, value in enumerate(q_state):
        # Seed the running maximum from the data itself rather than from a
        # literal 0, so that all-negative inputs are handled correctly.
        # Strict '>' keeps the first index on ties.
        if best_value is None or value > best_value:
            best_value = value
            best_index = i
    return best_index

In [8]:
def select_action(env,q_s,episode_i):
    """Sample an action epsilon-greedily with respect to ``q_s``.

    Epsilon is ``1/episode_i``. Every action receives probability
    ``epsilon/n``; the action at ``np.argmax(q_s)`` additionally receives
    ``1-epsilon``.

    Parameters
    ----------
    env : object exposing ``action_space.n`` (the number of actions)
    q_s : array-like of action values for the current state
    episode_i : episode counter used to derive epsilon; must be non-zero

    Returns
    -------
    The sampled action index, drawn with ``np.random.choice``.
    """
    n_actions = env.action_space.n
    epsilon = 1/episode_i
    exploration_share = epsilon/n_actions

    # Start from the uniform exploration mass, then give the greedy action
    # the remaining probability.
    probs = np.full(n_actions, exploration_share)
    greedy = np.argmax(q_s)
    probs[greedy] = (1-epsilon)+exploration_share

    candidates = np.arange(n_actions)
    return np.random.choice(candidates,p=probs)

In [9]:
def sarsa(env,num_episodes,alpha,gamma=1):
    """Tabular SARSA (on-policy TD control).

    Parameters
    ----------
    env : environment whose ``reset()`` returns a state and whose
        ``step(action)`` returns ``(state, reward, done, info)``; it must
        expose ``observation_space.n`` and ``action_space.n``
    num_episodes : number of episodes to run
    alpha : step size of the value update
    gamma : discount applied to the bootstrapped next-state value

    Returns
    -------
    dict mapping each state index to a numpy array of action values.
    """
    # One zero-initialised row of action values per state.
    q = {s: np.zeros(env.action_space.n) for s in range(env.observation_space.n)}

    for episode_i in range(1,num_episodes+1):
        # Progress report every 100 episodes, overwriting the same line.
        if episode_i % 100 == 0:
            print("\rEpisode {}/{}".format(episode_i, num_episodes), end="")
            sys.stdout.flush()

        state = env.reset()
        action = select_action(env,q[state],episode_i)

        # Episodes are truncated after 300 steps.
        for _ in range(300):
            new_state, reward, done, _info = env.step(action)

            if done:
                # Terminal transition: nothing to bootstrap from.
                target = reward
            else:
                # The next action is chosen before the update, and that same
                # action is the one executed on the next step.
                new_action = select_action(env,q[new_state],episode_i)
                target = reward+gamma*q[new_state][new_action]

            q[state][action] += alpha*(target-q[state][action])

            if done:
                break
            state, action = new_state, new_action

    return q

In [10]:
# Train SARSA for 5000 episodes with alpha=0.01 (gamma stays at its default of 1).
q = sarsa(env,5000,0.01)

Episode 5000/5000

In [11]:
# Display the action-value table returned by sarsa().
q

{0: array([-10.85372504, -10.84408028, -10.83938087, -10.85024928]),
 1: array([-10.37033166, -10.36460965, -10.36538761, -10.37351232]),
 2: array([-9.74230081, -9.72583345, -9.73042918, -9.73824682]),
 3: array([-9.02495462, -9.01611296, -9.02080419, -9.0188189 ]),
 4: array([-8.27768661, -8.26704303, -8.26674348, -8.2754646 ]),
 5: array([-7.50961746, -7.49392401, -7.49406791, -7.49971065]),
 6: array([-6.71252954, -6.70539527, -6.70824613, -6.71878319]),
 7: array([-5.91523533, -5.91076341, -5.91438519, -5.91616394]),
 8: array([-5.11958623, -5.11526759, -5.11388636, -5.1274691 ]),
 9: array([-4.34285577, -4.32661934, -4.32463795, -4.3347765 ]),
 10: array([-3.55619177, -3.55890233, -3.55938789, -3.56337461]),
 11: array([-2.86831459, -2.86999461, -2.85178025, -2.85494346]),
 12: array([-11.27862475, -11.27409246, -11.28363429, -11.28545689]),
 13: array([-10.61318114, -10.60875763, -10.61673427, -10.62264871]),
 14: array([-9.85278422, -9.84828221, -9.8480089 , -9.86128294]),
 15:

In [16]:
def q_learning(env, num_episodes, alpha, gamma=1.0):
    """Tabular Q-learning (off-policy TD control).

    Parameters
    ----------
    env : environment whose ``reset()`` returns a state and whose
        ``step(action)`` returns ``(state, reward, done, info)``; it must
        expose ``observation_space.n`` and ``action_space.n``
    num_episodes : number of episodes to run
    alpha : step size of the value update
    gamma : discount applied to the maximum next-state action value

    Returns
    -------
    dict mapping each state index to a numpy array of action values.
    """
    # One zero-initialised row of action values per state.
    q = {s: np.zeros(env.action_space.n) for s in range(env.observation_space.n)}

    for episode_i in range(1,num_episodes+1):
        # Progress report every 100 episodes, overwriting the same line.
        if episode_i%100==0:
            print("\rEpisode: {}/{}".format(episode_i,num_episodes),end="")
            sys.stdout.flush()

        state = env.reset()
        action = select_action(env,q[state],episode_i)

        # Episodes are truncated after 300 steps.
        for _ in range(300):
            new_state, reward, done, _info = env.step(action)

            # Bootstrap from the best next-state value unless the episode ended.
            if done:
                target = reward
            else:
                target = reward + gamma*(np.max(q[new_state]))

            q[state][action] += alpha*(target-q[state][action])

            if done:
                break

            # The behaviour action is drawn after the update, so it sees the
            # freshly updated row when new_state happens to equal state.
            action = select_action(env,q[new_state],episode_i)
            state = new_state
    return q

In [17]:
# Train Q-learning for 5000 episodes with alpha=0.01 (gamma stays at its default of 1.0);
# the resulting table is stored under the name q_sarsa_max.
q_sarsa_max = q_learning(env,5000,0.01)

Episode: 5000/5000

In [18]:
# Display the action-value table returned by q_learning().
q_sarsa_max

{0: array([-10.83845175, -10.83562333, -10.84255784, -10.83925423]),
 1: array([-10.3589916 , -10.35911895, -10.36467582, -10.36164715]),
 2: array([-9.72930908, -9.72288439, -9.72509914, -9.73568694]),
 3: array([-9.01882019, -9.01443372, -9.01915413, -9.01737972]),
 4: array([-8.26951944, -8.26460198, -8.26915433, -8.26505527]),
 5: array([-7.4897498 , -7.49099231, -7.49365883, -7.49820304]),
 6: array([-6.70820043, -6.70289107, -6.70381942, -6.71406212]),
 7: array([-5.90956328, -5.90747798, -5.90959139, -5.92093037]),
 8: array([-5.10982893, -5.11092051, -5.11571557, -5.1256182 ]),
 9: array([-4.32985196, -4.32246876, -4.3246565 , -4.33979857]),
 10: array([-3.55980713, -3.55642398, -3.55419035, -3.56437302]),
 11: array([-2.84975699, -2.84962079, -2.84915289, -2.86181069]),
 12: array([-11.27568654, -11.27090537, -11.27280119, -11.27869397]),
 13: array([-10.60918142, -10.60840264, -10.61016897, -10.60614191]),
 14: array([-9.85259423, -9.84811792, -9.84689854, -9.85486556]),
 15:

In [30]:
def find_cum_reward(env,q_s,episode_i):
    """Expected action value of a state under the epsilon-greedy policy.

    The probabilities are built the same way ``select_action`` builds them:
    epsilon is ``1/episode_i``, every action gets ``epsilon/n`` and the
    action at ``np.argmax(q_s)`` gets an extra ``1-epsilon``.

    Parameters
    ----------
    env : object exposing ``action_space.n`` (the number of actions)
    q_s : array-like of action values for one state; its length must equal
        ``env.action_space.n``
    episode_i : episode counter used to derive epsilon; must be non-zero

    Returns
    -------
    The probability-weighted sum of ``q_s``.
    """
    # Read the action count from the action space, matching select_action,
    # instead of relying on the environment-specific `nA` attribute.
    n_actions = env.action_space.n
    epsilon = 1/episode_i

    probs = np.full(n_actions, epsilon/n_actions)
    probs[np.argmax(q_s)] = (1-epsilon)+(epsilon/n_actions)

    # Dot product replaces the manual accumulation loop (which also shadowed
    # the builtin `sum`).
    return np.dot(probs, q_s)

In [38]:
def expected_sarsa(env,num_episodes,alpha,gamma=1):
    """Tabular Expected SARSA.

    The TD target bootstraps from the expectation of the next state's action
    values under the epsilon-greedy policy (computed by ``find_cum_reward``),
    discounted by ``gamma``.

    Parameters
    ----------
    env : environment whose ``reset()`` returns a state and whose
        ``step(action)`` returns ``(state, reward, done, info)``; it must
        expose ``observation_space.n`` and ``action_space.n``
    num_episodes : number of episodes to run
    alpha : step size of the value update
    gamma : discount applied to the expected next-state value

    Returns
    -------
    dict mapping each state index to a numpy array of action values.
    """
    # Initialise the Q table. The action count comes from action_space.n,
    # as in sarsa() and q_learning().
    n_actions = env.action_space.n
    q = {}
    for i in range(env.observation_space.n):
        q[i] = np.zeros(n_actions)

    for episode_i in range(1,num_episodes+1):

        # Progress report every 100 episodes, overwriting the same line.
        if episode_i%100==0:
            print("\rEpisode: {}/{}".format(episode_i,num_episodes),end="")
            sys.stdout.flush()

        # Reset the environment and pick the first action.
        state = env.reset()
        action = select_action(env,q[state],episode_i)

        # Episodes are truncated after 300 steps.
        for t_step in range(300):
            new_state,reward,done,info = env.step(action)

            if done:
                # Terminal transition: nothing to bootstrap from.
                q[state][action] = q[state][action]+alpha*(reward-q[state][action])
                break

            # Apply the discount to the bootstrap term; previously `gamma`
            # was accepted but never used, so non-default values were ignored.
            d_reward = reward + gamma*find_cum_reward(env,q[new_state],episode_i)
            q[state][action] = q[state][action]+alpha*(d_reward-q[state][action])
            action = select_action(env,q[new_state],episode_i)
            state = new_state

    return q
    
    

In [41]:
# Train Expected SARSA for 5000 episodes with alpha=0.01 (gamma stays at its default of 1).
q_exsarsa = expected_sarsa(env,5000,0.01)

Episode: 5000/5000

In [42]:
# Display the action-value table returned by expected_sarsa().
q_exsarsa

{0: array([-10.84043556, -10.83846808, -10.84463802, -10.83919164]),
 1: array([-10.35782744, -10.35633396, -10.3673726 , -10.36778016]),
 2: array([-9.72009056, -9.72233838, -9.72784496, -9.72714661]),
 3: array([-9.01911095, -9.01183234, -9.01878301, -9.02593526]),
 4: array([-8.26020774, -8.26163985, -8.26833696, -8.2733428 ]),
 5: array([-7.49028031, -7.48735425, -7.49208925, -7.49737295]),
 6: array([-6.70050744, -6.69947219, -6.70599093, -6.69875008]),
 7: array([-5.91019093, -5.90554448, -5.90886767, -5.91137288]),
 8: array([-5.11020708, -5.10852857, -5.1140984 , -5.12485635]),
 9: array([-4.31998023, -4.31934426, -4.32260805, -4.32690499]),
 10: array([-3.55995129, -3.55279317, -3.55451778, -3.55193578]),
 11: array([-2.85014778, -2.84981728, -2.84849153, -2.85106004]),
 12: array([-11.27661797, -11.27188458, -11.28682457, -11.27903792]),
 13: array([-10.60789116, -10.60758399, -10.60902561, -10.62115936]),
 14: array([-9.85071751, -9.84861704, -9.84803553, -9.84776022]),
 15: