In [2]:
import sys
import gym
import numpy as np

In [3]:
env = gym.make('CliffWalking-v0')

In [4]:
env.action_space

Discrete(4)

In [5]:
env.observation_space

Discrete(48)

In [6]:
env.observation_space.sample()

44

In [7]:
#random policy

for i in range(3):
    state = env.reset()
    total_reward = 0
    print(state)
    while True:
        #print(state)
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        total_reward+=reward
        if done:
            print(total_reward)
            print("done\n")
            break

36
-96007
done

36
-58430
done

36
-174913
done



In [8]:
def max_index(q_state):
    max = 0
    index = 0
    
    for i in range(len(q_state)):
        if q_state[i]>max:
            max = q_state[i]
            index = i
    return index

In [23]:
def select_action(env,q_s,episode_i):
    
    epsilon = 1/episode_i
    probs = np.ones(env.action_space.n)*(epsilon/env.action_space.n)
    probs[np.argmax(q_s)] = (1-epsilon)+(epsilon/env.action_space.n)
    action = np.random.choice(np.arange(env.action_space.n),p=probs)
    return action

In [35]:
def sarsa(env,num_episodes,alpha,gamma=1):
    
    
    q = {}
    #Initialise q table
    for i in range(env.observation_space.n):
        q[i] = np.zeros(env.action_space.n)
    
    for episode_i in range(1,num_episodes+1):
        if episode_i % 100 == 0:
            print("\rEpisode {}/{}".format(episode_i, num_episodes), end="")
            sys.stdout.flush()
        
        #Initiate the episode
        state = env.reset()
        
        #Select action for initial state using epsilon greedy technique
        action = select_action(env,q[state],episode_i)
        
        #Limiting the number of time steps in an episode
        for t_step in range(300):
            new_state, reward, done, info = env.step(action)
            if done:
                q[state][action] = q[state][action]+alpha*(reward-q[state][action])
                break
            new_action = select_action(env,q[new_state],episode_i)
            d_reward = reward+gamma*q[new_state][new_action]
            q[state][action] = q[state][action]+alpha*(d_reward-q[state][action])
            state = new_state
            action = new_action
            
    return q

In [36]:
q = sarsa(env,5000,0.01)

Episode 5000/5000

In [37]:
q

{0: array([-10.83352852, -10.83775705, -10.83676747, -10.83982553]),
 1: array([-10.36098914, -10.36008403, -10.36510209, -10.36234077]),
 2: array([-9.73312883, -9.72146152, -9.72738514, -9.72966008]),
 3: array([-9.01531147, -9.01467954, -9.01333794, -9.01374689]),
 4: array([-8.26695582, -8.26336939, -8.26906108, -8.27488701]),
 5: array([-7.48992921, -7.4909628 , -7.49540621, -7.49335548]),
 6: array([-6.72090318, -6.70363689, -6.70519868, -6.71266003]),
 7: array([-5.92489337, -5.90740348, -5.90953497, -5.92318005]),
 8: array([-5.11055467, -5.11242399, -5.11303452, -5.11630275]),
 9: array([-4.33342821, -4.3239973 , -4.32247106, -4.32871794]),
 10: array([-3.56609815, -3.55580356, -3.55671498, -3.55324285]),
 11: array([-2.84852597, -2.85987167, -2.8499475 , -2.85527388]),
 12: array([-11.27544977, -11.27226963, -11.28389281, -11.27553848]),
 13: array([-10.61184319, -10.60870046, -10.61523635, -10.6136753 ]),
 14: array([-9.84695081, -9.84785618, -9.85161747, -9.85907336]),
 15:

In [55]:
def q_learning(env, num_episodes, alpha, gamma=1.0):
    
    q = {}
    #Initialise q table
    for i in range(env.observation_space.n):
        q[i] = np.zeros(env.action_space.n)
    
    for episode_i in range(1,num_episodes+1):
        
        if episode_i%100==0:
            print("\rEpisode: {}/{}".format(episode_i,num_episodes),end="")
            sys.stdout.flush()
        #Reset state for episode start
        state = env.reset()
        
        #Select initial action for the episode
        action = select_action(env,q[state],episode_i)
        
        #Limiting the length of the episode
        for t_step in range(300):
            new_state, reward, done, info = env.step(action)

            if done:
                q[state][action] = q[state][action]+alpha(reward-q[state][action])
                break
            else:
                d_reward = reward + gamma*(np.amax(q[new_state]))
                q[state][action] = q[state][action] + alpha*(d_reward-q[state][action])
                action = select_action(env,q[new_state],episode_i)
    return q

In [None]:
q_sarsa_max = q_learning(env,5000,0.01)

Episode: 1600/5000

In [54]:
q_sarsa_max

{0: array([0., 0., 0., 0.]),
 1: array([0., 0., 0., 0.]),
 2: array([0., 0., 0., 0.]),
 3: array([0., 0., 0., 0.]),
 4: array([0., 0., 0., 0.]),
 5: array([0., 0., 0., 0.]),
 6: array([0., 0., 0., 0.]),
 7: array([0., 0., 0., 0.]),
 8: array([0., 0., 0., 0.]),
 9: array([0., 0., 0., 0.]),
 10: array([0., 0., 0., 0.]),
 11: array([0., 0., 0., 0.]),
 12: array([0., 0., 0., 0.]),
 13: array([0., 0., 0., 0.]),
 14: array([0., 0., 0., 0.]),
 15: array([0., 0., 0., 0.]),
 16: array([0., 0., 0., 0.]),
 17: array([0., 0., 0., 0.]),
 18: array([0., 0., 0., 0.]),
 19: array([0., 0., 0., 0.]),
 20: array([0., 0., 0., 0.]),
 21: array([0., 0., 0., 0.]),
 22: array([0., 0., 0., 0.]),
 23: array([0., 0., 0., 0.]),
 24: array([0., 0., 0., 0.]),
 25: array([0., 0., 0., 0.]),
 26: array([0., 0., 0., 0.]),
 27: array([0., 0., 0., 0.]),
 28: array([0., 0., 0., 0.]),
 29: array([0., 0., 0., 0.]),
 30: array([0., 0., 0., 0.]),
 31: array([0., 0., 0., 0.]),
 32: array([0., 0., 0., 0.]),
 33: array([0., 0., 

In [None]:
def expected_sarsa(env,num_episodes,alpha,gamma=1):
    
    q = {}
    for i in range(env.observation_space.n):
        q[i] = 