In [1]:
import numpy as np
import pandas as pd
from gridworld import Environment, Agent
import matplotlib.pyplot as plt

In [2]:
def update_q(q, eligibility_trace, alpha, gamma, state_prime, state, reward_prime, action):
    """Apply one TD backup, weighted by the eligibility traces, to the whole table.

    The TD error bootstraps from the largest action value of the successor
    state and is added to every entry of ``q`` in proportion to that entry's
    eligibility trace.

    Parameters
    ----------
    q : np.ndarray
        Action-value table indexed as ``q[state, action]``.
    eligibility_trace : np.ndarray
        Trace table that is multiplied element-wise into the update.
    alpha : float
        Step size.
    gamma : float
        Discount factor applied to the successor state's best value.
    state_prime : int
        State reached after the transition.
    state : int
        State the action was taken from.
    reward_prime : float
        Reward observed on the transition.
    action : int
        Action taken in ``state``.

    Returns
    -------
    np.ndarray
        A new table; the array passed in as ``q`` is left unmodified.
    """
    td_target = reward_prime + gamma * q[state_prime].max()
    td_error = td_target - q[state, action]
    step = alpha * td_error
    return q + step * eligibility_trace
    
def update_eligibilty_trace(q, eligibility_trace, gamma,lambd, state, action):
    """Return the eligibility traces after taking ``action`` in ``state``.

    If ``action`` attains the maximum of ``q[state, :]`` (ties count as
    greedy) every trace is scaled by ``gamma * lambd``; otherwise every trace
    is multiplied by zero.  The entry for the visited ``(state, action)`` pair
    is then set to its scaled value plus one.

    Parameters
    ----------
    q : np.ndarray
        Action-value table indexed as ``q[state, action]``.
    eligibility_trace : np.ndarray
        Current trace table.
    gamma : float
        Discount factor.
    lambd : float
        Trace-decay parameter.
    state : int
        State the action was taken from.
    action : int
        Action that was taken.

    Returns
    -------
    np.ndarray
        A new trace table; ``eligibility_trace`` itself is not modified.
    """
    is_greedy = 1 if q[state, action] == q[state, :].max() else 0
    # A zero factor wipes every trace when the action taken was not greedy.
    decay = is_greedy * gamma * lambd
    updated_trace = decay * eligibility_trace
    updated_trace[state, action] = decay * eligibility_trace[state, action] + 1
    return updated_trace

def update_policy(q):
    """Return the greedy policy for an action-value table.

    Parameters
    ----------
    q : np.ndarray
        Table indexed as ``q[state, action]``.

    Returns
    -------
    np.ndarray
        For each row of ``q``, the column index holding the largest value
        (NumPy's ``argmax`` returns the first such index on ties).
    """
    return q.argmax(axis=1)

def initialize():
    """Create a fresh agent and environment for the start of an episode.

    Returns
    -------
    tuple
        ``(agent, env, state, reward, episode_finished)`` where

        * ``agent`` is a new ``Agent`` whose ``policy`` is a length-12 array
          of random integers in ``[0, 4)``;
        * ``env`` is a new ``Environment`` built with
          ``random_initial_state=True``;
        * ``state`` is ``env.current_state``;
        * ``reward`` is ``env.initial_reward``;
        * ``episode_finished`` is ``False``.
    """
    agent = Agent()
    # A random policy to be the policy to be followed (the behaviour policy, mu).
    agent.policy = np.random.randint(0, 4, (12,))
    env = Environment(random_initial_state=True)
    # The value table and the traces belong to the learner, not to the
    # per-episode setup, so nothing is allocated for them here.
    return agent, env, env.current_state, env.initial_reward, False

In [30]:
def sarsa(convergence_criterion =20000, alpha=0.01, gamma=0.999, lambd = 0.0, epsilon = None):
    """Learn a greedy policy while the agent follows a fixed, suboptimal policy.

    Every step applies ``update_eligibilty_trace`` followed by ``update_q`` and
    then re-derives the greedy policy from ``q``.  Learning stops once that
    greedy policy has stayed unchanged for ``convergence_criterion``
    consecutive steps.  Progress is printed every 50000 steps and the final
    greedy policy is rendered at the end.

    NOTE: despite the function's name, the target used by ``update_q`` is the
    maximum action value of the successor state (a Q-learning target), not
    the value of the action actually taken next.

    Parameters
    ----------
    convergence_criterion : int
        Number of consecutive steps the greedy policy must remain unchanged.
    alpha : float
        Step size passed to ``update_q``.
    gamma : float
        Discount factor passed to both update functions.
    lambd : float
        Trace-decay parameter passed to ``update_eligibilty_trace``.
    epsilon : float or None
        Forwarded unchanged to ``agent.step``.

    Returns
    -------
    None
        Results are reported through ``print`` and ``agent.render_policy``.
    """
    n_iters = 0
    same_policy_iter = 0
    q = np.zeros((12, 4))
    eligibility_trace = np.zeros_like(q)
    agent, env, state, _, episode_finished = initialize()
    no_action_states = env.impossible_states + env.terminal_states
    behaviour_policy = np.array([3, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 0]) # A suboptimal policy
    agent.policy = behaviour_policy.copy()
    # Work on a copy: masking the no-action states below must never write
    # through into the policy the agent is actually following.
    current_policy = behaviour_policy.copy()
    current_policy[no_action_states] = 0
    print('The policy being used')
    agent.render_policy()
    while same_policy_iter < convergence_criterion:
        action = agent.step(state, epsilon = epsilon)
        state_prime, reward_prime, episode_finished = env.step(action)
        eligibility_trace = update_eligibilty_trace(q, eligibility_trace, gamma, lambd, state, action)
        q = update_q(q, eligibility_trace, alpha, gamma, state_prime, state, reward_prime, action)

        previous_policy = current_policy
        current_policy = update_policy(q)
        # Actions in impossible/terminal states are meaningless, so they are
        # pinned to 0 to keep them from affecting the convergence check.
        current_policy[no_action_states] = 0
        state = state_prime

        if np.array_equal(previous_policy, current_policy):
            same_policy_iter += 1
        else:
            same_policy_iter = 0
        n_iters += 1
        if n_iters%50000 == 0:
            print('Iteration {} ---- Current policy same for {} iterations'.format(n_iters, same_policy_iter))
            agent.render_policy(custom_policy = current_policy)
        if episode_finished:
            agent, env, state, _, episode_finished = initialize()
            # initialize() hands back an agent with a random policy; restore
            # the behaviour policy announced above so it is really the one
            # followed in every episode.
            agent.policy = behaviour_policy.copy()
            # Traces describe the trajectory of a single episode; clearing
            # them stops credit leaking into the next episode when lambd > 0.
            eligibility_trace = np.zeros_like(q)
    print('Final iteration number {}'.format(n_iters))
    agent.render_policy(custom_policy = current_policy)

In [32]:
# Train with epsilon = 0.2 and lambda = 0; with lambda = 0 only the trace of the
# state-action pair just visited is non-zero, so each update is one-step.
sarsa(lambd=0.0, epsilon=0.2)

The policy being used
[['v' '<' '<' '>']
 ['v' '*' 'v' '*']
 ['>' '>' '>' '*']]


Iteration 50000 ---- Current policy same for 8324 iterations
[['v' '<' 'v' '<']
 ['v' '*' 'v' '*']
 ['>' '>' '>' '*']]


Final iteration number 61676
[['v' '<' 'v' '<']
 ['v' '*' 'v' '*']
 ['>' '>' '>' '*']]


