# SARSA algorithm: On-policy TD control. Finds the optimal epsilon-greedy policy.
    
    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.
    
    Returns:
        A tuple (Q, stats).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

In [1]:
pip install --upgrade gym==0.19.0

Note: you may need to restart the kernel to use updated packages.




In [2]:
import numpy as np
from collections import defaultdict
from windy_gridworld import WindyGridworldEnv

# This is a standard grid-world, with start and goal states, but with one difference: there is a crosswind upward through the middle of the grid. The actions are the standard four — up, down, right, and left — but in the middle region the resultant next states are shifted upward by a “wind,” the strength of which varies from column to column. The strength of the wind is given below each column, in number of cells shifted upward. For example, if you are one cell to the right of the goal, then the action left takes you to the cell just above the goal. Let us treat this as an undiscounted episodic task, with constant rewards of −1 until the goal state is reached.

In [3]:
pip install --upgrade numpy

Note: you may need to restart the kernel to use updated packages.




In [4]:
# Build the environment and read the sizes of its discrete action/state spaces.
env = WindyGridworldEnv()
nA = env.action_space.n  # number of actions (4 for this environment)
nB=env.observation_space.n  # number of states (70 for this environment)
# Hyperparameters, read as module-level globals by the functions defined below.
epsilon = 0.1  # exploration probability of the epsilon-greedy policy
gamma = 1.0  # discount factor (1.0 = undiscounted)
alpha=0.1  # TD learning rate

In [5]:
nA


4

In [6]:
nB

70

In [7]:
def get_epision_greedy_action_policy(Q, observation, eps=None, n_actions=None):
    """Return the epsilon-greedy action probabilities for one state.

    Every action receives a base probability of ``eps / n_actions``; the
    action with the highest value in ``Q[observation]`` additionally receives
    the remaining ``1 - eps`` (``np.argmax`` picks the first maximum on ties).

    Args:
        Q: Mapping from state to an array of action values.
        observation: The state to build the action distribution for.
        eps: Exploration probability. Defaults to the module-level
            ``epsilon`` when omitted.
        n_actions: Number of actions. Defaults to the module-level ``nA``
            when omitted.

    Returns:
        A float numpy array of length ``n_actions`` whose entries sum to 1.
    """
    # Fall back to the notebook globals so existing two-argument calls
    # behave exactly as before.
    if eps is None:
        eps = epsilon
    if n_actions is None:
        n_actions = nA

    probs = np.full(n_actions, eps / n_actions, dtype=float)
    probs[np.argmax(Q[observation])] += 1.0 - eps
    return probs

The numpy.random.choice() method draws random samples from a given one-dimensional array and returns them as a NumPy array (or as a single value when size is None).

Syntax : numpy.random.choice(a, size=None, replace=True, p=None)

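Parameters:

1) a – 1-D array-like (or an int) from whose elements the random samples are drawn; an int n is treated as np.arange(n).

2) size – Output shape of the returned array of samples; if None, a single value is returned.

3) replace – Whether the sampling is done with or without replacement.

4) p – The probabilities associated with each entry in a; if omitted, a uniform distribution over the entries is assumed.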



In [8]:
def sarsa(total_episodes):
    """Run tabular SARSA for a fixed number of episodes and return Q.

    Reads the module-level ``env``, ``nA``, ``gamma`` and ``alpha``; actions
    are drawn from the distribution returned by
    ``get_epision_greedy_action_policy``.

    Args:
        total_episodes: Number of episodes to run.

    Returns:
        A defaultdict mapping each visited state to a numpy array of
        action-value estimates (one entry per action).
    """
    # Unseen states start with an all-zero action-value vector.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    def draw_action(state):
        # Sample one action index from the epsilon-greedy distribution.
        weights = get_epision_greedy_action_policy(Q, state)
        return np.random.choice(np.arange(nA), p=weights)

    for _ in range(total_episodes):
        state = env.reset()
        action = draw_action(state)

        finished = False
        while not finished:
            # Take the action, then choose the follow-up action on-policy.
            successor, reward, finished, _ = env.step(action)
            successor_action = draw_action(successor)

            # Move Q(s, a) a fraction alpha toward r + gamma * Q(s', a').
            target = reward + gamma * Q[successor][successor_action]
            Q[state][action] += alpha * (target - Q[state][action])

            # Advance to the next (state, action) pair.
            state, action = successor, successor_action

    return Q

In [9]:
Q = sarsa(100)

In [10]:
# Here is the output: Q maps every visited state to a numpy array of length nA
# holding the estimated value of each action in that state.
# Note: these are action values, not action probabilities; the per-action
# probabilities come from get_epision_greedy_action_policy(Q, observation).
Q

defaultdict(<function __main__.sarsa.<locals>.<lambda>()>,
            {30: array([-6.75022711, -6.73180308, -6.69706656, -6.76651886]),
             20: array([-7.06406892, -6.91213625, -6.86325296, -6.9427506 ]),
             10: array([-7.2644577 , -7.26094489, -7.213956  , -7.41726457]),
             21: array([-6.7144231 , -6.83955798, -6.73597922, -6.79729104]),
             11: array([-7.2567834 , -7.24984702, -7.17687721, -7.23151416]),
             1: array([-7.71986104, -7.64695175, -7.55006933, -7.62223619]),
             2: array([-8.18631008, -8.09019449, -8.06505731, -8.0602675 ]),
             3: array([-8.85661618, -8.89708766, -8.77506107, -8.79742876]),
             4: array([-9.41529913, -9.46846287, -9.40135412, -9.35959525]),
             5: array([-9.49199668, -9.51913427, -9.62701117, -9.49879515]),
             6: array([-8.91920892, -8.89581905, -9.04014548, -8.94235146]),
             7: array([-8.04597363, -7.97570202, -8.12579311, -8.34572518]),
            