# Epsilon-Greedy Policy: On-Policy Monte Carlo Control


The best known action, based on our experience so far, is selected with probability (1 - epsilon); the rest of the time, i.e. with probability epsilon, an action is selected uniformly at random.

Initially epsilon is 1, so that we explore more; as we run more iterations we slowly decrease epsilon towards 0 (which is exploitation → choosing the best known action). Note that the code below keeps epsilon fixed at 0.1 rather than decaying it.

Epsilon is always between 0 and 1.

In [14]:
import numpy as np
import sys
from collections import defaultdict


# get_epision_greedy_action_policy(Q,observation):
 
 Args:
 
        Q: A dictionary that maps from state -> action-values.
            Each value is a numpy array of length nA (see below).
            
        observation: The state for which the action probabilities are computed.
            
 Module-level settings used by the function:
 
        epsilon: The probability of selecting a random action. Float between 0 and 1.
            
        nA: Number of actions in the environment.
            
 Returns:
        
        The probabilities of each action for the given observation,
        in the form of a numpy array of length nA.

In [16]:
# Exploration rate: probability of sampling a uniformly random action.
epsilon = 0.1
# Number of discrete actions the policy chooses between.
nA = 2

In [17]:
def get_epision_greedy_action_policy(Q, observation, eps=None, n_actions=None):
    """Return the epsilon-greedy action probabilities for one observation.

    Args:
        Q: Mapping from state to a numpy array of action values.
        observation: State whose action distribution is wanted; used as the
            key into ``Q``.
        eps: Probability of choosing a uniformly random action. When omitted,
            the module-level ``epsilon`` is used.
        n_actions: Number of available actions. When omitted, the
            module-level ``nA`` is used.

    Returns:
        A numpy float array of length ``n_actions``: every action receives
        ``eps / n_actions`` and the greedy action additionally receives
        ``1 - eps``.
    """
    # Fall back to the notebook-wide settings so existing two-argument
    # callers keep their behaviour.
    if eps is None:
        eps = epsilon
    if n_actions is None:
        n_actions = nA

    # Spread the exploration mass evenly over all actions.
    action_probs = np.full(n_actions, eps / n_actions, dtype=float)

    # The greedy action is the one with the highest current value estimate.
    best_action = np.argmax(Q[observation])

    # Give the remaining (1 - eps) probability mass to the greedy action.
    action_probs[best_action] += 1.0 - eps

    return action_probs

# mc_control_epsilon_greedy(total_episodes):
        
      
    Finds an optimal epsilon-greedy policy using Monte Carlo control.
    
    Args:
        
        total_episodes: Number of episodes to sample.
    
    Module-level settings used:
    
        epsilon: Chance to sample a random action. Float between 0 and 1.
    
    Returns:
        Q, a dictionary mapping state -> action values.
        The epsilon-greedy policy is obtained from Q by passing it, together with an observation,
        to get_epision_greedy_action_policy, which returns the action probabilities.

In [18]:
def mc_control_epsilon_greedy(total_episodes):
    """Estimate action values with on-policy first-visit Monte Carlo control.

    Each episode is generated by ``generate_episode(Q)`` using the current
    estimates, and ``Q`` is then updated with the average of the undiscounted
    returns observed after the first visit to each (state, action) pair.

    Args:
        total_episodes: Number of episodes to sample.

    Returns:
        Q, a ``defaultdict`` mapping state -> numpy array of action values
        (one entry per action, ``env.action_space.n`` entries in total).
    """
    # Sum of the first-visit returns seen so far for each (state, action) pair.
    returns_sum = defaultdict(float)

    # Number of episodes in which each (state, action) pair was visited.
    states_count = defaultdict(float)

    # Action-value function to be returned; unseen states start at all zeros.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    for _ in range(total_episodes):

        episode = generate_episode(Q)

        # Walk the episode backwards while accumulating the return. A pair
        # that occurs several times is overwritten by its earlier occurrences,
        # so each entry ends up holding the return that follows the FIRST
        # visit to that pair.
        running_return = 0.0
        first_visit_return = {}
        for state, action, reward in reversed(episode):
            running_return += reward
            first_visit_return[(state, action)] = running_return

        for sa_pair, G in first_visit_return.items():
            state, action = sa_pair

            # Incrementally maintain the mean return for this pair.
            returns_sum[sa_pair] += G
            states_count[sa_pair] += 1.0
            Q[state][action] = returns_sum[sa_pair] / states_count[sa_pair]

    return Q

In [19]:
#This generates the episode by following the epsilon greedy policy
def generate_episode(Q):
    """Roll out one episode in ``env`` under the epsilon-greedy policy from Q.

    Args:
        Q: Mapping from state to action values, passed through to
            ``get_epision_greedy_action_policy``.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken,
        ending with the step on which the environment reported ``done``.
    """
    trajectory = []
    state = env.reset()
    done = False

    while not done:
        # Action probabilities for the current state under the current Q.
        action_probs = get_epision_greedy_action_policy(Q, state)

        # Sample an action index according to those probabilities.
        action_ids = np.arange(len(action_probs))
        action = np.random.choice(action_ids, p=action_probs)

        # Apply the action and record the transition from the state we were in.
        successor, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward))

        # Move on; when done is set the loop ends and this value is unused.
        state = successor

    return trajectory

The output maps each state to an array containing the action-value estimate for every action in that state (here we have only two actions, 0 and 1).

In [20]:
# Run MC control for 50,000 episodes; as the cell's final expression, the
# returned Q dictionary is shown as the cell output.
mc_control_epsilon_greedy(50000)

defaultdict(<function __main__.mc_control_epsilon_greedy.<locals>.<lambda>()>,
            {(15, 2, False): array([-0.28333333, -0.52      ]),
             (15, 10, True): array([-0.46236559, -0.28571429]),
             (13, 5, False): array([-0.12348668, -0.16666667]),
             (21, 10, False): array([ 0.8861912, -1.       ]),
             (15, 10, False): array([-0.58308867, -0.61904762]),
             (16, 10, False): array([-0.58318099, -0.6372549 ]),
             (20, 3, False): array([ 0.6612529, -1.       ]),
             (15, 9, False): array([-0.5862069 , -0.50802139]),
             (13, 10, True): array([-1.        , -0.26973684]),
             (13, 4, False): array([-0.18210863, -0.22857143]),
             (14, 10, False): array([-0.61538462, -0.52352591]),
             (12, 6, False): array([-0.11638955, -0.28      ]),
             (15, 7, False): array([-0.34134615, -0.59090909]),
             (20, 9, False): array([ 0.78063241, -0.85185185]),
             (15, 3, Fals

The action-value function tells us how good it is to take a given action in a given state.