In [44]:
import numpy as np
from scipy.special import softmax

def show(obj, title):
    """Print `obj` preceded by a blank line and a `title:` label.

    When the string form of `obj` is longer than 20 characters, the value
    is printed on its own line below the label; otherwise it follows the
    label on the same line.
    """
    label = title + ':'
    is_long = len(str(obj)) > 20
    if is_long:
        # Long values: break after the label so the value starts on a fresh line.
        print('\n', label + ' \n', obj)
    else:
        print('\n', label, obj)

# Problem dimensions for the toy maze.
n_states = 3  # imagine we have 3 tiles in the maze
states = [s for s in range(n_states)]
actions = ['up', 'down', 'left', 'right']
n_actions = len(actions)

# Zero-filled integer placeholders:
#   pi   -> one entry per action
#   Q_sa -> one row per state, one column per action
pi = np.zeros(n_actions, dtype=int)
Q_sa = np.zeros((n_states, n_actions), dtype=int)

# Randomly assign value functions: for every state, one uniformly drawn
# action gets value 1 and the remaining actions stay at 0.
for i in states:
    chosen_action = np.random.choice(n_actions)
    Q_sa[i, chosen_action] = 1

show(pi, 'randomly initialized pi')
show(Q_sa, 'randomly initialized Q(s,a)')


 randomly initialized pi: [0 0 0 0]

 randomly initialized Q(s,a): 
 [[1 0 0 0]
 [0 1 0 0]
 [0 1 0 0]]


In [45]:
""" 
    softmax policy ensures all outputs are between zero and one (and sum to one)
    by normalizing the candidates based on their exponential values.
    
    A higher temperature (tau) produces a softer probability distribution 
    over the choices, resulting in more exploration.
    
        softmax(x) = np.exp(x/tau) / sum(np.exp(x/tau))
"""

def _softmax_policy(q_values, temperature):
    """Return the softmax of `q_values / temperature`, taken along axis 1."""
    return softmax(q_values / temperature, axis=1)


# High temperature: probability mass is spread more evenly across actions.
tau = 2
pi_exploration = _softmax_policy(Q_sa, tau)
show(pi_exploration, 'softmax policy with high temperature {}'.format(tau))

# Low temperature: nearly all probability mass goes to the highest-valued action.
tau = 0.0001
pi_exploitation = _softmax_policy(Q_sa, tau)
show(pi_exploitation, 'softmax policy with low temperature {}'.format(tau))


 softmax policy with high temperature 2: 
 [[0.35466124 0.21511292 0.21511292 0.21511292]
 [0.21511292 0.35466124 0.21511292 0.21511292]
 [0.21511292 0.35466124 0.21511292 0.21511292]]

 softmax policy with low temperature 0.0001: 
 [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]]


In [46]:
"""
    greedy policy: 
        choose the action that maximizes the value function
        
    epsilon-greedy policy: 
        with probability epsilon, choose actions randomly 
        otherwise, choose according to policy
"""
pi = pi_exploitation
current_state = 0
epsilon = 0.3

# One uniform draw decides between exploring and following the policy.
explore = np.random.rand() <= epsilon
if not explore:
    # Follow the policy: sample an action index from the current state's row.
    prob = pi[current_state]
    index = np.random.choice(n_actions, p=prob)
    # Alternative (non-softmax policy): index = np.argmax(prob)
    a = actions[index]
else:
    # Explore: every action is equally likely.
    a = actions[np.random.choice(n_actions)]

show(a, 'action chosen using {}-greedy policy'.format(epsilon))


 action chosen using 0.3-greedy policy: left


In [49]:
"""
    
"""
# Greedy policy: always take the action with the highest Q-value in the
# current state.
# np.argmax returns the *position* of the largest Q-value, which is what is
# needed to index `actions`. np.max would return the Q-value itself, so the
# lookup would pick whichever action happens to sit at that numeric value.
# When several actions tie for the maximum, np.argmax picks the first one.
greedy_index = np.argmax(Q_sa[current_state, :])
a = actions[greedy_index]
show(a, 'action chosen using greedy policy')

1