In [100]:
import numpy as np

# parameters
S = 100  # number of independent training runs (outer loop of the training cell)
T = 80  # steps per run; also the length of the reward schedule built by reward_prob
alpha = 0.4  # learning rate: scales the (transformed) TD error inside update()
gamma = 0.8  # discount applied to the next state-action value inside update()
epsilon = 0.4  # probability of taking a random action in choose_action()
k_plus = 0.0  # asymmetry factor k handed to piecewise() for the 'POS' table
k_minus = 0.0  # asymmetry factor k handed to piecewise() for the 'NEG' table

# Q-tables will be of size 3 by 2
# 3 states each having 2 actions
def choose_action(Q_plus, Q_minus, eps, curr_state):
    """Pick an action for ``curr_state`` with an epsilon-greedy rule.

    The greedy score of each action is the mean of the two tables' values
    plus a random fraction (uniform in [-0.5, 0.5)) of their difference.

    Args:
        Q_plus: 2-D table (array or nested list) indexed [state, action].
        Q_minus: table of the same layout as ``Q_plus``.
        eps: probability of ignoring the scores and acting at random.
        curr_state: row index of the state to act in.

    Returns:
        The index of the chosen action.
    """
    plus_row = np.asarray(Q_plus)[curr_state, :]
    minus_row = np.asarray(Q_minus)[curr_state, :]

    summ = (plus_row + minus_row) / 2
    diff = plus_row - minus_row
    # A single scalar weight is drawn and shared by every action of the state.
    U = summ + np.random.uniform(-0.5, 0.5) * diff

    if np.random.uniform(0, 1) < eps:
        # Explore uniformly over every action the table holds instead of
        # assuming exactly two; for a two-action table this is randint(0, 2).
        return np.random.randint(0, plus_row.shape[0])
    return np.argmax(U)

# updates the Q-values
def update(curr_action, curr_state, Q, next_action, next_state, r, k, Q_type):
    """Apply one SARSA-style update to ``Q[curr_state, curr_action]`` in place.

    The TD error ``r + gamma * Q[next_state, next_action] - Q[curr_state,
    curr_action]`` is reshaped by ``piecewise(error, k, Q_type)`` and then
    added to the current entry after scaling by the module-level ``alpha``.
    ``gamma`` is likewise read from module scope. Nothing is returned.
    """
    old_value = Q[curr_state, curr_action]
    td_target = r + gamma * Q[next_state, next_action]
    td_error = td_target - old_value
    Q[curr_state, curr_action] = old_value + alpha * piecewise(td_error, k, Q_type)
    
        
# for simplicity, Q_type is a string character indicating which Q we are updating
# Q_type = 'POS' or 'NEG'
def piecewise(TD_error, k, Q_type):
    """Scale a TD error asymmetrically depending on its sign and the table.

    For ``Q_type == 'POS'`` non-negative errors are multiplied by ``1 + k``
    and negative ones by ``1 - k``. For any other ``Q_type`` the two factors
    are swapped.
    """
    is_pos_table = Q_type == 'POS'
    non_negative = TD_error >= 0
    # The (1 + k) factor applies when table type and error sign "agree".
    if is_pos_table == non_negative:
        factor = 1 + k
    else:
        factor = 1 - k
    return factor * TD_error

# helper function for the evolving reward probabilities
def reward_prob(T):
    """Build a drifting 2 x 2 table over ``T`` time steps.

    Slice 0 is ``[[0.75, 0.25], [0.25, 0.75]]``. Every later slice is the
    previous one plus independent normal noise (mean 0, scale 0.025); an
    entry that lands at or above 1 is mirrored back around 1, and one that
    lands at or below 0 is mirrored around 0.

    Returns:
        Array of shape ``(2, 2, T)``.
    """
    table = np.zeros((2, 2, T))
    table[:, :, 0] = [[0.75, 0.25],
                      [0.25, 0.75]]
    for t in range(1, T):
        drifted = table[:, :, t - 1] + np.random.normal(0, 0.025, [2, 2])
        # Reflect once at the upper bound, otherwise once at the lower bound.
        reflected_low = np.where(drifted <= 0, -drifted, drifted)
        table[:, :, t] = np.where(drifted >= 1, 1 - (drifted - 1), reflected_low)
    return table

# function to encode the MDP structure - transition and reward
# return reward, next_state
def step(curr_state, curr_action, q, t):
    """Advance the two-stage task by one move.

    From state 0 the reward is 0 and the next state is drawn at random:
    action 0 leads to state 1 with probability 0.7 (state 2 otherwise),
    any other action leads to state 1 with probability 0.3.

    From any other state the next state is 0 and the reward is the entry
    ``q[curr_state - 1, curr_action, t]`` itself (the value is returned
    directly, not sampled from).

    Returns:
        ``(reward, next_state)``.
    """
    if curr_state != 0:
        # Second stage: collect the scheduled value and go back to the start.
        return q[curr_state - 1, curr_action, t], 0

    chance_of_state_1 = 0.7 if curr_action == 0 else 0.3
    landed_in = 1 if np.random.uniform(0, 1) < chance_of_state_1 else 2
    return 0, landed_in

# NOTE(review): this Q is not referenced by the functions above or by the
# training loop below (which builds its own Q_plus / Q_minus) -- it looks like
# a leftover; confirm nothing else uses it before removing.
Q = [[0,1],
    [-1,1],
    [0,10]]
# Drifting reward table of length T, sampled once and reused by every run.
q = reward_prob(T)


In [101]:
# training starts here
# Performs S runs of T steps each, learning two tables (Q_plus, Q_minus)
# with the same SARSA transitions but different piecewise() scaling.
# NOTE(review): both tables are re-created at the top of every run, so nothing
# is carried over between runs and the print after the loop only shows the
# tables of the final run -- confirm this is intended.
for s in range(S):
    Q_plus = np.zeros((3,2))
    Q_minus = np.zeros((3,2))
    # every run starts in state 0 with a freshly chosen action
    curr_state = 0
    curr_action = choose_action(Q_plus, Q_minus, epsilon, curr_state)
    for t in range(T):
        # as per the SARSA-rule s,a,r,s',a'
        reward, next_state = step(curr_state,curr_action, q, t)
        next_action = choose_action(Q_plus, Q_minus, epsilon, next_state)
        
        # update the Q-values
        # (with k_plus == k_minus == 0 both calls apply the same plain TD update)
        
        update(curr_action, curr_state, Q_plus, next_action, next_state, reward, k_plus, "POS")
        update(curr_action, curr_state, Q_minus, next_action, next_state, reward, k_minus, 'NEG')
        
        # update action, states
        curr_action = next_action
        curr_state  = next_state
# report the tables left over from the last run
print(Q_plus,"and", Q_minus)
        

[[1.23655378 0.92285785]
 [1.64860478 0.6023763 ]
 [0.57382482 1.5894643 ]] and [[1.23655378 0.92285785]
 [1.64860478 0.6023763 ]
 [0.57382482 1.5894643 ]]
