In [1]:
import numpy as np
import copy

## Reward definition

In [2]:
reward = np.zeros((4, 4, 2))
reward[0, 1, 0] = 1.0
reward[0, 2, 0] = 2.0
reward[0, 0, 1] = 0.0
reward[1, 0, 0] = 1.0
reward[1, 2, 0] = 2.0
reward[1, 1, 1] = 1.0
reward[2, 0, 0] = 1.0
reward[2, 1, 0] = 0.0
reward[2, 2, 1] = -1.0

### MDP setting
* state: index of visiting count to each place
  * 0: Home, 1: Office, 2: Bar 3: End State
* MDP array describes the probabilities for move to next state if choiced moving
* if count of Home = $n$, transit to End State

In [3]:
transition_prob = [0.8, 0.5, 1.0, 0.0]

In [4]:
end_state = 3


def get_next_state(current_place, current_count, next_prob_arr, unif, end_count):
    next_prob = next_prob_arr[current_place]
    
    if sum(current_count) == end_count:
        next_place = end_state
        current_count[end_state] += 1
    elif unif <= next_prob:
        next_place = (current_place + 1) % 3
        current_count[next_place] += 1
    else:
        next_place = (current_place + 2) % 3
        current_count[next_place] += 1

    return next_place, current_count

In [5]:
current_count = [0, 0, 0, 0]
get_next_state(0, current_count, transition_prob, 0.5, 5)

(1, [0, 1, 0, 0])

* policy definition ... move: 0, stay: 1

In [6]:
move = 0
stay = 1
def policy(state, p, unif):
    move_prob = p[state]
    if unif <= move_prob:
        return move
    else:
        return stay

In [7]:
pi = [0.5, 0.5, 0.5]

In [8]:
init_state = 0
end_count = 5
max_time_step = 1000

In [9]:
s = init_state
count = [0, 0, 0, 0]
a = 0
r = 0.0
state_hist = []
action_hist = []
reward_hist = []

In [10]:
for i in range(0, max_time_step):
    unif = np.random.uniform()
    a = policy(s, pi, unif)
    
    if a == move:
        unif = np.random.uniform()
        next_s, count = get_next_state(s, count, transition_prob, unif, end_count)
    else:
        next_s = s

    if next_s == end_state:
        break
        
    r = reward[s, next_s, a]
    
    state_hist.append(s)
    action_hist.append(a)
    reward_hist.append(r)
    
    print('total count:', sum(count), ' current state:', s, ' action:', a, ' next state:', next_s, ' reward:', r)
    print(count)
            
    s = next_s

total count: 1  current state: 0  action: 0  next state: 1  reward: 1.0
[0, 1, 0, 0]
total count: 2  current state: 1  action: 0  next state: 2  reward: 2.0
[0, 1, 1, 0]
total count: 3  current state: 2  action: 0  next state: 0  reward: 1.0
[1, 1, 1, 0]
total count: 3  current state: 0  action: 1  next state: 0  reward: 0.0
[1, 1, 1, 0]
total count: 4  current state: 0  action: 0  next state: 1  reward: 1.0
[1, 2, 1, 0]
total count: 4  current state: 1  action: 1  next state: 1  reward: 1.0
[1, 2, 1, 0]
total count: 4  current state: 1  action: 1  next state: 1  reward: 1.0
[1, 2, 1, 0]
total count: 4  current state: 1  action: 1  next state: 1  reward: 1.0
[1, 2, 1, 0]
total count: 4  current state: 1  action: 1  next state: 1  reward: 1.0
[1, 2, 1, 0]
total count: 4  current state: 1  action: 1  next state: 1  reward: 1.0
[1, 2, 1, 0]
total count: 4  current state: 1  action: 1  next state: 1  reward: 1.0
[1, 2, 1, 0]
total count: 5  current state: 1  action: 0  next state: 2  rewar

In [11]:
len(state_hist), state_hist

(13, [0, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2])

In [12]:
len(reward_hist), reward_hist

(13, [1.0, 2.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, -1.0])

In [13]:
def indicator(state, value):
    return state == value

In [14]:
def update(target_state, cummulative_reward, current_state, current_value, learning_rate):
    update_value = cummulative_reward - current_value
    return current_value + learning_rate * update_value * indicator(current_state, s)

In [15]:
discount = 0.9


def cumulative_reward_monte_carlo(state_hist, reward_hist):
    ret = 0.0
    for s, r in zip(state_hist[::-1], reward_hist[::-1]):
        ret = r + discount * ret
    return ret

In [16]:
cumulative_reward_monte_carlo(state_hist, reward_hist)

7.3780856947990001

In [23]:
r_hist = copy.copy(reward_hist)
s_hist = copy.copy(state_hist)
cum_func = cumulative_reward_monte_carlo

target_state = 0
learning_rate = 0.01

val = 0.0
while len(r_hist) > 0:
    cum = cum_func(s_hist, r_hist)
    current_state = s_hist[0]
    print('state:', current_state, ' cum_reward,', cum)
    val = update(target_state, cum, current_state, val, learning_rate)
    r_hist = r_hist[1:]
    s_hist = s_hist[1:]

state: 0  cum_reward, 7.3780856948
state: 1  cum_reward, 7.08676188311
state: 2  cum_reward, 5.6519576479
state: 0  cum_reward, 5.168841831
state: 0  cum_reward, 5.74315759
state: 1  cum_reward, 5.2701751
state: 1  cum_reward, 4.744639
state: 1  cum_reward, 4.16071
state: 1  cum_reward, 3.5119
state: 1  cum_reward, 2.791
state: 1  cum_reward, 1.99
state: 1  cum_reward, 1.1
state: 2  cum_reward, -1.0


In [18]:
val

0.04595438071421