In [1]:
import numpy as np

## DP solution

In [2]:
proba = 0.5 * np.array([
    [0, 0, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 0, 0, 0],
    [0, 1, 0, 1, 0, 0, 0],
    [0, 0, 1, 0, 1, 0, 0],
    [0, 0, 0, 1, 0, 1, 0],
    [0, 0, 0, 0, 1, 0, 1],
    [0, 0, 0, 0, 0, 0, 0],
])

rewards = np.array([0, 0, 0, 0, 0, 0.5, 0]).T

np.linalg.inv(np.eye(N=7) - proba) @ rewards

array([0.        , 0.16666667, 0.33333333, 0.5       , 0.66666667,
       0.83333333, 0.        ])

## Monte Carlo

In [3]:
n_trials = 1_000

state_rewards = {i: [] for i in range(7)}

gamma = 1.

for _ in range(n_trials):
    current_state = 3  # Starting state 
    step = 0
    first_visit = {current_state: 0}
    
    state_history = [current_state]
    reward_history = [0]
    
    while current_state not in (0, 6):
        step += 1
        next_state = current_state + np.random.choice([-1, 1])
        if not next_state in first_visit:
            first_visit[next_state] = step
        
        state_history.append(next_state)
        if next_state == 6:
            reward_history.append(1)
        else:
            reward_history.append(0)
        
        current_state = next_state 
        
    # Makes updates from the end of the episode
    total_reward = 0
    for i in range(step, -1, -1):
        total_reward = (reward_history[i] + gamma * total_reward)
        # Is this the first visit?
        if first_visit[state_history[i]] == i:
            state_rewards[state_history[i]].append(total_reward)

state_values_mc = {k: np.mean(v) for k, v in state_rewards.items()}
state_values_mc

{0: 0.0,
 1: 0.17792421746293247,
 2: 0.341688654353562,
 3: 0.501,
 4: 0.6779431664411367,
 5: 0.8549488054607508,
 6: 1.0}

## TD

In [4]:
state_values_td = {i: 0 for i in range(7)}

alpha = 0.1

for _ in range(n_trials):
    current_state = 3  # Starting state 
    
    while current_state not in (0, 6):
        next_state = current_state + np.random.choice([-1, 1])
        reward = 1 if next_state == 6 else 0
        
        state_values_td[current_state] += alpha * (reward + gamma * state_values_td[next_state] - state_values_td[current_state])
        current_state = next_state
        
state_values_td

{0: 0,
 1: 0.19474867055697664,
 2: 0.3474823346423158,
 3: 0.46470878605967475,
 4: 0.6839359721925832,
 5: 0.9176225189141505,
 6: 0}