#### Sutton and Barto, Reinforcement Learning, 2nd Edition, page 120.
![Sutton and Barto, Reinforcement Learning 2nd. Edition.](./Figures/TD0Prediction.png)

Tabular TD(0) for estimating $v_\pi$

In [1]:
from rlgridworld.standard_grid import create_standard_grid
import numpy as np

Code to play one episode of the game and return the states visited, each paired with the reward received for the move into that state.

In [2]:
def play_game(gw, policy, epsilon, start_state=(0, 0), include_start=False):
    """Play one episode on the grid world and record the trajectory.

    At every step the policy's action for the current state is looked up and
    passed through ``random_action``, which may replace it with a randomly
    chosen valid action. The episode stops as soon as the agent has moved
    into a state for which ``gw.is_terminal`` is true.

    Parameters
    ----------
    gw : grid-world object
        Must provide ``valid_decisions(state)``,
        ``get_reward_for_action(state, action)`` and ``is_terminal(state)``.
    policy : dict
        Maps a state ``(i, j)`` to an action string.
    epsilon : float
        Forwarded to ``random_action``; probability of choosing randomly
        among the valid actions instead of following ``policy``.
    start_state : tuple, optional
        State in which the episode begins. Defaults to ``(0, 0)``, the
        value that was previously hard-coded.
    include_start : bool, optional
        When true, the returned list begins with ``(start_state, 0.0)`` so
        that an update sweep over consecutive pairs also covers the first
        move of the episode. The default ``False`` keeps the original
        return value, which begins with the first state *reached*.

    Returns
    -------
    list of tuple
        ``(state, reward)`` pairs in visiting order, where ``reward`` is the
        reward of the move that led into ``state``. The last entry is the
        terminal state.
    """
    # The 0.0 paired with the start state is a placeholder: no move led there.
    states_and_rewards = [(start_state, 0.0)] if include_start else []
    state = start_state
    while True:
        # Policy action, possibly overridden by a random valid action.
        greedy_action = policy[state]
        valid_actions = gw.valid_decisions(state)
        action = random_action(greedy_action, valid_actions, epsilon)
        # The reward belongs to the (state, action) pair being left.
        reward = gw.get_reward_for_action(state, action)
        state = move(state, action)
        states_and_rewards.append((state, reward))
        # At least one move is always made; stop once a terminal state is entered.
        if gw.is_terminal(state):
            return states_and_rewards

def move(state, action):
    """Return the grid cell reached by taking ``action`` from ``state``.

    ``state`` is an ``(i, j)`` pair. 'left'/'right' change ``j`` by -1/+1 and
    'down'/'up' change ``i`` by -1/+1. Any other action leaves the state
    unchanged. No bounds checking is done here; callers are expected to pass
    only actions that are valid in ``state``.
    """
    # (delta_i, delta_j) for each recognised action.
    step_by_action = {
        'left': (0, -1),
        'right': (0, 1),
        'down': (-1, 0),
        'up': (1, 0),
    }
    row, col = state
    row_step, col_step = step_by_action.get(action, (0, 0))
    return (row + row_step, col + col_step)

def random_action(action, all_actions, epsilon):
    """Return ``action`` or, with probability ``epsilon``, a random valid one.

    A uniform sample from [0, 1) is drawn first. If it is below
    ``1 - epsilon`` the supplied ``action`` is returned unchanged; otherwise
    one element of ``all_actions`` is picked with ``np.random.choice`` (which
    may happen to be ``action`` itself).
    """
    # Draw unconditionally so the global random stream advances on every call.
    keep_given_action = np.random.random_sample() < (1 - epsilon)
    return action if keep_given_action else np.random.choice(all_actions)

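Create the standard grid and the policy to evaluate. With probability epsilon = 0.1 the agent ignores the policy's action and instead picks one of the valid actions at random.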

In [3]:
# Seed NumPy's global generator. random_action draws from np.random, so
# without a seed every Restart & Run All yields different episodes and values.
seed = 42
np.random.seed(seed)

gw = create_standard_grid()
# Policy to evaluate: maps each grid cell (i, j) to an action name.
# Cells mapped to '' have no action (presumably wall/terminal cells; confirm
# against create_standard_grid).
policy = { 
        (0,0):'up', (0,1):'right', (0,2):'right', (0,3):'up', 
        (1,0):'up', (1,1):'', (1,2):'right', (1,3):'', 
        (2,0):'right', (2,1):'right', (2,2):'right', (2,3):'' 
        }
gamma = 0.9 # discount factor for future rewards
alpha = 0.1 # step size: fraction of the TD error added to a value on each update
epsilon = 0.1 # probability of picking a random valid action instead of the policy's
number_play_game = 1000 # number of episodes played in the training loop

## 1 - Play one episode of the game using the input policy

In [4]:
# Print the policy as a grid of action names, one cell per state.
gw.print_policy(policy)

-------------------------------------
|  Right |  Right |  Right |        |
-------------------------------------
|     Up |        |  Right |        |
-------------------------------------
|     Up |  Right |  Right |     Up |
-------------------------------------


In [5]:
# Play a single episode; the result is a list of (state, reward) pairs, one per move.
states_and_rewards = play_game(gw, policy, epsilon)

Examine the state path and rewards 

In [6]:
# Bare expression on the last line so the notebook displays the episode.
states_and_rewards # see the state path and rewards

[((1, 0), 0.0), ((2, 0), 0.0), ((2, 1), 0.0), ((2, 2), 0.0), ((2, 3), 1.0)]

Update values using TD(0)

In [7]:
# One TD(0) sweep over the episode: walk consecutive (state, next state) pairs
# and nudge V(state) towards reward + gamma * V(next state).
# NOTE(review): the list starts with the first state reached, not the start
# state (0,0), so the very first move of the episode produces no update.
for (state, _), (stateprime, reward) in zip(states_and_rewards, states_and_rewards[1:]):
    value_now = gw.get_value(state)
    value_next = gw.get_value(stateprime)
    # reward is the one received on entering stateprime.
    td_error = reward + gamma*value_next - value_now
    gw.set_value(state, value_now + alpha*td_error)

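Note that the value of state (2,2) is updated to 0.1. All values start at 0, so the only non-zero TD(0) update in this episode comes from the final move into the terminal state (2,3) with reward 1: $V(2,2) \leftarrow 0 + \alpha\,(1 + \gamma \cdot 0 - 0) = 0.1$, because $\alpha = 0.1$.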

In [8]:
# Print the value table after the first TD(0) sweep.
gw.print_values()

-------------------------------------
|   0.00 |   0.00 |   0.10 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------


Iteration Two

In [9]:
# Play a second episode on the same grid; values learned so far are kept in gw.
states_and_rewards = play_game(gw, policy, epsilon)

In [10]:
# Display the (state, reward) pairs of the second episode.
states_and_rewards

[((1, 0), 0.0), ((2, 0), 0.0), ((2, 1), 0.0), ((2, 2), 0.0), ((2, 3), 1.0)]

Update values

In [11]:
# Second TD(0) sweep, starting from the values left by the first episode.
# Pairs are processed in visiting order, so each update uses the successor's
# value as it was before the successor's own update in this sweep.
for (state, _), (stateprime, reward) in zip(states_and_rewards, states_and_rewards[1:]):
    value_now = gw.get_value(state)
    value_next = gw.get_value(stateprime)
    td_error = reward + gamma*value_next - value_now
    gw.set_value(state, value_now + alpha*td_error)

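These are the values after the second episode. State (2,2) moves further towards 1: $V(2,2) \leftarrow 0.1 + 0.1\,(1 + 0.9 \cdot 0 - 0.1) = 0.19$. The value also starts to propagate backwards along the path: $V(2,1) \leftarrow 0 + 0.1\,(0 + 0.9 \cdot 0.1 - 0) = 0.009$, which is displayed as 0.01.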

In [12]:
# Print the value table after the second TD(0) sweep.
gw.print_values()

-------------------------------------
|   0.00 |   0.01 |   0.19 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------


Play the game 1000 times

In [13]:
# Repeat "play an episode, then apply one TD(0) sweep" number_play_game times.
# NOTE(review): play_game's list starts with the first state reached, so the
# start state (0,0) is only updated in episodes that happen to revisit it.
for _episode in range(number_play_game):
    states_and_rewards = play_game(gw, policy, epsilon)
    for (state, _), (stateprime, reward) in zip(states_and_rewards, states_and_rewards[1:]):
        value_now = gw.get_value(state)
        value_next = gw.get_value(stateprime)
        # TD error: bootstrapped target minus the current estimate.
        td_error = reward + gamma*value_next - value_now
        gw.set_value(state, value_now + alpha*td_error)

Below the policy is printed again. The values estimated by TD(0) are then printed. 

In [14]:
# Show the evaluated policy first, then a blank line as a separator.
print("Policy")
gw.print_policy(policy)
print("")
# Print the state values estimated by TD(0) after all episodes.
print("Values from TD(0) Learning")
gw.print_values()

Policy
-------------------------------------
|  Right |  Right |  Right |        |
-------------------------------------
|     Up |        |  Right |        |
-------------------------------------
|     Up |  Right |  Right |     Up |
-------------------------------------

Values from TD(0) Learning
-------------------------------------
|   0.74 |   0.83 |   0.97 |   0.00 |
-------------------------------------
|   0.64 |   0.00 |  -0.92 |   0.00 |
-------------------------------------
|   0.42 |  -0.55 |  -0.79 |  -0.98 |
-------------------------------------
