## Lecture 17 - Reinforcement Learning 1

### 7. Value Iteration

#### Another Example of Value Iteration (Software Implementation)

In [6]:
import numpy as np
from nptyping import NDArray, Float, Shape

# Probability of staying in the current state.
# Since the agent can "begrudgingly" move to other states even when deciding not
# to move, there are probabilities to transition to other states starting from each step.
# The probability in each space consists of the product of the probability for "staying" 
# multiplied by the probability of "involuntarily" moving to a state S'
T_stay : NDArray[Shape["5, 5"], Float]  = np.array(
    [ #     S1         S2         S3         S4          S5
        [0.5*(1/2), 0.5*(1/4),    0.0   ,   0.0    ,    0.0   ],   # S1'
        [0.5*(1/2), 0.5*(3/4), 0.5*(1/4),   0.0    ,    0.0   ],   # S2'
        [   0.0   , 0.5*(1/4), 0.5*(3/4), 0.5*(1/4),    0.0   ],   # S3'
        [   0.0   ,    0.0   , 0.5*(1/4), 0.5*(3/4), 0.5*(1/2)],   # S4'
        [   0.0   ,    0.0   ,    0.0   , 0.5*(1/4), 0.5*(1/2)],   # S5'
    ]
)

# Probability of moving for each state
# The probability in each space consists of the product of the probability for "moving"
# multiplied by the probability of 
T_move : NDArray[Shape["5, 5"], Float] = np.array(
    [ #               S1                 S2         S3         S4                   S5
        [0.5*(2/3) + 0.5*(1/3)*(1/2), 0.5*(1/3),    0.0   ,    0.0   ,             0.0            ],   # S1'
        [0.5*(1/3) + 0.5*(2/3)*(1/2), 0.5*(2/3), 0.5*(1/3),    0.0   ,             0.0            ],   # S2'
        [           0.0             , 0.5*(1/3), 0.5*(2/3), 0.5*(1/3),             0.0            ],   # S3'
        [           0.0             ,   0.0    , 0.5*(1/3), 0.5*(2/3),       0.5*(1/3)*(1/2)      ],   # S4'
        [           0.0             ,   0.0    ,    0.0   , 0.5*(1/3), 0.5*(1/3)*(1/2) + 0.5*(2/3)],   # S5'
    ]
)

# Total probability (Sum of the probability to move and stay)
T : NDArray[Shape["5, 5"], Float] = T_move + T_stay

# Discount for the rewards
gamma : float = 0.5

# Total number of iterations
total_iterations : int = 100

# Reward for moving from one state to another
Rs : NDArray[Shape["1, 5"], Float] = np.array(
    [ # S1 S2 S3 S4 S5
        [0, 0, 0, 0, 0], # S1'
        [0, 0, 0, 0, 0], # S2'
        [0, 0, 0, 0, 0], # S3'
        [0, 0, 0, 0, 0], # S4'
        [1, 1, 1, 1, 1], # S5'
    ]
)

# State values
Vs : NDArray[Shape["1, 5"], Float] = np.array([[0, 0, 0, 0, 0]])

# Value Iteration Algorithm
for i in range(total_iterations):

    # Here we dont explicitly include just the values of "Vs" that correspond
    # to only the possible neighbors of state S. Here, all are included, however, 
    # the transition probabilities (T) for unreachable states from state S will be 
    # zero, so the algorithm will already be taking this into account
    Vs = np.sum(T * (Rs + gamma * Vs), axis = 0)

    print(i, Vs)

print("Final State Values:", Vs)

0 [0.         0.         0.         0.29166667 0.66666667]
1 [0.         0.         0.         0.48003472 1.        ]
2 [0.         0.         0.         0.60168909 1.16666667]
3 [0.         0.         0.         0.68025754 1.25      ]
4 [0.         0.         0.         0.73099966 1.29166667]
5 [0.         0.         0.         0.76377061 1.3125    ]
6 [0.         0.         0.         0.78493519 1.32291667]
7 [0.         0.         0.         0.79860398 1.328125  ]
8 [0.         0.         0.         0.80743173 1.33072917]
9 [0.         0.         0.         0.813133   1.33203125]
10 [0.         0.         0.         0.81681506 1.33268229]
11 [0.         0.         0.         0.81919306 1.33300781]
12 [0.         0.         0.         0.82072885 1.33317057]
13 [0.         0.         0.         0.82172072 1.33325195]
14 [0.         0.         0.         0.8223613  1.33329264]
15 [0.         0.         0.         0.822775   1.33331299]
16 [0.         0.         0.         0.82304219 1.