In [None]:
import numpy as np

from scipy import linalg

In [None]:
# Number of states in the MDP.
n_states = 6

# Policy-induced transition matrix: entry [s, s'] will hold the probability of
# moving from state s to state s' under the policy. It starts out all zeros.
P_pi = np.zeros(shape=(n_states, n_states))
P_pi




array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
# Reward matrix with the same shape as P_pi, initialised to zero; entry
# [s, s'] will hold the reward attached to the transition s -> s'.
R = np.zeros(P_pi.shape, dtype=P_pi.dtype)
R

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

### Create the transition matrix by considering a random policy:

In [None]:
# Each of states 0-4 moves to one of two successor states, with probability
# 0.5 each; the assignments are grouped per source state (one row per line).
P_pi[0, [1, 3]] = 0.5  # state 0 -> state 1 or state 3
P_pi[1, [2, 5]] = 0.5  # state 1 -> state 2 or state 5
P_pi[2, [4, 5]] = 0.5  # state 2 -> state 4 or state 5
P_pi[3, [0, 3]] = 0.5  # state 3 -> state 0 or stays in state 3
P_pi[4, [0, 5]] = 0.5  # state 4 -> state 0 or state 5

# State 5 only ever transitions to itself.
P_pi[5, 5] = 1


In [None]:
# Display the transition matrix after its entries have been filled in.
P_pi

array([[0. , 0.5, 0. , 0.5, 0. , 0. ],
       [0. , 0. , 0.5, 0. , 0. , 0.5],
       [0. , 0. , 0. , 0. , 0.5, 0.5],
       [0.5, 0. , 0. , 0.5, 0. , 0. ],
       [0.5, 0. , 0. , 0. , 0. , 0.5],
       [0. , 0. , 0. , 0. , 0. , 1. ]])

#### Create the reward matrix

In [None]:
# Reward attached to each transition s -> s', filled in one source state (row)
# at a time. Row 5 is not assigned here.
R[0, [1, 3]] = [-2, -1]   # state 0 -> state 1: -2, state 0 -> state 3: -1
R[1, [2, 5]] = [-2, 0]    # state 1 -> state 2: -2, state 1 -> state 5: 0
R[2, [4, 5]] = [15, 10]   # state 2 -> state 4: 15, state 2 -> state 5: 10
R[3, [0, 3]] = [-3, -1]   # state 3 -> state 0: -3, state 3 -> state 3: -1
R[4, [0, 5]] = [-10, 10]  # state 4 -> state 0: -10, state 4 -> state 5: 10

In [None]:
# Display the reward matrix after its entries have been filled in.
R

array([[  0.,  -2.,   0.,  -1.,   0.,   0.],
       [  0.,   0.,  -2.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,  15.,  10.],
       [ -3.,   0.,   0.,  -1.,   0.,   0.],
       [-10.,   0.,   0.,   0.,   0.,  10.],
       [  0.,   0.,   0.,   0.,   0.,   0.]])

### Being a probability (row-stochastic) matrix, each row of P_pi must sum to 1:

In [None]:
# Check the correctness of P_pi: every row holds the probabilities of the next
# state given the current one, so each row must sum to 1.
# np.allclose is used rather than an exact `== 1` comparison so that rounding
# error in the floating-point row sums (e.g. with probabilities such as 1/3)
# cannot make a valid matrix fail the check.

assert np.allclose(P_pi.sum(axis=1), 1.0), "each row of P_pi must sum to 1"

In [None]:
# Expected immediate reward of each state: weight every transition reward by
# its probability and sum over the destination states. keepdims=True keeps the
# result as a column of shape (n_states, 1).
R_expected = np.multiply(P_pi, R).sum(axis=1, keepdims=True)
R_expected

array([[-1.5],
       [-1. ],
       [12.5],
       [-2. ],
       [ 0. ],
       [ 0. ]])

The R_expected vector contains the expected immediate reward for each state.


In [None]:
# Solve the Bellman equation in matrix form,
#     (I - gamma * P_pi) V = R_expected,
# as a linear system A V = B with scipy's dense solver.
gamma = 0.9  # discount factor

A = np.identity(n_states) - gamma * P_pi
B = R_expected

V = linalg.solve(a=A, b=B)
V

array([[-1.78587056],
       [ 4.46226255],
       [12.13836121],
       [-5.09753046],
       [-0.80364175],
       [ 0.        ]])

This is the vector of the state values. State 0 has a value of about -1.79, state 1 has a value of about 4.46, and so on.

Let's examine how the results change with $\gamma = 0$, which is the condition assumed for a myopic random student:

In [None]:
# Same linear system as before, now with a zero discount factor: the
# coefficient matrix reduces to the identity, so only immediate rewards count.
gamma = 0.0

A = np.identity(n_states) - gamma * P_pi
B = R_expected

V_gamma_zero = linalg.solve(a=A, b=B)
V_gamma_zero

array([[-1.5],
       [-1. ],
       [12.5],
       [-2. ],
       [ 0. ],
       [ 0. ]])

As you can see, using $\gamma = 0$, the value of each state is exactly equal to the expected immediate reward according to the policy.

Now we can calculate the action-value function. We need to use a different form of immediate reward using a matrix with a shape of $(|S| \cdot |A|, 1)$. Each row corresponds to a state-action pair, and the value is the immediate reward for that pair:

In [None]:
# Column of immediate rewards with two rows per state (one per action),
# initialised to zero.
R_sa = np.zeros(shape=(2 * n_states, 1))
R_sa



array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [None]:
# Immediate reward of every (state, action) pair, two consecutive rows per
# state. Only the first ten rows are written; rows 10-11 are left untouched.
R_sa[:10, 0] = [
    -2,   # study in state 0
    -1,   # social in state 0
    -2,   # study in state 1
    0,    # sleep in state 1
    10,   # sleep in state 2
    15,   # beer in state 2
    -1,   # social in state 3 (social)
    -3,   # study in state 3 (social)
    10,   # sleep in state 4 (pub)
    -10,  # study in state 4 (pub)
]

In [None]:
# Confirm the shape of the state-action reward column.
R_sa.shape

(12, 1)

We now have to define the transition matrix of the student MDP. The transition matrix contains the probability of landing in a given state, starting from a state and an action. In the rows, we have the source state and action, and in the columns, we have the landing state:

In [None]:
# Transition matrix of shape (states x actions, states): each row is a
# (state, action) pair, each column a landing state. Starts as all zeros.
P = np.zeros(shape=(2 * n_states, n_states))
P

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
# Every listed (state, action) pair leads to exactly one landing state, so
# row i gets a single 1 in the column given by the i-th entry of the list.
# Only rows 0-9 are written; rows 10-11 are left untouched.
P[
    np.arange(10),
    [
        1,  # study in state 0  -> state 1
        3,  # social in state 0 -> state 3
        2,  # study in state 1  -> state 2
        5,  # sleep in state 1  -> state 5 (bed)
        5,  # sleep in state 2  -> state 5 (bed)
        4,  # beer in state 2   -> state 4 (pub)
        3,  # social in state 3 -> state 3 (social)
        0,  # study in state 3  -> state 0 (class 1)
        5,  # sleep in state 4  -> state 5 (bed)
        0,  # study in state 4  -> state 0 (class 1)
    ],
] = 1



We can now calculate the action-value function using gamma=0.9

In [None]:
# Action values: the immediate reward of each (state, action) pair plus the
# discounted value of the state it lands in.
gamma = 0.9  # restore the discount factor (the gamma = 0 cell overwrote it)

Q_sa_pi = R_sa + np.matmul(gamma * P, V)
Q_sa_pi

array([[  2.01603629],
       [ -5.58777741],
       [  8.92452509],
       [  0.        ],
       [ 10.        ],
       [ 14.27672242],
       [ -5.58777741],
       [ -4.60728351],
       [ 10.        ],
       [-11.60728351],
       [  0.        ],
       [  0.        ]])

Q_sa_pi is the action-value vector shown above: for each state-action pair, it holds the value of taking that action in that state.

---



---



We are now interested in extracting the best action for each state:

In [None]:
"""

reshape the column so that we obtain a matrix with shape (n_states, n_actions)

"""

# Two actions per state: fold the column of action values into one row per
# state and one column per action.
n_actions = 2

Q_sa_pi2 = Q_sa_pi.reshape(-1, n_actions)
Q_sa_pi2

array([[  2.01603629,  -5.58777741],
       [  8.92452509,   0.        ],
       [ 10.        ,  14.27672242],
       [ -5.58777741,  -4.60728351],
       [ 10.        , -11.60728351],
       [  0.        ,   0.        ]])

In [None]:
# Taking the argmax along the action axis gives the index of the best action
# in each state; the result is reshaped into a column with one row per state.
best_actions = Q_sa_pi2.argmax(axis=-1).reshape(-1, 1)
best_actions

array([[0],
       [0],
       [1],
       [1],
       [0],
       [0]])

The best_actions vector shown above holds, for each state, the index (0 or 1) of the action with the highest action value.