# Dishwasher robot

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Default figure size as (width, height), in inches
plt.rcParams['figure.figsize'] = (12.9, 12)

# Print arrays with 2 decimals and without scientific notation for small values
np.set_printoptions(suppress=True, precision=2)

# Seaborn default styling with enlarged fonts
sns.set(font_scale=3.5)

First we define the transition and reward matrices, together with dictionaries that map action and state names to their integer indices (and back from indices to names).

In [11]:
# Forward lookups: human-readable name -> integer index used to address P, R and pi
actions = {
    'grab': 0,
    'dry': 1,
    'store': 2,
}
states = {
    'no plate held': 0,
    'wet plate held': 1,
    'dry plate held': 2,
    'finished': 3,
}

# Reverse lookups (index -> name), derived from the forward maps so they cannot drift apart
actions_r = {index: name for name, index in actions.items()}
states_r = {index: name for name, index in states.items()}

In [119]:
# Probability that the 'dry' action leaves the robot holding a wet plate
pf = 0.10

# One (state x next state) matrix per action; rows and columns follow the state indices
# 0 = no plate held, 1 = wet plate held, 2 = dry plate held, 3 = finished.

# Action 0 ('grab'): every non-finished state moves to 'wet plate held'
P0 = np.array([
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],
])

# Action 1 ('dry'): with a plate in hand, end up wet with probability pf and dry otherwise
P1 = np.array([
    [1.0, 0.0, 0.0, 0.0],
    [0.0, pf, 1 - pf, 0.0],
    [0.0, pf, 1 - pf, 0.0],
    [0.0, 0.0, 0.0, 1.0],
])

# Action 2 ('store'): every state moves to 'finished'
P2 = np.zeros((4, 4))
P2[:, 3] = 1.0

# Stack so that P is indexed as P[action, state, next_state]
P = np.array([P0, P1, P2])

# Probability of ending up in 'wet plate held' after taking 'grab' from 'no plate held'
# (P is indexed as P[action, state, next_state])
P[actions['grab'], states['no plate held'], states['wet plate held']]

1.0

In [121]:
# Expected immediate reward, indexed as R[state, action]; columns are grab, dry, store
R = np.array([
    [0, 0, 0],                 # no plate held
    [-20, pf * (-20.0), 5],    # wet plate held
    [-20, pf * (-20.0), 10],   # dry plate held
    [0, 0, 0],                 # finished
])

# The expected reward of storing a plate while we are holding a dry plate
print(R[states['dry plate held'], actions['store']])

10.0


## Policy evaluation
#### Dynamic programming

In [145]:
# Initial deterministic policy, indexed as pi[state, action]:
# all probability mass on action 0 ('grab') in each of the 4 states
pi = np.zeros((4, 3), dtype=int)
pi[:, 0] = 1

pi[states['wet plate held']]

array([1, 0, 0])

In [146]:
def policy_evalution_step(V, gamma, states, actions, pi, P, R):
    """Apply one synchronous Bellman expectation backup to ``V`` under policy ``pi``.

    For every state ``s`` the new value is
    ``sum_a pi[s, a] * sum_s' P[a, s, s'] * (R[s, a] + gamma * V[s'])``.

    Parameters
    ----------
    V : array-like, indexed by state index
        Current value estimates. It is only read, never modified.
    gamma : float
        Discount factor applied to the value of the successor state.
    states, actions : dict
        Name -> integer index maps; only the index values are used.
    pi : numpy.ndarray, indexed as ``pi[state, action]``
        Probability of taking each action in each state.
    P : numpy.ndarray, indexed as ``P[action, state, next_state]``
        Transition probabilities.
    R : numpy.ndarray, indexed as ``R[state, action]``
        Expected immediate reward.

    Returns
    -------
    numpy.ndarray
        A new float array with the shape of ``V`` holding the backed-up values.
    """
    # Force a float buffer: zeros_like would otherwise inherit V's dtype, so an
    # integer V (or a list of ints) would silently truncate fractional values.
    v = np.zeros_like(V, dtype=float)

    for s in states.values():
        # Policy-weighted average of the one-step look-ahead of every action
        v[s] = sum(
            pi[s, a_] * sum(
                P[a_, s, s_] * (R[s, a_] + gamma * V[s_])
                for s_ in states.values()
            )
            for a_ in actions.values()
        )
    return v

In [147]:
# Undiscounted setting; start from an all-zero value function and apply one sweep
gamma = 1.0
V = policy_evalution_step(np.zeros(len(states)), gamma, states, actions, pi, P, R)
print(V)

[  0. -20. -20.   0.]


This agrees with the results from the class. If we apply more policy-evaluation sweeps, we see that the values of the states where we hold a plate (both wet and dry) keep getting worse: under this policy we grab a new plate at every step, which breaks the plate we were holding, and we pay a high price each time.

In [148]:
# Three more evaluation sweeps, printing the value function after each one
for _sweep in range(3):
    V = policy_evalution_step(V, gamma, states, actions, pi, P, R)
    print(V)

[-20. -40. -40.   0.]
[-40. -60. -60.   0.]
[-60. -80. -80.   0.]


#### Policy improvement

In [149]:
def calculate_Q(V, gamma, states, actions, pi, P, R):
    """Build the action-value table ``Q[state, action]`` from the state values ``V``.

    Each entry is ``sum_s' P[a, s, s'] * (R[s, a] + gamma * V[s'])``.

    ``states`` and ``actions`` are name -> index dicts of which only the index
    values are used. ``pi`` is accepted but never read by this function.

    Returns a float array of shape ``(len(states), len(actions))``.
    """
    n_states, n_actions = len(states), len(actions)
    Q = np.zeros((n_states, n_actions))

    for state in states.values():
        for action in actions.values():
            # One-step look-ahead: expectation over the successor states
            Q[state, action] = sum(
                P[action, state, nxt] * (R[state, action] + gamma * V[nxt])
                for nxt in states.values()
            )

    return Q

In [161]:
gamma = 1.0

# One evaluation sweep from zero values under the current policy, shown for reference
print(policy_evalution_step(np.zeros(len(states)), gamma, states, actions, pi, P, R))

# Q is then computed from an all-zero V
V = np.zeros(len(states))
Q = calculate_Q(V, gamma, states, actions, pi, P, R)
Q

[  0. -20. -20.   0.]


array([[  0.,   0.,   0.],
       [-20.,  -2.,   5.],
       [-20.,  -2.,  10.],
       [  0.,   0.,   0.]])

In [162]:
# State name -> index map, displayed to help read the rows of Q
states

{'dry plate held': 2, 'finished': 3, 'no plate held': 0, 'wet plate held': 1}

In [163]:
# Action name -> index map, displayed to help read the columns of Q
actions

{'dry': 1, 'grab': 0, 'store': 2}

In [164]:
# Greedy improvement: in every state put all probability on the action with the largest Q
winner = Q.argmax(axis=1)
pi_new = np.zeros((len(states), len(actions)))
pi_new[np.arange(len(winner)), winner] = 1

pi_new

array([[ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.]])

In [165]:
# One evaluation sweep under the greedy policy, then recompute the action values from it
V = policy_evalution_step(V=V, gamma=gamma, states=states, actions=actions, pi=pi_new, P=P, R=R)
Q = calculate_Q(V=V, gamma=gamma, states=states, actions=actions, pi=pi_new, P=P, R=R)
print('V', V)
print('Q', Q)

V [  0.   5.  10.   0.]
Q [[  5.    0.    0. ]
 [-15.    7.5   5. ]
 [-15.    7.5  10. ]
 [  0.    0.    0. ]]


In [166]:
# Greedy improvement on the refreshed Q: one-hot row per state at its best action
winner = Q.argmax(axis=1)
pi_new = np.zeros((len(states), len(actions)))
pi_new[np.arange(len(winner)), winner] = 1

pi_new

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.]])

In [167]:
# Another evaluation sweep under the current greedy policy, followed by a fresh Q table
V = policy_evalution_step(V=V, gamma=gamma, states=states, actions=actions, pi=pi_new, P=P, R=R)
Q = calculate_Q(V=V, gamma=gamma, states=states, actions=actions, pi=pi_new, P=P, R=R)
print('V', V)
print('Q', Q)

V [  5.    7.5  10.    0. ]
Q [[  7.5    5.     0.  ]
 [-12.5    7.75   5.  ]
 [-12.5    7.75  10.  ]
 [  0.     0.     0.  ]]


In [168]:
# Greedy improvement once more; an unchanged result means the policy has stabilised
winner = Q.argmax(axis=1)
pi_new = np.zeros((len(states), len(actions)))
pi_new[np.arange(len(winner)), winner] = 1

pi_new

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.]])

#### Alternative evaluation

In [136]:
import mdptoolbox

In [139]:
pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9)
pi.run()

In [140]:
pi.policy

(0, 1, 2, 0)

In [118]:
# Action name -> index map; presumably shown to decode the policy tuple above (verify)
actions

{'dry': 1, 'grab': 0, 'store': 2}