# Markov Decision Process

### State
- 0  1  2  3
- 4  W  5  6
- 7  8  9 10
- 3이 최종 목적지
- 6은 가서는 안되는 자리

In [1]:
import numpy as np
# State ids 0..10, numbered row by row over the grid cells that can be
# occupied (the wall cell has no id).
states = list(range(11))
N_STATES = len(states)

### Action
- 0: left
- 1: right
- 2: up
- 3: down

In [2]:
# Action ids: 0 = left, 1 = right, 2 = up, 3 = down.
actions = list(range(4))
N_ACTIONS = len(actions)

### Transition Probability
- You move according to your action with 80% probability.
- With 10% probability each, you instead slip one cell to the left or to the right of the intended direction.
- If there is a barrier (the wall or the grid border) in the direction you move, you bounce back, i.e. you stay in your current state.

In [3]:
# Transition tensor: P[s, a, s2] is the probability of landing in state s2
# after taking action a (0 left, 1 right, 2 up, 3 down) in state s.
#
# Grid layout (W is the wall):
#      0  1  2  3
#      4  W  5  6
#      7  8  9 10
#
# The intended move happens with probability 0.8; with probability 0.1 each
# the agent slips to one of the two perpendicular directions. Any move into
# the wall or off the grid leaves the agent where it is. States 3 and 6 are
# absorbing: every action keeps the agent there.
P = np.empty((N_STATES, N_ACTIONS, N_STATES))

#                0   1   2   3   4   5   6   7   8   9  10
P[ 0, 0, :] = [ .9,  0,  0,  0, .1,  0,  0,  0,  0,  0,  0 ]
P[ 0, 1, :] = [ .1, .8,  0,  0, .1,  0,  0,  0,  0,  0,  0 ]
P[ 0, 2, :] = [ .9, .1,  0,  0,  0,  0,  0,  0,  0,  0,  0 ]
P[ 0, 3, :] = [ .1, .1,  0,  0, .8,  0,  0,  0,  0,  0,  0 ]

P[ 1, 0, :] = [ .8, .2,  0,  0,  0,  0,  0,  0,  0,  0,  0 ]
P[ 1, 1, :] = [  0, .2, .8,  0,  0,  0,  0,  0,  0,  0,  0 ]
P[ 1, 2, :] = [ .1, .8, .1,  0,  0,  0,  0,  0,  0,  0,  0 ]
# Down from 1 runs into the wall, so the 0.8 mass stays on state 1.
P[ 1, 3, :] = [ .1, .8, .1,  0,  0,  0,  0,  0,  0,  0,  0 ]

P[ 2, 0, :] = [  0, .8, .1,  0,  0, .1,  0,  0,  0,  0,  0 ]
P[ 2, 1, :] = [  0,  0, .1, .8,  0, .1,  0,  0,  0,  0,  0 ]
P[ 2, 2, :] = [  0, .1, .8, .1,  0,  0,  0,  0,  0,  0,  0 ]
# Down from 2 leads to state 5 (state 4 is not adjacent to 2).
P[ 2, 3, :] = [  0, .1,  0, .1,  0, .8,  0,  0,  0,  0,  0 ]

P[ 3, 0, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0 ]
P[ 3, 1, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0 ]
P[ 3, 2, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0 ]
P[ 3, 3, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0 ]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 4, 0, :] = [ .1,  0,  0,  0, .8,  0,  0, .1,  0,  0,  0 ]
P[ 4, 1, :] = [ .1,  0,  0,  0, .8,  0,  0, .1,  0,  0,  0 ]
P[ 4, 2, :] = [ .8,  0,  0,  0, .2,  0,  0,  0,  0,  0,  0 ]
P[ 4, 3, :] = [  0,  0,  0,  0, .2,  0,  0, .8,  0,  0,  0 ]

P[ 5, 0, :] = [  0,  0, .1,  0,  0, .8,  0,  0,  0, .1,  0 ]
P[ 5, 1, :] = [  0,  0, .1,  0,  0,  0, .8,  0,  0, .1,  0 ]
P[ 5, 2, :] = [  0,  0, .8,  0,  0, .1, .1,  0,  0,  0,  0 ]
# Down from 5 leads to state 9; slipping left hits the wall (stay on 5),
# slipping right lands on 6.
P[ 5, 3, :] = [  0,  0,  0,  0,  0, .1, .1,  0,  0, .8,  0 ]

P[ 6, 0, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0 ]
P[ 6, 1, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0 ]
P[ 6, 2, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0 ]
P[ 6, 3, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0 ]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 7, 0, :] = [  0,  0,  0,  0, .1,  0,  0, .9,  0,  0,  0 ]
P[ 7, 1, :] = [  0,  0,  0,  0, .1,  0,  0, .1, .8,  0,  0 ]
P[ 7, 2, :] = [  0,  0,  0,  0, .8,  0,  0, .1, .1,  0,  0 ]
P[ 7, 3, :] = [  0,  0,  0,  0,  0,  0,  0, .9, .1,  0,  0 ]

P[ 8, 0, :] = [  0,  0,  0,  0,  0,  0,  0, .8, .2,  0,  0 ]
P[ 8, 1, :] = [  0,  0,  0,  0,  0,  0,  0,  0, .2, .8,  0 ]
P[ 8, 2, :] = [  0,  0,  0,  0,  0,  0,  0, .1, .8, .1,  0 ]
# Down from 8 runs off the grid, so the 0.8 mass stays on state 8.
P[ 8, 3, :] = [  0,  0,  0,  0,  0,  0,  0, .1, .8, .1,  0 ]

P[ 9, 0, :] = [  0,  0,  0,  0,  0, .1,  0,  0, .8, .1,  0 ]
P[ 9, 1, :] = [  0,  0,  0,  0,  0, .1,  0,  0,  0, .1, .8 ]
P[ 9, 2, :] = [  0,  0,  0,  0,  0, .8,  0,  0, .1,  0, .1 ]
P[ 9, 3, :] = [  0,  0,  0,  0,  0,  0,  0,  0, .1, .8, .1 ]

P[10, 0, :] = [  0,  0,  0,  0,  0,  0, .1,  0,  0, .8, .1 ]
P[10, 1, :] = [  0,  0,  0,  0,  0,  0, .1,  0,  0,  0, .9 ]
P[10, 2, :] = [  0,  0,  0,  0,  0,  0, .8,  0,  0, .1, .1 ]
P[10, 3, :] = [  0,  0,  0,  0,  0,  0,  0,  0,  0, .1, .9 ]

# Every P[s, a, :] must be a probability distribution over next states;
# fail loudly on a typo in the hand-written table above.
if not np.allclose(P.sum(axis=2), 1.0):
    raise ValueError("each row P[s, a, :] must sum to 1")

### Reward

- -0.02 for each action (battery consumption)
- If you reach the state 3, you win and get the final reward 1 at the end step.
- If you reach the state 6, you lose and get the final reward -1 at the end step.

In [4]:
# Reward table R[s, a]. Every (state, action) pair starts with the same
# per-step penalty; flip the literal condition by hand to switch scenario.
if True:  # normal operation (battery is sufficient)
    R = np.ones((N_STATES, N_ACTIONS)) * -0.02
else:  # abnormal operation (battery is running low)
    R = np.ones((N_STATES, N_ACTIONS)) * -0.5
# States 3 and 6 override the step penalty for every action.
R[3] = 1
R[6] = -1

### Discount factor

In [5]:
# Discount factor of the MDP.
gamma = 0.99

### Policy
- When you are at state s, there are many actions you can choose.
- Policy describes how you choose your action.

In [6]:
# Policy as one-hot rows: policy[s, a] is 1 for the single action chosen in
# state s and 0 for every other action (0 left, 1 right, 2 up, 3 down).
policy = np.zeros((N_STATES, N_ACTIONS))
# Chosen action per state. States 3 and 6 (commented "1" and "-1" in the
# original table) also get a row, set to "right".
#       state:  0  1  2  3  4  5  6  7  8  9  10
policy[
    list(range(11)),
    [1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 2],
] = 1