In [131]:
import numpy as np

In [132]:
# Markov reward process: row i of P holds the next-state probabilities from state i.
# Kept as np.matrix so that `*` below (and in the next cell) is a matrix product.
P = np.asmatrix(
    [
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0.5, 0, 0.5, 0],
        [0, 1, 0, 0],
    ]
)

# Reward per state, transposed into a 4x1 column.
R = np.asmatrix([0, 0, 10, 10]).T

gamma = 0.9

# Closed-form evaluation: v = R + gamma * P * v  =>  v = (I - gamma * P)^-1 * R
bellman_matrix = np.eye(4) - gamma * P
v = bellman_matrix.I * R
print(v)

[[ 0.        ]
 [ 0.        ]
 [18.18181818]
 [10.        ]]


In [133]:
# Iterative evaluation: start from v = 0 and apply the backup
# v <- R + gamma * P * v thirty times.
v = np.zeros(shape=(4, 1))
for _backup in range(30):
    discounted_next = (gamma * P) * v
    v = R + discounted_next
print(v)

[[ 0.        ]
 [ 0.        ]
 [18.18181818]
 [10.        ]]


In [134]:
# Four states, two actions available in each of them.
states = list(range(4))
actions = list(range(2))

# P[state][action] -> list of (probability, next_state) pairs.
P = [
    # state 0:  action 0,                action 1
    [[(1, 0)], [(0.5, 0), (0.5, 1)]],
    # state 1
    [[(0.5, 0), (0.5, 3)], [(1, 1)]],
    # state 2
    [[(0.5, 0), (0.5, 2)], [(0.5, 0), (0.5, 1)]],
    # state 3
    [[(0.5, 2), (0.5, 3)], [(1, 1)]],
]
R = [0, 0, 10, 10]  # reward, indexed by state
gamma = 0.9         # discount applied to next-state values

# Spot-check: the transitions for state 2, action 0.
print(P[2][0])

[(0.5, 0), (0.5, 2)]


In [135]:
# Hand-check of one expectation: with v = [0, 0, 10, 10], accumulate
# prob * v[next_state] over the transitions of state 2, action 0.
v = [0, 0, 10, 10]
tmp = sum(prob * v[next_state] for prob, next_state in P[2][0])

print(tmp)

5.0


In [136]:
# Value estimate per state; get_expected_value() reads it from module scope.
v = [0, 0, 0, 0]

def get_expected_value(state: int, action: int):
    """Return the expected next-state value of taking `action` in `state`.

    Sums prob * v[next_state] over the (prob, next_state) pairs stored in
    P[state][action]. Both P and v are looked up in module scope at call
    time, so the result reflects whatever v currently holds.
    """
    return sum(prob * v[next_state] for prob, next_state in P[state][action])

# Value iteration: 100 sweeps, updating v in place so that later states in
# a sweep already see the refreshed values of earlier ones.
for _sweep in range(100):
    for state in states:
        # Largest expected next-state value over the two actions.
        best_next = max(get_expected_value(state=state, action=a) for a in actions)
        v[state] = R[state] + gamma * best_next
print(v)

[31.58508953413495, 38.60400287377479, 44.02416232966445, 54.20158563176306]


In [137]:
# Simply put: the value function was computed in the previous step, so make
# the policy follow that value function (pick, in every state, the action
# with the largest expected next-state value).
optPolicy = [
    np.argmax([get_expected_value(state=state, action=action) for action in actions])
    for state in states
]

In [138]:
# Random starting policy: one action index from {0, 1} for each of the 4 states.
# No seed is set in this cell, so the draw differs between runs.
policy = np.random.randint(low=0, high=2, size=4)
policy

array([1, 1, 0, 0])

In [139]:
def get_expected_policy_value(value: list, state: int, policy: list):
    """Expected next-state value when following `policy` from `state`.

    Looks up the action policy[state], then sums prob * value[next_state]
    over the (prob, next_state) pairs in P[state][action]. P is read from
    module scope; `value` is passed in explicitly.
    """
    chosen_action = policy[state]
    return sum(prob * value[next_state] for prob, next_state in P[state][chosen_action])

def cal_value(policy, n_sweeps: int = 100):
    """Evaluate `policy` by repeated in-place sweeps over all states.

    Args:
        policy: sequence indexed by state giving the action taken there.
        n_sweeps: number of full passes over `states`. Defaults to 100,
            the count that used to be hard-coded.

    Returns:
        A list with one value estimate per entry of `states`.

    Reads `states`, `R`, `gamma` and `get_expected_policy_value` from module
    scope. States are used directly as list indices.
    """
    # Size the table from `states` rather than a literal 4 so the function
    # does not have to be copied when the state set changes.
    v = [0] * len(states)
    for _ in range(n_sweeps):
        for state in states:
            next_val = get_expected_policy_value(value=v, state=state, policy=policy)
            # In-place update: later states in this sweep see the new value.
            v[state] = R[state] + gamma * next_val
    return v

[1 1 0 0]


In [140]:
# Evaluate the randomly drawn policy and show the resulting state values.
v = cal_value(policy)
print(v)

[0.0, 0.0, 18.18181818181818, 33.05785123966942]


In [141]:
# Policy iteration: 100 rounds of improvement followed by evaluation.
for _round in range(100):
    # Improvement: per state, take the action whose expected next-state value
    # (computed by get_expected_value from the current module-level v) is largest.
    for state in states:
        action_values = [get_expected_value(state=state, action=a) for a in actions]
        policy[state] = np.argmax(action_values)
    # Evaluation: recompute v for the updated policy.
    v = cal_value(policy=policy)
print(v)
print(policy)

[31.58508953413495, 38.60400287377479, 44.02416232966445, 54.20158563176306]
[1 0 0 0]


In [142]:
# Grid world example
# P[state][action] -> list of (probability, next_state) pairs.
# Layout implied by the transitions (X = blocked cell) -- NOTE(review): inferred, confirm:
#    0  1  2  3
#    4  X  5  6
#    7  8  9 10
# Each action moves in its direction with probability 0.8 and slips to either
# perpendicular direction with probability 0.1; blocked moves leave the state unchanged.
P = {
     0: {0: [(0.9,0), (0.1,1), (0,4)], 
         1: [(0.8,1), (0.1,4), (0.1,0)], 
         2: [(0.8,4), (0.1,1), (0.1,0)], 
         3: [(0.9,0), (0.1,4)]},
     1: {0: [(0.8,1), (0.1,2), (0.1,0)], 
         1: [(0.8,2), (0.2,1)], 
         2: [(0.8,1), (0.1,0), (0.1,2)], 
         3: [(0.8,0), (0.2,1)]},
     2: {0: [(0.8,2), (0.1,3), (0.1,1)], 
         1: [(0.8,3), (0.1,5), (0.1,2)], 
         2: [(0.8,5), (0.1,1), (0.1,3)], 
         3: [(0.8,1), (0.1,2), (0.1,5)]},
     3: {0: [(0.9,3), (0.1,2)], 
         1: [(0.9,3), (0.1,6)], 
         2: [(0.8,6), (0.1,2), (0.1,3)], 
         3: [(0.8,2), (0.1,3), (0.1,6)]},
     4: {0: [(0.8,0), (0.2,4)], 
         1: [(0.8,4), (0.1,7), (0.1,0)], 
         2: [(0.8,7), (0.2,4)], 
         3: [(0.8,4), (0.1,0), (0.1,7)]},
     5: {0: [(0.8,2), (0.1,6), (0.1,5)], 
         1: [(0.8,6), (0.1,9), (0.1,2)], 
         2: [(0.8,9), (0.1,5), (0.1,6)], 
         3: [(0.8,5), (0.1,2), (0.1,9)]},
     6: {0: [(0.8,3), (0.1,6), (0.1,5)], 
         1: [(0.8,6), (0.1,10), (0.1,3)], 
         2: [(0.8,10), (0.1,5), (0.1,6)], 
         3: [(0.8,5), (0.1,3), (0.1,10)]},
     7: {0: [(0.8,4), (0.1,8), (0.1,7)], 
         1: [(0.8,8), (0.1,7), (0.1,4)], 
         2: [(0.9,7), (0.1,8)], 
         3: [(0.9,7), (0.1,4)]},
     8: {0: [(0.8,8), (0.1,9), (0.1,7)], 
         1: [(0.8,9), (0.2,8)], 
         2: [(0.8,8), (0.1,7), (0.1,9)], 
         3: [(0.8,7), (0.2,8)]},
     9: {0: [(0.8,5), (0.1,10), (0.1,8)], 
         # East from 9 lands on 10 (P[10][3] moves west from 10 to 9); the
         # 0.8 entry previously pointed back at state 9.
         1: [(0.8,10), (0.1,9), (0.1,5)], 
         2: [(0.8,9), (0.1,8), (0.1,10)], 
         3: [(0.8,8), (0.1,5), (0.1,9)]},
     10: {0: [(0.8,6), (0.1,10), (0.1,9)], 
          1: [(0.9,10), (0.1,6)], 
          2: [(0.9,10), (0.1,9)], 
          3: [(0.8,9), (0.1,6), (0.1,10)]}
}

R = [0, 0, 0, 1, 0, 0, -100, 0, 0, 0, 0]
gamma = 0.9

States = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Actions = [0, 1, 2, 3] # [north, east, south, west]

# Sanity check: every (state, action) entry must be a probability distribution.
assert all(
    abs(sum(prob for prob, _next in P[s][a]) - 1.0) < 1e-9
    for s in States
    for a in Actions
), "transition probabilities must sum to 1 for every state/action"

In [158]:
# Value table for the 11 grid-world states; get_expected_value() reads it
# from module scope.
v = [0] * 11
def get_expected_value(state: int, action: int):
    """Expected next-state value of taking `action` in `state`.

    Accumulates p * v[s_next] over the (p, s_next) pairs in P[state][action],
    using the module-level P and v as they are at call time.
    """
    return sum(p * v[s_next] for p, s_next in P[state][action])

# Value iteration on the grid world: 100 sweeps with in-place updates of v.
for _sweep in range(100):
    for state in States:
        best_next = max(get_expected_value(state=state, action=action) for action in Actions)
        v[state] = R[state] + gamma * best_next

# ---- From the lecture: optimal policy ----
# For each state, the action with the largest expected next-state value under v.
optPolicy = [
    np.argmax([sum(prob * v[next_state] for prob, next_state in P[s][a]) for a in Actions])
    for s in States
]

print(optPolicy)
# ------------------------------------------

# Starting policy: action 0 in each of the 11 states.
# (Alternative random start: policy = np.random.randint(0, 4, 11))
policy = [0] * 11

def get_expected_policy_val(value: list, state: int, policy: list):
    """Expected next-state value when acting according to `policy` in `state`.

    Takes the action policy[state] and sums prob * value[next_state] over
    the (prob, next_state) pairs in P[state][action]. P comes from module
    scope; `value` is supplied by the caller.
    """
    chosen_action = policy[state]
    return sum(prob * value[next_state] for prob, next_state in P[state][chosen_action])

def cal_value(policy: list):
    """Evaluate `policy` on the 11-state grid world.

    Starts from an all-zero table of 11 values and performs 100 sweeps over
    `States`, overwriting each entry in place with
    R[state] + gamma * get_expected_policy_val(...). Returns the table.
    """
    values = [0] * 11
    sweeps_left = 100
    while sweeps_left > 0:
        for state in States:
            expected_next = get_expected_policy_val(values, state, policy)
            values[state] = R[state] + gamma * expected_next
        sweeps_left -= 1
    return values
    
# Policy iteration on the grid world, starting from an all-zero value table:
# 100 rounds of per-state improvement followed by a full evaluation.
v = [0] * 11
for _round in range(100):
    for state in States:
        action_values = [get_expected_value(state, action) for action in Actions]
        policy[state] = np.argmax(action_values)
    v = cal_value(policy)

print(v)
print(policy)


[1, 1, 1, 0, 0, 3, 3, 0, 3, 3, 2]
[5.469912875154494, 6.313016756392044, 7.189835340201858, 8.668832741881708, 4.802848609089826, 3.346646423327193, -96.67286274565409, 4.161433424452139, 3.6539401588267464, 3.222016015185935, 1.5261933863955008]
[1, 1, 1, 0, 0, 3, 3, 0, 3, 3, 2]
