In [None]:
import numpy as np

# Displacement (dx, dy) that each action adds to an (x, y) state.
# y is the row index into the grid, so 'up' moves towards row 0.
actions = {
    'up': (0, -1),
    'down': (0, 1),
    'left': (-1, 0),
    'right': (1, 0),
}

def attribute(grid, state):
    """Look up the cell code stored at ``state``.

    ``state`` is an ``(x, y)`` pair and the grid is indexed row first,
    i.e. the result is ``grid[y][x]``.
    """
    column, row = state[0], state[1]
    return grid[row][column]

def calc_reward(grid, state, reward = -0.04):
    """Return the reward for entering ``state`` and whether the episode ends there.

    Args:
        grid: Grid of cell codes, indexed as ``grid[y][x]``.
        state: ``(x, y)`` position of the cell being entered.
        reward: Reward handed out for any cell whose code is neither 1 nor -1.

    Returns:
        A ``(reward, done)`` tuple: ``(1, True)`` for a cell coded 1,
        ``(-1, True)`` for a cell coded -1, otherwise ``(reward, False)``.
    """
    cell = attribute(grid, state)

    # Compare by value, not identity: ``is`` against an int literal only works
    # by accident of CPython's small-int cache and fails for values such as
    # NumPy integers or floats, which would make terminal cells undetectable.
    if cell == 1:
        return (1, True)
    if cell == -1:
        return (-1, True)
    return (reward, False)

def policy(state):
    """Pick one of the action names uniformly at random.

    ``state`` is accepted for interface reasons but does not influence the choice.
    """
    names = list(actions)
    return np.random.choice(names)

def move(grid, state, act):
    """Return the cell reached by applying ``act`` to ``state`` without any noise.

    Args:
        grid: Grid of cell codes, indexed as ``grid[y][x]``.
        state: Current ``(x, y)`` position; its cell code must be 0.
        act: Key of ``actions`` naming the move to apply.

    Returns:
        The neighbouring ``(x, y)`` position, or ``state`` unchanged when the
        neighbour lies outside the grid or holds the code 9.

    Raises:
        Exception: If the cell at ``state`` has a code other than 0, i.e. the
            agent is not standing on a cell it can move away from.
    """
    # Value comparisons (``!=`` / ``==``) instead of identity checks against
    # int literals, which break for non-``int`` cell codes such as NumPy scalars.
    if attribute(grid, state) != 0:
        raise Exception('end')

    dx, dy = actions[act]
    next_state = (state[0] + dx, state[1] + dy)

    inside = 0 <= next_state[0] < len(grid[0]) and 0 <= next_state[1] < len(grid)
    if not inside:
        # Walking off the edge leaves the agent where it was.
        return state

    if attribute(grid, next_state) == 9:
        # Cells coded 9 cannot be entered.
        return state

    return next_state

def transit(grid, state, act, p = 0.8):
    """Build the transition table for choosing ``act`` in ``state``.

    Returns a dict keyed by action name. Each value is a tuple
    ``(resulting_state, probability)``: the chosen action gets probability
    ``p`` and the remaining probability mass is split evenly over the others.
    """
    outcomes = {}
    for candidate in actions:
        landing = move(grid, state, candidate)
        if act == candidate:
            weight = p
        else:
            weight = (1 - p) / (len(actions) - 1)
        outcomes[candidate] = (landing, weight)
    return outcomes

def step(grid, state, act, p = 0.8):
    """Sample one environment step for taking ``act`` in ``state``.

    Returns a tuple ``(next_state, reward, done)`` where ``next_state`` is
    drawn according to the probabilities produced by ``transit``.
    """
    outcomes = transit(grid, state, act, p)

    # Split the (state, probability) pairs into two parallel tuples.
    destinations, weights = zip(*outcomes.values())

    picked = np.random.choice(range(len(destinations)), p = weights)
    next_state = destinations[picked]

    reward, done = calc_reward(grid, next_state)
    return (next_state, reward, done)


In [None]:
# Sample 4x3 grid world, indexed as grid1[y][x].
# Cell codes as interpreted by calc_reward() and move():
#    0 -> ordinary cell the agent can stand on and move from
#    1 -> terminal cell with reward +1
#   -1 -> terminal cell with reward -1
#    9 -> blocked cell that cannot be entered
grid1 = [
    [0, 0, 0, 1],
    [0, 9, 0, -1],
    [0, 0, 0, 0]
]

In [None]:
# Roll out ten episodes with the random policy, each starting in the
# top-left cell, and report where each one ended and its summed reward.
for _ in range(10):
    state, total_reward, done = (0, 0), 0, False

    while True:
        act = policy(state)
        next_state, reward, done = step(grid1, state, act)

        state = next_state
        total_reward += reward

        # A terminal cell was entered: the episode is over.
        if done:
            break

    print(f'state: {state}, reward: {total_reward}')
    

In [None]:
def value_plan(grid, gamma = 0.9, threshold = 0.0001, p = 0.8):
    """Run value iteration on ``grid`` and print the resulting value table.

    Args:
        grid: Grid of cell codes, indexed as ``grid[y][x]``.
        gamma: Discount factor applied to the value of the next state.
        threshold: Sweeping stops once the largest change in a sweep is below this.
        p: Probability passed to ``transit`` for the chosen action.

    Only cells coded 0 are updated; every other cell keeps the value 0.
    The table is a dict mapping ``(x, y)`` to a value. It is printed, not returned.
    """
    width, height = len(grid[0]), len(grid)
    v = {(x, y): 0 for x in range(width) for y in range(height)}

    def backup(s, a):
        # Expected one-step return of taking ``a`` in ``s`` under the current ``v``.
        total = 0
        for next_state, prob in transit(grid, s, a, p).values():
            total += prob * (calc_reward(grid, next_state)[0] + gamma * v[next_state])
        return total

    converged = False
    while not converged:
        delta = 0

        # Values are updated in place, so later cells in the same sweep
        # already see the new values of earlier cells.
        for s in v:
            if attribute(grid, s) != 0:
                continue

            best = max(backup(s, a) for a in actions)
            delta = max(delta, abs(best - v[s]))
            v[s] = best

        converged = delta < threshold

    print(v)
    

In [None]:
# Run value iteration on the sample grid; the value table is printed by the call.
value_plan(grid1)

In [None]:
def policy_plan(grid, gamma = 0.9, threshold = 0.0001, p = 0.8):
    """Run policy iteration on ``grid`` and print the final value table.

    Alternates between evaluating the current policy and making the policy
    greedy with respect to that evaluation, until no cell changes its action.

    Args:
        grid: Grid of cell codes, indexed as ``grid[y][x]``.
        gamma: Discount factor applied to the value of the next state.
        threshold: Policy evaluation stops once the largest change in a sweep
            is below this.
        p: Probability passed to ``transit`` for the chosen action.

    Only cells coded 0 are evaluated and improved; every other cell keeps the
    value 0. The table is a dict mapping ``(x, y)`` to a value. It is printed,
    not returned.
    """
    states = [(x, y) for x in range(len(grid[0])) for y in range(len(grid))]

    # policies[s][a] is the probability of taking action ``a`` in state ``s``;
    # start from the uniform random policy.
    policies = {s: {a: 1 / len(actions) for a in actions} for s in states}

    def action_value(s, a, v):
        """Expected one-step return of taking ``a`` in ``s`` given values ``v``."""
        return sum(
            prob * (calc_reward(grid, next_state)[0] + gamma * v[next_state])
            for next_state, prob in transit(grid, s, a, p).values()
        )

    def greedy(scores):
        """Return the key with the highest score (the first one wins ties)."""
        return max(scores, key = scores.get)

    def estimate():
        """Evaluate the current policy by in-place sweeps until convergence."""
        v = {s: 0 for s in states}

        while True:
            delta = 0

            for s in v:
                if attribute(grid, s) != 0:
                    continue

                # The value of a state under a policy is the expectation over
                # the policy's action probabilities. Taking the max of the
                # probability-weighted terms instead would clamp the value of
                # a deterministic policy at zero whenever its return is negative.
                expected = sum(
                    policies[s][a] * action_value(s, a, v) for a in actions
                )

                delta = max(delta, abs(expected - v[s]))
                v[s] = expected

            if delta < threshold:
                break

        return v

    while True:
        V = estimate()
        stable = True

        for s in states:
            if attribute(grid, s) != 0:
                continue

            current_action = greedy(policies[s])
            best_action = greedy({a: action_value(s, a, V) for a in actions})

            # Make the policy deterministic and greedy for this state.
            for a in policies[s]:
                policies[s][a] = 1 if a == best_action else 0

            if current_action != best_action:
                stable = False

        # No state changed its action: the policy is stable.
        if stable:
            break

    print(V)

In [None]:
# Run policy iteration on the sample grid; the value table is printed by the call.
policy_plan(grid1)