<a href="https://colab.research.google.com/github/deguc/Shannon/blob/main/405_DP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import time

class GridWorld:
    """A small deterministic 5x7 grid world with walls and a single goal.

    Cell codes stored in ``grid``: 0 = free, 1 = wall, 2 = goal.
    States are ``np.array([row, col])`` positions.  Actions are integer
    indices into the move table: 0 = up, 1 = down, 2 = left, 3 = right.
    """

    # Row/column offsets, indexed by action id.  Kept at class level so the
    # table is not rebuilt on every call to ``step``.
    _MOVES = np.array([[-1, 0], [1, 0], [0, -1], [0, 1]])

    def __init__(self):
        """Build the fixed map and place the agent at the start cell (0, 0)."""
        self.action_size = 4
        self.H, self.W = 5, 7
        self.grid = np.zeros((self.H, self.W), dtype=np.int8)
        self.grid[1, 1:4] = 1   # upper wall segment
        self.grid[3, 2:5] = 1   # lower wall segment
        self.grid[4, 6] = 2     # goal cell
        self.start = np.array([0, 0])
        self.agent_pos = self.start
        # Every position the agent has occupied, in order; used by render().
        self.memory = [self.agent_pos]

    def render(self):
        """Print the grid: '.' free, '#' wall, 'G' goal, '*' visited, 'A' agent."""
        legend = np.array(['.', '#', 'G'], dtype='>U1')
        # Fancy indexing returns a fresh array, so marking it is safe.
        view = legend[self.grid]

        for state in self.memory:
            view[tuple(state)] = '*'
        # Use this instance's position rather than a module-level ``env``
        # global, so render() works for any GridWorld object.
        view[tuple(self.agent_pos)] = 'A'

        for v in view:
            print(' '.join(v))

        print()

    def step(self, state, action):
        """Return ``(next_state, reward, done)`` for taking ``action`` in ``state``.

        This is a pure transition function: it does not modify the
        environment.  Moving off the grid or into a wall leaves the state
        unchanged with reward -1.  Entering the goal gives reward 10 and
        ``done=True``.  Any other move gives reward -0.1.
        """
        next_state = state + self._MOVES[action]

        inside = 0 <= next_state[0] < self.H and 0 <= next_state[1] < self.W
        if not inside:
            return state, -1, False

        cell = self.grid[tuple(next_state)]

        if cell == 1:
            return state, -1, False

        if cell == 2:
            return next_state, 10, True

        return next_state, -0.1, False

    def update(self, action):
        """Apply ``action`` to the agent, record the new position, and return
        the ``(next_state, reward, done)`` triple produced by ``step``."""
        next_state, reward, done = self.step(self.agent_pos, action)
        self.memory.append(next_state)
        self.agent_pos = next_state

        return next_state, reward, done

    def reset(self, state):
        """Place the agent at ``state``, clear the visit history, and return
        the new agent position."""
        self.agent_pos = state
        self.memory = [self.agent_pos]

        return self.agent_pos

    def states(self):
        """Yield every grid position as ``np.array([row, col])`` in row-major order."""
        for i in range(self.H):
            for j in range(self.W):
                yield np.array([i, j])

def get_Q(state,V,env,gamma=0.9):
    """Return the one-step lookahead action values for ``state``.

    For each action ``a`` in ``range(env.action_size)`` the environment's
    ``step(state, a)`` is queried and
    ``Q[a] = reward + gamma * V[next_state]`` is stored.

    Parameters:
        state: position passed through to ``env.step``.
        V: array of state values, indexed by the components of a state.
        env: object providing ``action_size`` and ``step(state, action)``.
        gamma: discount factor applied to the successor value
            (defaults to 0.9, the value previously hard-coded).

    Returns:
        1-D float array of length ``env.action_size``.
    """
    Q = np.zeros(env.action_size)

    for action in range(env.action_size):
        # The ``done`` flag is ignored here; only reward and successor matter.
        next_state, reward, _ = env.step(state, action)
        Q[action] = reward + gamma * V[tuple(next_state)]

    return Q

def update(V,env):
    """Perform one in-place sweep over all states and return ``V``.

    Cells whose grid code is 2 are pinned to value 0; every other cell is
    set to the maximum of its action values from ``get_Q``.  Because ``V``
    is modified as the sweep proceeds, later cells see values already
    refreshed earlier in the same sweep.
    """
    for cell in env.states():
        idx = tuple(cell)

        if env.grid[idx] == 2:
            V[idx] = 0
        else:
            V[idx] = get_Q(cell, V, env).max()

    return V

def eval(env):
    """Iterate ``update`` sweeps from an all-zero value table and return it.

    Runs at most 1000 sweeps and stops early once the largest absolute
    change produced by a sweep drops below 1e-3.

    Note: this function's name shadows Python's builtin ``eval`` within
    this module.
    """
    V = np.zeros((env.H, env.W))

    for _ in range(1000):
        # ``update`` writes into V in place, so snapshot it first.
        previous = V.copy()
        V = update(V, env)

        if np.abs(previous - V).max() < 1e-3:
            break

    return V

def greedy(V,env):
    """Build a one-hot policy table that is greedy with respect to ``V``.

    Returns an array of shape ``(env.H, env.W, env.action_size)`` in which,
    for each state, the entry of the action with the highest value from
    ``get_Q`` is 1.0 and all others are 0.
    """
    policy = np.zeros((env.H, env.W, env.action_size))

    for cell in env.states():
        row, col = cell
        best_action = np.argmax(get_Q(cell, V, env))
        policy[row, col, best_action] = 1.0

    return policy

# --- Driver: compute the value table, derive a greedy policy, animate it ---

# Print numpy floats with two decimals and without scientific notation.
np.set_printoptions(precision=2,suppress=True)
env = GridWorld()
V = eval(env)        # value table from repeated update() sweeps
pi = greedy(V,env)   # one-hot greedy policy derived from V

# Roll the policy out from the top-left cell, drawing each step.
state = np.array([0,0])
env.reset(state)
for i in range(100):  # hard cap of 100 steps on the rollout
    env.render()
    time.sleep(0.5)   # pause half a second between frames
    action = np.argmax(pi[*state])
    state,_,done = env.update(action)
    if done:
        # Draw the final frame, then stop.
        env.render()
        break




A . . . . . .
. # # # . . .
. . . . . . .
. . # # # . .
. . . . . . G

* . . . . . .
A # # # . . .
. . . . . . .
. . # # # . .
. . . . . . G

* . . . . . .
* # # # . . .
A . . . . . .
. . # # # . .
. . . . . . G

* . . . . . .
* # # # . . .
* . . . . . .
A . # # # . .
. . . . . . G

* . . . . . .
* # # # . . .
* . . . . . .
* . # # # . .
A . . . . . G

* . . . . . .
* # # # . . .
* . . . . . .
* . # # # . .
* A . . . . G

* . . . . . .
* # # # . . .
* . . . . . .
* . # # # . .
* * A . . . G

* . . . . . .
* # # # . . .
* . . . . . .
* . # # # . .
* * * A . . G

* . . . . . .
* # # # . . .
* . . . . . .
* . # # # . .
* * * * A . G

* . . . . . .
* # # # . . .
* . . . . . .
* . # # # . .
* * * * * A G

* . . . . . .
* # # # . . .
* . . . . . .
* . # # # . .
* * * * * * A

