<a href="https://colab.research.google.com/github/deguc/Shannon/blob/main/407_TD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

class GridWorld:
    """A 5x7 grid maze with two wall segments, one goal cell and one agent.

    Cell codes stored in ``grid``: 0 = free, 1 = wall, 2 = goal.
    Actions index into ``MOVES``: 0 = up, 1 = down, 2 = left, 3 = right.
    Positions are ``(row, column)`` integer pairs.
    """

    # (row, column) offset applied for each action index. Built once here
    # instead of on every call to ``step``.
    MOVES = np.array([[-1, 0], [1, 0], [0, -1], [0, 1]])

    def __init__(self):
        """Build the fixed maze layout and place the agent at (0, 0)."""
        self.H, self.W = 5, 7
        self.action_size = 4
        self.grid = np.zeros((self.H, self.W), dtype=np.int8)
        self.grid[1, 1:5] = 1   # upper wall segment
        self.grid[3, 2:6] = 1   # lower wall segment
        self.grid[4, 6] = 2     # goal in the bottom-right corner
        self.agent_pos = np.array([0, 0])
        # Positions the agent occupied before each ``update`` call since the
        # last ``reset``; drawn as '*' by ``render``.
        self.memory = []

    def render(self):
        """Print the grid: '.' free, '#' wall, 'G' goal, '*' visited, 'A' agent."""
        legend = np.array(['.', '#', 'G'], dtype='>U1')
        # Fancy indexing yields a fresh array, so the marks below never
        # touch ``self.grid``.
        view = legend[self.grid]

        for visited in self.memory:
            view[tuple(visited)] = '*'

        # Drawn last so the agent marker wins over trail and goal markers.
        view[tuple(self.agent_pos)] = 'A'

        for row in view:
            print(' '.join(row))

        print()

    def step(self, state, action):
        """Compute the outcome of taking ``action`` in ``state``.

        This is a pure transition function: it does not read or modify the
        agent's position.

        Args:
            state: ``(row, column)`` position (array-like of two ints).
            action: index into ``MOVES``.

        Returns:
            ``(next_state, reward, done)``. Moving off the grid or into a
            wall leaves the position unchanged with reward -1.0. Reaching
            the goal gives reward 10.0 and ``done=True``. Any other move
            gives reward -0.1.
        """
        state = np.asarray(state)
        next_state = state + self.MOVES[action]

        inside = (0 <= next_state[0] < self.H) and (0 <= next_state[1] < self.W)
        if not inside:
            return state, -1.0, False

        cell = self.grid[tuple(next_state)]

        if cell == 1:
            return state, -1.0, False

        if cell == 2:
            # Float, to match the type of the other rewards.
            return next_state, 10.0, True

        return next_state, -0.1, False

    def update(self, action):
        """Apply ``action`` to the agent and record the position it left.

        Returns:
            The ``(next_state, reward, done)`` triple produced by ``step``.
        """
        # Store a snapshot rather than a live reference so that later
        # in-place edits to a position array cannot rewrite the trail.
        self.memory.append(np.array(self.agent_pos))
        next_state, reward, done = self.step(self.agent_pos, action)
        self.agent_pos = next_state

        return next_state, reward, done

    def reset(self, state=None):
        """Place the agent at ``state`` (default ``(0, 0)``) and clear the trail.

        A fresh array is created on every call. The previous signature used
        a module-lifetime ``np.array`` default that was also handed back to
        the caller, so a single in-place write to the returned state
        silently changed the start position of every later episode.

        Returns:
            A copy of the start position, independent of ``agent_pos``.
        """
        start = np.array([0, 0]) if state is None else np.array(state)
        self.agent_pos = start
        self.memory.clear()

        return start.copy()

class QLearning:
    """Tabular Q-learning agent for a grid of ``H`` x ``W`` states.

    ``Q[row, col, action]`` holds the current action-value estimate and is
    initialised to zero.

    Args:
        H: number of grid rows.
        W: number of grid columns.
        action_size: number of discrete actions.
        epsilon: probability of taking a uniformly random action in
            ``get_action``.
        gamma: discount factor applied to the bootstrapped next-state value.
        alpha: learning rate (step size) of the Q update.

    The defaults of ``epsilon``, ``gamma`` and ``alpha`` are the values that
    were previously hard-coded inside the methods.
    """

    def __init__(self, H, W, action_size, epsilon=0.1, gamma=0.9, alpha=0.1):
        self.H = H
        self.W = W
        self.action_size = action_size
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha

        self.Q = np.zeros((self.H, self.W, self.action_size))

    def pi(self, state):
        """Return the greedy action for ``state``.

        Ties are resolved by ``np.argmax``, i.e. the lowest action index.
        """
        return np.argmax(self.Q[tuple(state)])

    def get_action(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``epsilon`` a uniformly random action is drawn from
        NumPy's global random state; otherwise the greedy action is used.
        """
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)

        return self.pi(state)

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the observed transition.

        The target is ``reward + gamma * max_a Q[next_state, a]``. The
        bootstrap term is dropped when ``done`` is true, so a terminal
        transition is valued by its reward alone.
        """
        best_next = np.max(self.Q[tuple(next_state)])
        target = reward + (0.0 if done else self.gamma * best_next)

        index = tuple(state) + (action,)
        self.Q[index] += (target - self.Q[index]) * self.alpha

# Compact float formatting for any arrays printed afterwards.
np.set_printoptions(precision=2, suppress=True)

env = GridWorld()
agent = QLearning(env.H, env.W, env.action_size)

# Training: 1000 episodes, each capped at 100 epsilon-greedy steps.
for episode in range(1000):
    state = env.reset()

    for t in range(100):
        action = agent.get_action(state)
        next_state, reward, done = env.update(action)
        agent.update(state, action, reward, next_state, done)

        if done:
            break

        state = next_state

# Evaluation: follow the greedy policy from the start for at most 20 steps.
state = env.reset()

for t in range(20):
    action = agent.pi(state)
    next_state, _, done = env.update(action)

    if done:
        break

    state = next_state

# Show the path taken ('*') and the agent's final position ('A').
env.render()


* * * * * * .
. # # # # * .
. . . . . * *
. . # # # # *
. . . . . . A

