In [1]:
import random
import numpy as np

In [20]:
class GridWorld():
    """Square grid world in which a walker moves from (0, 0) to the far corner.

    The position is kept in ``self.x`` and ``self.y``. ``x`` grows when moving
    down and ``y`` grows when moving right. Moves that would leave the grid
    are clamped, so the walker stays on the border cell. The episode is done
    once the walker stands on ``(size - 1, size - 1)``.

    The size of the grid is (4 x 4) unless another ``size`` is given.
    """

    def __init__(self, size=4):
        """Create the grid with the walker on cell (0, 0).

        Args:
            size: Number of cells along each side of the grid (default 4).

        Raises:
            TypeError: If ``size`` is not an integer.
            ValueError: If ``size`` is smaller than 1.
        """
        if isinstance(size, bool) or not isinstance(size, (int, np.integer)):
            raise TypeError(f"size should be an integer, received {type(size).__name__}")
        if size < 1:
            raise ValueError(f"size should be at least 1, received {size}")
        self.size = int(size)
        self.x = 0
        self.y = 0

    def step(self, action):
        """Apply one move and report the outcome.

        Args:
            action: Integer in 0..3 meaning right, left, up, down respectively.
                NumPy integer scalars are accepted as well as plain ``int``.

        Returns:
            A tuple ``((x, y), reward, done)`` where ``(x, y)`` is the position
            after the move, ``reward`` is always -1 and ``done`` tells whether
            the far corner has been reached.

        Raises:
            TypeError: If ``action`` is not an integer.
            ValueError: If ``action`` is outside 0..3.
        """
        # bool is a subclass of int in Python; keep rejecting it explicitly so
        # that True/False are never silently interpreted as actions 1/0.
        if isinstance(action, bool) or not isinstance(action, (int, np.integer)):
            raise TypeError(f"action should be an integer, received {type(action).__name__}")

        moves = {
            0: self.move_right,
            1: self.move_left,
            2: self.move_up,
            3: self.move_down,
        }
        move = moves.get(int(action))
        if move is None:
            raise ValueError(f"action should be an integer between 0 and 3, received {action}")
        move()

        reward = -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def move_right(self):
        """Increase ``y`` by one, stopping at the last column."""
        self.y = min(self.y + 1, self.size - 1)

    def move_left(self):
        """Decrease ``y`` by one, stopping at column 0."""
        self.y = max(self.y - 1, 0)

    def move_up(self):
        """Decrease ``x`` by one, stopping at row 0."""
        self.x = max(self.x - 1, 0)

    def move_down(self):
        """Increase ``x`` by one, stopping at the last row."""
        self.x = min(self.x + 1, self.size - 1)

    def is_done(self):
        """Return True when the walker is on the cell ``(size - 1, size - 1)``."""
        goal = self.size - 1
        return self.x == goal and self.y == goal

    def reset(self):
        """Put the walker back on (0, 0) and return that position."""
        self.x = 0
        self.y = 0
        return (self.x, self.y)

In [21]:
class Agent():
    """Agent that picks actions uniformly at random and learns a 4 x 4 value table.

    Two update rules are supported, selected by ``update``:

    * ``"Monte-Carlo"``: the table is updated from a whole episode history.
    * ``"Temporal-Difference"``: the table is updated from a single transition.

    Attributes are ``update`` (the rule name), ``v_table`` (4 x 4 array of
    value estimates, initially zero), ``gamma`` (discount factor) and
    ``alpha`` (step size).
    """

    def __init__(self, update="Monte-Carlo"):
        """Create the agent with a zero value table.

        Args:
            update: Either ``"Monte-Carlo"`` or ``"Temporal-Difference"``.

        Raises:
            ValueError: If ``update`` is neither of the two supported names.
        """
        if update not in ["Monte-Carlo", "Temporal-Difference"]:
            raise ValueError(
                f'update should be "Monte-Carlo" or "Temporal-Difference", received {update!r}'
            )
        self.update = update
        self.v_table = np.zeros((4, 4))
        self.gamma = 1.
        # The per-episode Monte-Carlo update uses a much smaller step size
        # than the per-step Temporal-Difference update.
        self.alpha = 0.0001 if update == "Monte-Carlo" else 0.01

    def select_action(self):
        """Return a uniformly random integer action in 0..3."""
        return random.randint(0, 3)

    def update_table(self, update_info):
        """Update ``v_table`` according to the configured rule.

        Args:
            update_info: For ``"Monte-Carlo"``, a sequence of
                ``(x, y, reward)`` entries in the order they occurred. Each
                cell ``(x, y)`` is moved toward the discounted sum of the
                rewards of the entries that come *after* it.
                For ``"Temporal-Difference"``, one
                ``(state, action, reward, next_state)`` tuple where both
                states are ``(x, y)`` pairs; ``action`` is not used.

        Raises:
            ValueError: If ``self.update`` is not a supported rule name.
        """
        if self.update == "Monte-Carlo":
            self._update_monte_carlo(update_info)
        elif self.update == "Temporal-Difference":
            self._update_temporal_difference(update_info)
        else:
            raise ValueError(f"unknown update rule {self.update!r}")

    def _update_monte_carlo(self, history):
        """Walk the episode backwards, moving each visited cell toward its return."""
        cum_reward = 0.
        for x, y, reward in reversed(history):
            self.v_table[x, y] += self.alpha*(cum_reward - self.v_table[x, y])
            # Discounted return recursion G <- r + gamma * G. Adding
            # gamma * r to G instead is only equivalent when gamma == 1.
            cum_reward = reward + self.gamma*cum_reward

    def _update_temporal_difference(self, transition):
        """Move the value of ``state`` toward ``reward + gamma * V(next_state)``."""
        state, _action, reward, next_state = transition
        x, y = state
        next_x, next_y = next_state
        td_target = reward + self.gamma*self.v_table[next_x, next_y]
        self.v_table[x, y] += self.alpha*(td_target - self.v_table[x, y])

    def show_table(self):
        """Print the value table one row per line."""
        for row in self.v_table:
            print(row)

In [22]:
# Using Monte Carlo: the value table is updated once per finished episode.
env = GridWorld()
agent = Agent(update="Monte-Carlo")

for _ in range(50000):
    # Reset at the start of every episode so the starting cell never depends
    # on how the previous episode (or a previous run of this cell) ended.
    env.reset()
    done = False
    history = []
    while not done:
        action = agent.select_action()
        (x, y), reward, done = env.step(action)
        # (x, y) is the cell reached by this step, reward the reward it earned.
        history.append((x, y, reward))

    agent.update_table(history)

agent.show_table()

[-60.00127336 -57.84612229 -54.4565365  -52.06730038]
[-58.06604955 -55.10799589 -49.65119574 -45.0224462 ]
[-54.29151834 -49.73301344 -40.95807082 -29.61830632]
[-51.84281682 -45.1505227  -30.08245869   0.        ]


In [23]:
# Using Temporal Difference: the value table is updated after every single step.
env = GridWorld()
agent = Agent(update="Temporal-Difference")

for _ in range(50000):
    done = False
    # One reset per episode is enough; it also provides the starting state.
    state = env.reset()
    while not done:
        action = agent.select_action()
        next_state, reward, done = env.step(action)
        transition = (state, action, reward, next_state)
        agent.update_table(transition)
        state = next_state

agent.show_table()

[-59.13007012 -57.04200392 -54.13116821 -51.93487473]
[-57.2184961  -54.6506139  -49.56895552 -45.50343726]
[-53.72962442 -49.17296891 -40.98221112 -31.93714304]
[-51.94851658 -45.26491244 -29.44788658   0.        ]
