In [1]:
import random
from itertools import product
import numpy as np

In [2]:
class GridWorld():
    """A 5 x 7 grid world with two walls, a fixed start and a fixed goal.

    Layout (``S`` = start, ``E`` = goal, ``1`` = wall, ``0`` = free cell)::

        0 0 1 0 0 0 0
        0 0 1 0 0 0 0
        S 0 1 0 1 0 0
        0 0 0 0 1 0 0
        0 0 0 0 1 0 E

    The agent position is stored as ``self.x`` (row index, 0 at the top) and
    ``self.y`` (column index, 0 on the left). Every step yields a reward of
    -1, and the episode is done once the agent stands on the goal cell.
    """

    N_ROWS = 5
    N_COLS = 7
    START = (2, 0)
    GOAL = (4, 6)
    # (row, column) coordinates of the cells marked ``1`` in the layout above.
    WALLS = frozenset({(0, 2), (1, 2), (2, 2), (2, 4), (3, 4), (4, 4)})

    def __init__(self):
        self.x, self.y = self.START

    def step(self, action):
        """Apply one action and return ``((x, y), reward, done)``.

        Args:
            action: Integer action id. 0 = left, 1 = right, 2 = up, 3 = down.
                Python ints and NumPy integer scalars are both accepted.

        Returns:
            A tuple ``(state, reward, done)`` where ``state`` is the new
            ``(x, y)`` position, ``reward`` is always -1 and ``done`` tells
            whether the goal cell has been reached.

        Raises:
            TypeError: If ``action`` is not an integer (bools are rejected).
            ValueError: If ``action`` is an integer outside 0..3.
        """
        # bool is a subclass of int but is never a meaningful action id.
        if isinstance(action, bool) or not isinstance(action, (int, np.integer)):
            raise TypeError(
                f"action must be an integer, got {type(action).__name__}"
            )
        action = int(action)

        # Index in this tuple == action id.
        moves = (self.move_left, self.move_right, self.move_up, self.move_down)
        if not 0 <= action < len(moves):
            raise ValueError(f"action must be one of 0, 1, 2, 3, got {action}")
        moves[action]()

        reward = -1
        done = self.is_done()
        return (self.x, self.y), reward, done

    def _move(self, d_row, d_col):
        """Shift the agent by the given offset unless a border or wall blocks it."""
        row, col = self.x + d_row, self.y + d_col
        inside = 0 <= row < self.N_ROWS and 0 <= col < self.N_COLS
        if inside and (row, col) not in self.WALLS:
            self.x, self.y = row, col

    def move_left(self):
        """Move one column to the left if that cell is free."""
        self._move(0, -1)

    def move_right(self):
        """Move one column to the right if that cell is free."""
        self._move(0, 1)

    def move_up(self):
        """Move one row up if that cell is free."""
        self._move(-1, 0)

    def move_down(self):
        """Move one row down if that cell is free."""
        self._move(1, 0)

    def is_done(self):
        """Return True when the agent is on the goal cell."""
        return (self.x, self.y) == self.GOAL

    def reset(self):
        """Put the agent back on the start cell and return its ``(x, y)``."""
        self.x, self.y = self.START
        return (self.x, self.y)

In [22]:
class QAgent():
    """Tabular epsilon-greedy agent for the 5 x 7 grid world.

    The Q-table has shape ``(5, 7, 4)``: row, column, action, with action ids
    0 = left, 1 = right, 2 = up, 3 = down. No discounting is applied.

    Args:
        update: Learning rule, one of ``"Monte-Carlo"``, ``"SARSA"`` or
            ``"Q-learning"``.

    Raises:
        ValueError: If ``update`` is not one of the supported rules.
    """

    def __init__(self, update="Q-learning"):
        if update not in ["Monte-Carlo", "SARSA", "Q-learning"]:
            raise ValueError(
                f'update must be "Monte-Carlo", "SARSA" or "Q-learning", got {update!r}'
            )
        self.update = update
        self.q_table = np.zeros((5, 7, 4))
        self.epsilon = 0.9
        # Monte-Carlo uses a smaller step size than the two TD rules.
        if update == "Monte-Carlo":
            self.alpha = 0.01
        else:
            self.alpha = 0.1

    def select_action(self, state):
        """Pick an action for ``state = (x, y)`` epsilon-greedily.

        With probability ``self.epsilon`` a uniformly random action is
        returned, otherwise the action with the highest Q-value.
        """
        x, y = state
        if random.random() < self.epsilon:
            action = random.randint(0, 3)
        else:
            action_val = self.q_table[x, y, :]
            # Cast so the result is a plain Python int, not a NumPy scalar.
            action = int(np.argmax(action_val))
        return action

    def update_table(self, update_info):
        """Update the Q-table according to the configured learning rule.

        Args:
            update_info:
                * ``"Monte-Carlo"``: the episode history, a sequence of
                  ``(state, action, reward, next_state)`` tuples in the order
                  they occurred.
                * ``"SARSA"`` / ``"Q-learning"``: a single
                  ``(state, action, reward, next_state)`` transition. For
                  SARSA an optional fifth element ``next_action`` may be
                  appended; when given it is used for the bootstrap target
                  instead of sampling a fresh action, which lets the caller
                  update towards the action it will actually execute.

        Raises:
            ValueError: If ``self.update`` holds an unsupported rule.
        """
        if self.update == "Monte-Carlo":
            history = update_info
            cum_reward = 0
            for state, action, reward, next_state in reversed(history):
                # The return of a step includes that step's own reward, so
                # accumulate before applying the update.
                cum_reward += reward
                x, y = state
                self.q_table[x, y, action] += self.alpha * (
                    cum_reward - self.q_table[x, y, action]
                )

        elif self.update == "SARSA":
            state, action, reward, next_state, *extra = update_info
            x, y = state
            next_x, next_y = next_state
            # Fall back to sampling when the caller did not supply the action.
            next_action = extra[0] if extra else self.select_action(next_state)
            target = reward + self.q_table[next_x, next_y, next_action]
            self.q_table[x, y, action] += self.alpha * (
                target - self.q_table[x, y, action]
            )

        elif self.update == "Q-learning":
            state, action, reward, next_state = update_info[:4]
            x, y = state
            next_x, next_y = next_state
            target = reward + np.amax(self.q_table[next_x, next_y, :])
            self.q_table[x, y, action] += self.alpha * (
                target - self.q_table[x, y, action]
            )

        else:
            raise ValueError(f"unsupported update rule: {self.update!r}")

    def anneal_epsilon(self):
        """Decrease the exploration rate by one step, down to a floor.

        Monte-Carlo and SARSA decay by 0.03 down to 0.1; Q-learning decays by
        0.01 down to 0.2.
        """
        if self.update in ["Monte-Carlo", "SARSA"]:
            self.epsilon -= 0.03
            self.epsilon = max(self.epsilon, 0.1)
        else:       # Q-learning
            self.epsilon -= 0.01
            self.epsilon = max(self.epsilon, 0.2)

    def show_table(self):
        """Print the greedy action of every cell as an arrow grid.

        Cells in rows 0-2 of column 2 and rows 2-4 of column 4 are printed as
        ``x``, cell (4, 6) as ``E``, and row 2 is prefixed with ``start ->``.
        """
        data = np.argmax(self.q_table, axis=2).tolist()
        action_dict = {0: "<", 1: ">", 2: "^", 3: "v"}
        for row, column in product(range(5), range(7)):
            if row == 4 and column == 6:
                data[row][column] = "E"
            elif row in [0, 1, 2] and column == 2:
                data[row][column] = "x"
            elif row in [2, 3, 4] and column == 4:
                data[row][column] = "x"
            else:
                data[row][column] = action_dict[data[row][column]]

        for row, values in enumerate(data):
            if row == 2:
                print(f"start -> {values}")
            else:
                print(f"         {values}")

In [23]:
# Monte-Carlo: the Q-table is updated once per episode from the full history.
env = GridWorld()
agent = QAgent(update="Monte-Carlo")

for n_epi in range(10000):
    history = []
    state = env.reset()

    # Roll out one complete episode, recording every transition.
    while True:
        action = agent.select_action(state)
        next_state, reward, done = env.step(action)
        history.append((state, action, reward, next_state))
        state = next_state
        if done:
            break

    # Learn from the finished episode, then reduce exploration.
    agent.update_table(history)
    agent.anneal_epsilon()

agent.show_table()

         ['v', 'v', 'x', 'v', '<', '<', 'v']
         ['v', '<', 'x', '>', '>', 'v', 'v']
start -> ['v', 'v', 'x', '^', 'x', 'v', 'v']
         ['>', '>', '>', '^', 'x', '>', 'v']
         ['>', '^', '>', '^', 'x', '>', 'E']


In [24]:
# SARSA: the Q-table is updated after every single environment step.
env = GridWorld()
agent = QAgent(update="SARSA")

for n_epi in range(1000):
    state = env.reset()

    while True:
        action = agent.select_action(state)
        next_state, reward, done = env.step(action)
        # NOTE(review): update_table samples its own next action for the
        # SARSA target; it is not necessarily the action selected at the top
        # of the next loop iteration.
        agent.update_table((state, action, reward, next_state))
        state = next_state
        if done:
            break

    # Reduce exploration once per episode.
    agent.anneal_epsilon()

agent.show_table()

         ['^', '<', 'x', '>', '^', '>', 'v']
         ['^', '<', 'x', '>', '>', '>', 'v']
start -> ['>', 'v', 'x', '^', 'x', '>', 'v']
         ['>', '>', '>', '^', 'x', '>', 'v']
         ['v', '>', 'v', '^', 'x', '>', 'E']


In [25]:
# Q-learning: the Q-table is updated after every single environment step.
env = GridWorld()
agent = QAgent(update="Q-learning")

for n_epi in range(1000):
    state = env.reset()

    while True:
        action = agent.select_action(state)
        next_state, reward, done = env.step(action)
        agent.update_table((state, action, reward, next_state))
        state = next_state
        if done:
            break

    # Reduce exploration once per episode.
    agent.anneal_epsilon()

agent.show_table()

         ['>', 'v', 'x', 'v', '>', '>', 'v']
         ['v', 'v', 'x', '>', '>', '>', 'v']
start -> ['>', 'v', 'x', '^', 'x', 'v', 'v']
         ['>', '>', '>', '^', 'x', 'v', 'v']
         ['<', '^', '>', '^', 'x', '>', 'E']
