**[MQS-01]**

Import modules.

In [1]:
import numpy as np
import copy
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.size'] = 12

**[MQS-02]**

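Define a function that returns the maze as a list of rows, where each row is a list of single-character cells (`S`: start, `G`: goal, `#`: wall, `-`: penalty cell that sends the agent back to the start).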

In [2]:
def get_maze():
  """Return the maze as a list of rows, each row a list of one-character cells.

  The layout is written as a multi-line string; blank lines (including the
  ones produced by the leading and trailing newline of the literal) are
  dropped. A fresh nested list is built on every call.
  """
  maze_img = '''
#----------#
#S        G#
#   ####   #
#   ####   #
#          #
#          #
#          #
############
'''
  # One inner list per non-empty text row, one element per character.
  return [list(row) for row in maze_img.split('\n') if row != '']

**[MQS-03]**

Define the Agent class to walk through the maze.

In [3]:
class Agent:
  """Tabular agent that walks a character-grid maze.

  States are (x, y) tuples covering every cell of the grid; actions are
  (dx, dy) steps. The agent holds a deterministic policy (state -> action)
  and an action-value table q keyed by (state, action).
  """

  def __init__(self, maze):
    """Store the maze and build the state list, a random policy and a zero q-table.

    Args:
      maze: list of rows, each row a sequence of one-character cells.
    """
    self.maze = maze
    height = len(maze)
    width = len(maze[0])
    # x varies slowest, y fastest.
    self.states = [(col, row) for col in range(width) for row in range(height)]
    self.actions = [(0, -1), (-1, 0), (1, 0), (0, 1)]

    # One random draw per state, in the order of self.states.
    self.policy = {
        state: self.actions[np.random.randint(len(self.actions))]
        for state in self.states
    }

    # Every (state, action) pair starts with an action value of 0.
    self.q = {
        (state, action): 0
        for state in self.states
        for action in self.actions
    }

  def move(self, s, a):
    """Apply action a in state s and return (reward, next_state).

    - On a 'G' cell nothing happens: reward 0 and the same state.
    - A step into a '#' cell is blocked; the agent stays where it is.
    - Ending up on a '-' cell gives -100 and the next state is (1, 1).
    - Every other step gives -1.
    """
    col, row = s
    step_x, step_y = a

    # The goal is absorbing.
    if self.maze[row][col] == 'G':
      return 0, s

    # Only leave the current cell when the target is not a wall.
    if self.maze[row + step_y][col + step_x] != '#':
      col, row = col + step_x, row + step_y

    # Penalty cell: large negative reward and jump back to (1, 1).
    if self.maze[row][col] == '-':
      return -100, (1, 1)

    return -1, (col, row)

**[MQS-04]**

Define a function to update the action policy for a specific state.

In [4]:
def policy_update(agent, s):
  """Make the policy greedy at state s with respect to the current q-table.

  Sets agent.policy[s] to the action with the largest agent.q[(s, action)].
  When several actions share the largest value, the one that appears first
  in agent.actions is chosen.

  Args:
    agent: object exposing `actions` (sequence), `q` (dict keyed by
      (state, action)) and `policy` (dict keyed by state).
    s: the state whose policy entry is refreshed.
  """
  # max() keeps the first maximal element, and unlike a fixed numeric
  # sentinel it still picks an action when all values are extremely negative.
  # default=None keeps the "no actions -> None" outcome instead of raising.
  agent.policy[s] = max(
      agent.actions,
      key=lambda a: agent.q[(s, a)],
      default=None,
  )

**[MQS-05]**

Define a function to apply the Q-Learning algorithm for a single episode.

In [5]:
def get_episode_ql(agent, epsilon, train, alpha=0.2, gamma=1.0):
  """Run one episode from (1, 1) until a 'G' cell is reached, optionally learning.

  At every step the action is a uniformly random one with probability
  `epsilon`, otherwise agent.policy[s]. When `train` is true, each step
  applies the Q-learning update

      Q(s, a) += alpha * (r + gamma * max_b Q(s_new, b) - Q(s, a))

  and then refreshes the policy at s via policy_update.

  Args:
    agent: object exposing `actions`, `policy`, `q`, `maze` and `move(s, a)`.
    epsilon: exploration probability; 0 means the policy is always followed.
    train: whether to update agent.q and agent.policy along the way.
    alpha: step size of the update (default 0.2).
    gamma: discount applied to the next state's value (default 1.0).

  Returns:
    List of (state, action, reward) tuples, one per step taken.
  """
  episode = []
  s = (1, 1)  # Start
  while True:
    if np.random.random() < epsilon:
      a = agent.actions[np.random.randint(len(agent.actions))]
    else:
      a = agent.policy[s]

    r, s_new = agent.move(s, a)
    episode.append((s, a, r))

    if train:
      # Off-policy target: take the best value at s_new explicitly instead of
      # trusting agent.policy[s_new] to be greedy. The two agree whenever the
      # policy entry was produced by policy_update on the current q-table.
      q_next = max(agent.q[(s_new, b)] for b in agent.actions)
      agent.q[(s, a)] += alpha * (r + gamma * q_next - agent.q[(s, a)])
      policy_update(agent, s)

    x, y = s_new
    if agent.maze[y][x] == 'G':
      break
    s = s_new

  return episode

**[MQS-06]**

Define a function to apply the SARSA algorithm for a single episode.

In [6]:
def get_episode_salsa(agent, epsilon, train, alpha=0.2, gamma=1.0):
  """Run one SARSA episode from (1, 1) until a 'G' cell is reached.

  Actions are chosen epsilon-greedily: a uniformly random action with
  probability `epsilon`, otherwise agent.policy[state]. When `train` is
  true, each step applies the on-policy update

      Q(s, a) += alpha * (r + gamma * Q(s_new, a_new) - Q(s, a))

  where a_new is the action actually selected for the next step, and then
  refreshes the policy at s via policy_update.

  Args:
    agent: object exposing `actions`, `policy`, `q`, `maze` and `move(s, a)`.
    epsilon: exploration probability; 0 means the policy is always followed.
    train: whether to update agent.q and agent.policy along the way.
    alpha: step size of the update (default 0.2).
    gamma: discount applied to the next state-action value (default 1.0).

  Returns:
    List of (state, action, reward) tuples, one per step taken.
  """
  def choose_action(state):
    # Epsilon-greedy selection shared by the first and all following steps.
    if np.random.random() < epsilon:
      return agent.actions[np.random.randint(len(agent.actions))]
    return agent.policy[state]

  episode = []
  s = (1, 1)  # Start
  a = choose_action(s)

  while True:
    r, s_new = agent.move(s, a)
    episode.append((s, a, r))

    # The next action is selected before the policy at s is refreshed, and it
    # is also drawn on the final step so the update below always has a target.
    a_new = choose_action(s_new)

    if train:
      agent.q[(s, a)] += alpha * (r + gamma * agent.q[(s_new, a_new)] - agent.q[(s, a)])
      policy_update(agent, s)

    x, y = s_new
    if agent.maze[y][x] == 'G':
      break
    s, a = s_new, a_new

  return episode

**[MQS-07]**

Define a function to train the agent with the Q-Learning algorithm.

In [7]:
def train_ql(agent, epsilon, num):
  """Train the agent with `num` Q-learning episodes.

  Each episode is produced by get_episode_ql with train=True, so the agent
  is updated in place.

  Returns:
    List with the number of steps of every episode, in the order played.
  """
  return [
      len(get_episode_ql(agent, epsilon, train=True))
      for _ in range(num)
  ]

**[MQS-08]**

Define a function to train the agent with the SARSA algorithm.

In [8]:
def train_salsa(agent, epsilon, num):
  """Train the agent with `num` SARSA episodes.

  Each episode is produced by get_episode_salsa with train=True, so the
  agent is updated in place.

  Returns:
    List with the number of steps of every episode, in the order played.
  """
  return [
      len(get_episode_salsa(agent, epsilon, train=True))
      for _ in range(num)
  ]

**[MQS-09]**

Apply the Q-Learning algorithm to train the agent, and show the result.

In [9]:
# Train a fresh agent with Q-learning (10% exploration, 1000 episodes).
maze = get_maze()
agent = Agent(maze)
episode_lengths = train_ql(agent, epsilon=0.1, num=1000)

# Replay one episode without exploration or learning and mark every visited
# cell (including the start) with '+' on a copy of the maze.
episode = get_episode_ql(agent, epsilon=0, train=False)
result = np.array(agent.maze)
for (x, y), _action, _reward in episode:
  result[y, x] = '+'
print('\n'.join(''.join(row) for row in result))

#----------#
#+++++++++G#
#   ####   #
#   ####   #
#          #
#          #
#          #
############


**[MQS-10]**

Apply the SARSA algorithm to train the agent, and show the result.

In [10]:
# Train a fresh agent with SARSA (10% exploration, 1000 episodes).
maze = get_maze()
agent = Agent(maze)
episode_lengths = train_salsa(agent, epsilon=0.1, num=1000)

# Replay one episode without exploration or learning and mark every visited
# cell (including the start) with '+' on a copy of the maze.
episode = get_episode_salsa(agent, epsilon=0, train=False)
result = np.array(agent.maze)
for (x, y), _action, _reward in episode:
  result[y, x] = '+'
print('\n'.join(''.join(row) for row in result))

#----------#
#+        G#
#+  ####  +#
#+  ####+++#
#++++++++  #
#          #
#          #
############
