<a href="https://colab.research.google.com/github/enakai00/rl_book_solutions/blob/master/Chapter05/Monte_Carlo_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from numpy import random
import copy

In [0]:
# ASCII drawing of the track; get_track() turns it into a 2-D character array.
#   '#' : wall - Agent.move() undoes any step that lands on it
#   'G' : goal - Agent.move() returns True when the agent steps onto it
#   'S' : labels the start cell; Agent hard-codes its start at x=1 on the
#         second-to-last row and never reads this character itself
#   ' ' : free cell
track_img = '''
############
#          #
#          #
#   #      #
#   #      #
#   # ###  #
#   #   #  #
#S  #   #  #
#######G#GG#
'''

In [0]:
def get_track(track_img):
  """Convert a multi-line ASCII drawing into a 2-D array of single characters.

  Empty lines are dropped. Every remaining line is right-padded with spaces
  to the length of the longest line so that all rows have the same width.

  Args:
    track_img: the drawing as one string with rows separated by newlines.

  Returns:
    A numpy array with one row per non-empty line and one character per cell.
  """
  rows = track_img.split('\n')
  width = max(len(row) for row in rows)
  # ljust pads short rows with spaces; no row is longer than `width`.
  grid = [list(row.ljust(width)) for row in rows if row]
  return np.array(grid)

In [0]:
class Agent:
  """A walker on the track grid that records every step it attempts.

  Attributes:
    track: the grid, indexed as track[y][x].
    actions: list of (dx, dy) displacements; an action is an index into it.
    path: list of (x, y, action) tuples, one per call to move(), holding the
      position *before* the step was attempted.
    x, y: current position. Starts at column 1 of the second-to-last row.
  """

  def __init__(self, track):
    self.track = track
    self.actions = [(0,-1), (-1, 0), (1, 0), (0, 1)]
    self.path = []
    self.x, self.y = 1, len(self.track) - 2

  def get_state(self):
    """Return the current position as an (x, y) tuple."""
    return (self.x, self.y)

  def get_result(self):
    """Return a copy of the track with '+' drawn on every cell stored in path."""
    marked = np.copy(self.track)
    for col, row, _ in self.path:
      marked[row][col] = '+'
    return marked

  def move(self, a):
    """Attempt the step for action index `a`.

    The attempt is always appended to path. Stepping onto 'G' ends the
    episode (returns True, agent stays on the goal). Stepping onto '#'
    leaves the agent where it was. Any other cell is simply entered.

    Returns:
      True if the goal was reached, otherwise False.
    """
    prev_x, prev_y = self.x, self.y
    self.path.append((prev_x, prev_y, a))
    step_x, step_y = self.actions[a]
    self.x, self.y = prev_x + step_x, prev_y + step_y
    cell = self.track[self.y][self.x]
    if cell == 'G':
      return True
    if cell == '#':
      # Bumped into a wall: restore the position held before the step.
      self.x, self.y = prev_x, prev_y
    return False

In [0]:
def trial(agent, policy, epsilon=0.2):
  """Drive `agent` with an epsilon-greedy version of `policy` for up to 10000 steps.

  Args:
    agent: object providing get_state(), move(action) and an `actions` list.
    policy: dict mapping "xx,yy" state keys to action indices. A state that
      has no entry yet gets a uniformly random action stored for it, so the
      dict is modified in place.
    epsilon: probability of replacing the policy's action by a uniformly
      random one (which may coincide with the policy's action).

  Returns:
    True as soon as agent.move() reports the goal, False if the step limit is
    exhausted first.
  """
  steps_left = 10000
  while steps_left > 0:
    steps_left -= 1
    col, row = agent.get_state()
    key = "{:02},{:02}".format(col, row)
    if key not in policy:
      policy[key] = random.randint(0, len(agent.actions))

    # Draw the exploration coin first, then (only if needed) the random action.
    explore = random.random() < epsilon
    chosen = random.randint(0, len(agent.actions)) if explore else policy[key]

    if agent.move(chosen):
      return True

  return False

In [0]:
def optimal_action(q, x, y, num_actions=4):
  """Return the index of the action with the highest value in state (x, y).

  Args:
    q: dict of action values keyed by "xx,yy:aa". Any action of this state
      that has no entry yet is inserted with the sentinel value -10**10, so
      the dict is modified in place.
    x, y: position identifying the state.
    num_actions: number of actions to consider. The default of 4 matches the
      length of Agent.actions, so the result is always a valid action index
      and no entries for non-existent actions are written into q.

  Returns:
    The greedy action index; ties go to the lowest index. Returns 0 when
    num_actions is 0.
  """
  best_action = 0
  best_value = None
  for a in range(num_actions):
    sa = "{:02},{:02}:{:02}".format(x, y, a)
    # Unseen actions get a very low sentinel so any observed return beats them.
    value = q.setdefault(sa, -10**10)
    if best_value is None or value > best_value:
      best_value = value
      best_action = a
  return best_action

def run_sampling(policy_t, track, num=100000, epsilon=0.1, show_path=False):
  """Improve policy_t by off-policy Monte Carlo control with weighted importance sampling.

  Each episode is generated by trial() with an epsilon-greedy behavior policy
  built around policy_b, then processed backwards from the goal to update the
  action values and to make policy_t greedy with respect to them. After every
  episode the behavior policy is replaced by a copy of policy_t.

  Args:
    policy_t: dict mapping "xx,yy" state keys to action indices (the target
      policy); updated in place.
    track: grid handed to Agent for every episode.
    num: number of episodes to run.
    epsilon: exploration rate of the behavior policy.
    show_path: when true, print the track after each episode with '+' on the
      cells of the steps that the backward pass processed.

  Returns:
    None. The action-value table and the weight sums are local to one call,
    so only policy_t carries over between calls.
  """
  policy_b = {}
  q = {}  # action-value estimates, keyed by "xx,yy:aa"
  c = {}  # cumulative importance weights, same keys as q

  for _ in range(num):
    agent = Agent(track)
    # trial() gives up after its step limit; keep driving the same agent
    # (its path keeps growing) until the goal is reached.
    while not trial(agent, policy_b, epsilon):
      pass

    n_actions = len(agent.actions)
    # The annotated copy is only needed when it is going to be printed.
    result = np.copy(agent.track) if show_path else None
    g = 0
    w = 1
    # Walk the episode backwards, from the last step to the first.
    for x, y, a in reversed(agent.path):
      if show_path:
        result[y][x] = '+'
      state = "{:02},{:02}".format(x, y)
      sa = "{:02},{:02}:{:02}".format(x, y, a)

      g -= 1 # Reward = -1 for each step
      if sa not in c:
        c[sa] = 0
      c[sa] += w
      if sa not in q:
        q[sa] = 0
      q[sa] += w*(g-q[sa])/c[sa]
      policy_t[state] = optimal_action(q, x, y)

      # Earlier steps contribute nothing once the episode deviates from the
      # (just updated) target policy.
      if policy_t[state] != a:
        break

      # Importance weight: divide by b(a|s), the probability with which the
      # behavior policy that generated this episode picked `a`.
      #   a is policy_b's action : (1 - epsilon) + epsilon / n_actions
      #   a was exploratory      : epsilon / n_actions
      # policy_t[state] may have changed in this very pass, so matching the
      # target policy does not imply that `a` was policy_b's greedy action.
      # With epsilon == 0 trial() never explores, so the second branch (which
      # would divide by zero) is not reached.
      if policy_b.get(state) == a:
        b_prob = 1 - epsilon + epsilon / n_actions
      else:
        b_prob = epsilon / n_actions
      w = w / b_prob

    if show_path:
      for line in result:
        print (''.join(line))
      print ()

    policy_b = copy.copy(policy_t) # Update the behavior policy

In [7]:
# Build the grid and train for 20 episodes starting from an empty target policy.
# With show_path=True the track is printed after every episode, with '+' on the
# cells of the steps that the backward pass processed.
track = get_track(track_img)
policy_t = {}
run_sampling(policy_t, track, num=20, epsilon=0.2, show_path=True)

############
#          #
#          #
#   #      #
#   #      #
#   # ###  #
#   #   #  #
#S  #+++#  #
#######G#GG#

############
#          #
#          #
#   #      #
#   #      #
#   # ###  #
#   #  +#  #
#S  #  +#  #
#######G#GG#

############
#          #
#          #
#   #      #
#   #      #
#   # ###  #
#   #   #  #
#S  #   #+ #
#######G#GG#

############
#          #
#          #
#   #      #
#   #      #
#   # ###++#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #   +  #
#   #   +++#
#   # ### +#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #      #
#   #     +#
#   # ### +#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #   +  #
#   #   +++#
#   # ### +#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #      #
#   #  ++++#
#   # ### +#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #      #
#   

In [8]:
# Run 20 more episodes on the same policy_t dict. run_sampling() creates fresh
# q and c tables on every call, so only the policy carries over.
run_sampling(policy_t, track, num=20, epsilon=0.2, show_path=True)

############
#          #
#          #
#   #      #
#   #      #
#   # ###  #
#   #   #  #
#S  #   # +#
#######G#GG#

############
#   ++     #
#    ++++  #
#   # +++  #
#   #   ++ #
#   # ###++#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #      #
#   #      #
#   # ###  #
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #      #
#   #      #
#   # ###++#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #      #
#   #      #
#   # ###  #
#   #   #  #
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #   +  #
#   #   ++ #
#   # ###++#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#    ++    #
#   #+++   #
#   #  +++ #
#   # ###++#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#  ++++    #
#  +# ++   #
#   #  +++ #
#   # ###++#
#   #   # +#
#S  #   # +#
#######G#GG#

############
#          #
#          #
#   #      #
#   

In [9]:
# Train for 10000 more episodes without printing, then follow the learned
# policy with no exploration (epsilon=0) from the start and draw every cell
# the agent stepped from.
run_sampling(policy_t, track, num=10000, epsilon=0.2, show_path=False)
agent = Agent(track)
# The return value (goal reached or step limit hit) is ignored here; trial()
# may also store a random action in policy_t for any state it has no entry for.
trial(agent, policy_t, 0)
result = agent.get_result()
for line in result:
  print (''.join(line))

############
#          #
#  +++     #
#  +#+     #
#+++#+     #
#+  #+###  #
#+  #+++#  #
#+  #  +#  #
#######G#GG#
