In [1]:
import numpy as np
from typing import List
from tqdm import tqdm

In [2]:
class LineWorld:
  def __init__(self):
    self.agent_pos = 2

  # Uniquement pour le MonteCarloES
  def from_random_state() -> 'LineWorld':
    env = LineWorld()
    env.agent_pos = np.random.randint(1, 4)
    return env

  def available_actions(self) -> List[int]:
    if self.agent_pos in [1, 2, 3]:
      return [0, 1] # 0: left, 1: right
    return []

  def is_game_over(self) -> bool:
    return True if self.agent_pos in [0, 4] else False

  def state_id(self) -> int:
    return self.agent_pos

  def step(self, action: int):
    assert(not self.is_game_over())
    assert(action in self.available_actions())

    if action == 0:
      self.agent_pos -= 1
    else:
      self.agent_pos += 1

  def score(self) -> float:
    if self.agent_pos == 0:
      return -1.0
    if self.agent_pos == 4:
      return 1.0
    return 0.0

  def display(self):
    for i in range(5):
      print('X' if self.agent_pos == i else '_', end='')
    print()

  def reset(self):
    self.agent_pos = 2

In [3]:
env = LineWorld()

In [4]:
env.display()

__X__


In [5]:
env.available_actions()

[0, 1]

In [6]:
env.step(0)
env.display()

_X___


In [7]:
env.step(0)
env.display()

X____


In [8]:
env.score()

-1.0

In [9]:
env.reset()
env.display()

__X__


In [10]:
env.step(1)
env.display()

___X_


In [11]:
env.step(1)
env.display()

____X


In [12]:
env.score()

1.0

In [13]:
for _ in range(10):
  env = LineWorld.from_random_state()
  env.display()

___X_
_X___
___X_
_X___
___X_
__X__
___X_
_X___
_X___
__X__


In [59]:
# Monte Carlo ES
def naive_monte_carlo_with_exploring_starts(env_type,
                                            gamma: float = 0.999,
                                            nb_iter: int = 10000,
                                            max_steps: int = 10):
  Pi = {}
  Q = {}
  Returns = {}

  for it in tqdm(range(nb_iter)):
    env = env_type.from_random_state()

    is_first_action = True
    trajectory = []
    steps_count = 0
    while not env.is_game_over() and steps_count < max_steps:
      s = env.state_id()
      aa = env.available_actions()

      if s not in Pi:
        Pi[s] = np.random.choice(aa)

      if is_first_action:
        a = np.random.choice(aa)
        is_first_action = False
      else:
        a = Pi[s]

      prev_score = env.score()
      env.step(a)
      r = env.score() - prev_score

      trajectory.append((s, a, r, aa))
      steps_count += 1

    G = 0
    print(trajectory)
    for (t, (s, a, r, aa)) in reversed(list(enumerate(trajectory))):
      G = gamma * G + r
      print(t)
      print((s, a, r, aa))
      if all(map(lambda triplet: triplet[0] != s or triplet[1] != a, trajectory[:t])):
        if (s, a) not in Returns:
          Returns[(s, a)] = []
        Returns[(s, a)].append(G)
        print(f"action = {a}, state = {s}")
        print(f"goal = {G}")
        print(Returns)
        Q[(s, a)] = np.mean(Returns[(s, a)])

        best_a = None
        best_a_score = 0.0
        for a in aa:
          if (s, a) not in Q:
            Q[(s, a)] = np.random.random()
          if best_a is None or Q[(s, a)] > best_a_score:
            best_a = a
            best_a_score = Q[(s, a)]

        Pi[s] = best_a
    # print(Pi)
    # print(Q)
    # print(Returns)
  return Pi

In [60]:
naive_monte_carlo_with_exploring_starts(LineWorld, nb_iter=10)

100%|██████████| 10/10 [00:00<00:00, 6616.67it/s]

[(2, 1, 0.0, [0, 1]), (3, 1, 1.0, [0, 1])]
1
(3, 1, 1.0, [0, 1])
action = 1, state = 3
goal = 1.0
{(3, 1): [1.0]}
0
(2, 1, 0.0, [0, 1])
action = 1, state = 2
goal = 0.999
{(3, 1): [1.0], (2, 1): [0.999]}
[(1, 1, 0.0, [0, 1]), (2, 1, 0.0, [0, 1]), (3, 1, 1.0, [0, 1])]
2
(3, 1, 1.0, [0, 1])
action = 1, state = 3
goal = 1.0
{(3, 1): [1.0, 1.0], (2, 1): [0.999]}
1
(2, 1, 0.0, [0, 1])
action = 1, state = 2
goal = 0.999
{(3, 1): [1.0, 1.0], (2, 1): [0.999, 0.999]}
0
(1, 1, 0.0, [0, 1])
action = 1, state = 1
goal = 0.998001
{(3, 1): [1.0, 1.0], (2, 1): [0.999, 0.999], (1, 1): [0.998001]}
[(1, 1, 0.0, [0, 1]), (2, 1, 0.0, [0, 1]), (3, 1, 1.0, [0, 1])]
2
(3, 1, 1.0, [0, 1])
action = 1, state = 3
goal = 1.0
{(3, 1): [1.0, 1.0, 1.0], (2, 1): [0.999, 0.999], (1, 1): [0.998001]}
1
(2, 1, 0.0, [0, 1])
action = 1, state = 2
goal = 0.999
{(3, 1): [1.0, 1.0, 1.0], (2, 1): [0.999, 0.999, 0.999], (1, 1): [0.998001]}
0
(1, 1, 0.0, [0, 1])
action = 1, state = 1
goal = 0.998001
{(3, 1): [1.0, 1.0, 1.0], (2,




{2: 1, 3: 1, 1: 1}