# Monte Carlo
In Dynamic Programming algorithms we had a full model of the problem, but usually it's not the case in real life problems. In DP we didn't play the game.<br>
In monte carlo the agent will learn purely from experience. We perform the update at the end of the episode, so it's not fully online.<br>
In monte carlo we play the game several times and we log the sequence of visited states and rewards. Then we compute the expected returns as the sum of the future rewards (eventually discounted). Finally, when we have several returns for each state, gathered from different runs of the game, we compute the sample mean of the returns to estimate the expected return in each state.<br>
If we visit the same state more than once, we ha two options:
- First-visit Monte Carlo: where we only consider the first time we see the state
- Every-visit Monte Carlo: where we consider the rewards of all the repeated visits in one state

It's proven that both methods brings to the same solution.<br>
We can also compute the sample mean from the previous mean, without the need to keep a log of all the past returns. For non-stationary problems we could also use a moving average.<br>
Monte Carlo is better than DP when the state space is too big to be completely explored, in fact we use only the states that have been visited by the agent during the simulations.

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import tqdm
from grid_world import negative_grid, standard_grid
plt.rcParams['figure.figsize'] = (15,7)

In [2]:
THRESHOLD = 10e-4
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

def print_values(V, g):
    for i in range(g.width):
        print('-----------------------')
        for j in range(g.height):
            v = V.get((i, j), 0)
            if v >= 0:
                print(' {0:.2f}'.format(v), end = ' ')
            else:
                print('{0:.2f}'.format(v), end = ' ')
        print()
    print('-----------------------')

def print_policy(P, g):
    for i in range(g.width):
        print('---------------')
        for j in range(g.height):
            p = P.get((i, j), ' ')
            print(' ' + p + ' ', end = ' ')
        print()
    print('---------------')

# Policy Evaluation - Monte Carlo
Exploring start method: we start from a random state in roder to try to visit all the states

In [3]:
def play_game(grid, policy):
    ## Exploring start method
    start_states = list(grid.actions.keys())
    start = start_states[np.random.choice(len(start_states))]
    grid.set_state(start)
    states_and_rewards = [(start, 0)]
    s = start
    while not grid.game_over():
        a = policy[s]
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    
    G = 0
    states_and_returns = []
    first = True
    for s, r in states_and_rewards[::-1]:
        if not first:
            states_and_returns.append((s, G))
        else:
            first = False
        G = r + GAMMA * G
    return states_and_returns

In [4]:
grid = standard_grid()

print('Rewards:')
print_values(grid.rewards, grid)

states = grid.all_states()
V = {s: 0 for s in states}
returns = {s: [] for s in states}
policy = {(2,0): 'U',
         (1,0): 'U',
         (0,0): 'R',
         (0,1): 'R',
         (0,2): 'R',
         (1,2): 'R',
         (2,1): 'R',
         (2,2): 'R',
         (2,3): 'U'}
print('Policy:')
print_policy(policy, grid)
grid.policy = policy
print()

for i in tqdm.tqdm(range(100)):
    states_and_returns = play_game(grid, policy)
    seen_states = set()
    for s, g in states_and_returns:
        if not s in seen_states:
            returns[s].append(g)
            V[s] = np.mean(returns[s])
        seen_states.add(s)

print('Value:')
print_values(V, grid)

Rewards:
-----------------------
 0.00  0.00  0.00  1.00 
-----------------------
 0.00  0.00  0.00 -1.00 
-----------------------
 0.00  0.00  0.00  0.00 
-----------------------
Policy:
---------------
 R   R   R      
---------------
 U       R      
---------------
 U   R   R   U  
---------------



100%|██████████████████████████████████████| 100/100 [00:00<00:00, 4163.95it/s]


Value:
-----------------------
 0.81  0.90  1.00  0.00 
-----------------------
 0.73  0.00 -1.00  0.00 
-----------------------
 0.66 -0.81 -0.90 -1.00 
-----------------------


# Windy Gridworld
Here the state transitions are not deterministic, so Monte Carlo can help. The policy will guide the agent to the winning state.

In [5]:
def windy_action(a):
    p = np.random.rand()
    if p < 0.5:
        return a
    else:
        l = list(ALL_POSSIBLE_ACTIONS)
        l.remove(a)
        return np.random.choice(l)
    
def play_game(grid, policy):
    ## Exploring start method
    start_states = list(grid.actions.keys())
    start = start_states[np.random.choice(len(start_states))]
    grid.set_state(start)
    states_and_rewards = [(start, 0)]
    s = start
    while not grid.game_over():
        a = windy_action(policy[s])
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    
    G = 0
    states_and_returns = []
    first = True
    for s, r in states_and_rewards[::-1]:
        if not first:
            states_and_returns.append((s, G))
        else:
            first = False
        G = r + GAMMA * G
    return states_and_returns


grid = standard_grid()
print('Rewards:')
print_values(grid.rewards, grid)

states = grid.all_states()
V = {s: 0 for s in states}
returns = {s: [] for s in states}
policy = {(2,0): 'U',
         (1,0): 'U',
         (0,0): 'R',
         (0,1): 'R',
         (0,2): 'R',
         (1,2): 'U',
         (2,1): 'R',
         (2,2): 'U',
         (2,3): 'L'}
print('Random policy:')
print_policy(policy, grid)
grid.policy = policy
print()

for i in tqdm.tqdm(range(5000)):
    states_and_returns = play_game(grid, policy)
    seen_states = set()
    for s, g in states_and_returns:
        if not s in seen_states:
            returns[s].append(g)
            V[s] = np.mean(returns[s])
        seen_states.add(s)

print('Value:')
print_values(V, grid)

Rewards:
-----------------------
 0.00  0.00  0.00  1.00 
-----------------------
 0.00  0.00  0.00 -1.00 
-----------------------
 0.00  0.00  0.00  0.00 
-----------------------
Random policy:
---------------
 R   R   R      
---------------
 U       U      
---------------
 U   R   U   L  
---------------



100%|████████████████████████████████████| 5000/5000 [00:02<00:00, 2177.80it/s]


Value:
-----------------------
 0.58  0.73  0.86  0.00 
-----------------------
 0.43  0.00  0.25  0.00 
-----------------------
 0.29  0.15  0.13 -0.19 
-----------------------


# Control problem - Find the optimal policy
When we are in a state s, in order to choose the best action we should know the V of the next states and pick the max. We can do this by using the Q(s, a) function.<br>
By using Q instead of V, we have |S|x|A| values to aproximate instead of |S|, so we need more runs of the game.

In [16]:
def opposite_actions(a1, a2):
    comb = [('U', 'D'), ('D', 'U'), ('R', 'L'), ('L', 'R')]
    for c in comb:
        if a1 == c[0] and a2 == c[1]:
            return True

def play_game(grid, policy):
    ## Exploring start method
    start_states = list(grid.actions.keys())
    start = start_states[np.random.choice(len(start_states))]
    a = np.random.choice(grid.actions[start])
    grid.set_state(start)
    states_and_rewards = [(start, a, 0)]
    s = start
    while True:
        old_s = grid.current_state()
        r = grid.move(a)
        s = grid.current_state()
        if old_s == s:
            states_and_rewards.append((s, None, -100))
            break
        elif grid.game_over():
            states_and_rewards.append((s, None, r))
            break
        elif opposite_actions(policy[s], a):
            l = list(grid.actions[s])
            l.remove(policy[s])
            a = np.random.choice(l)
            states_and_rewards.append((s, a, r))
        else:
            a = policy[s]
            states_and_rewards.append((s, a, r))
    
    G = 0
    states_and_returns = []
    first = True
    for s, a, r in states_and_rewards[::-1]:
        if not first:
            states_and_returns.append((s, a, G))
        else:
            first = False
        G = r + GAMMA * G
    return states_and_returns[::-1]


grid = standard_grid()
print('Rewards:')
print_values(grid.rewards, grid)

states = grid.all_states()
Q = {(s, a): 0 for s in states for a in ALL_POSSIBLE_ACTIONS}
returns = {(s, a): [] for s in states for a in ALL_POSSIBLE_ACTIONS}
policy = {(2,0): 'R',
         (1,0): 'D',
         (0,0): 'R',
         (0,1): 'R',
         (0,2): 'R',
         (1,2): 'D',
         (2,1): 'R',
         (2,2): 'R',
         (2,3): 'U'}
print('Random policy:')
print_policy(policy, grid)
grid.policy = policy
print()

for i in tqdm.tqdm(range(5000)):
    # policy evaluation
    states_and_returns = play_game(grid, policy)
    seen_states_actions = set()
    for s, a, g in states_and_returns:
        if not (s, a) in seen_states_actions:
            returns[(s, a)].append(g)
            Q[(s, a)] = np.mean(returns[(s, a)])
        seen_states_actions.add((s, a))
    
    # policy improvement
    for s in policy.keys():
        max_a = policy[s]
        for a in grid.actions[s]:
            if Q[(s, a)] > Q[(s, max_a)]:
                max_a = a
        policy[s] = max_a

print('Policy')
print_policy(policy, grid)

V = {s: 0 for s, a in Q}
for s, a in Q:
    v = Q[(s, a)]
    V[s] = v if v > V[s] else V[s]
    
print('\nValue function:')
print_values(V, grid)

Rewards:
-----------------------
 0.00  0.00  0.00  1.00 
-----------------------
 0.00  0.00  0.00 -1.00 
-----------------------
 0.00  0.00  0.00  0.00 
-----------------------
Random policy:
---------------
 R   R   R      
---------------
 D       D      
---------------
 R   R   R   U  
---------------



100%|████████████████████████████████████| 5000/5000 [00:01<00:00, 2722.90it/s]


Policy
---------------
 R   R   R      
---------------
 U       U      
---------------
 U   R   U   L  
---------------

Value function:
-----------------------
 0.81  0.90  1.00  0.00 
-----------------------
 0.73  0.00  0.90  0.00 
-----------------------
 0.66  0.73  0.81  0.73 
-----------------------


# Monte Carlo control Epsilon Greedy

In [19]:
def random_action(a, eps=0.1):
    '''p = [eps / 3 for i in ALL_POSSIBLE_ACTIONS]
    p[ALL_POSSIBLE_ACTIONS.index(a)] = 1 - eps
    return np.random.choice(ALL_POSSIBLE_ACTIONS, p=p)'''
    p = np.random.rand()
    if p < 1 - eps:
        return a
    else:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)

def opposite_actions(a1, a2):
    comb = [('U', 'D'), ('D', 'U'), ('R', 'L'), ('L', 'R')]
    for c in comb:
        if a1 == c[0] and a2 == c[1]:
            return True

def play_game(grid, policy):
    start = (2,0)
    a = random_action(policy[start])
    grid.set_state(start)
    states_and_rewards = [(start, a, 0)]
    s = start
    while True:
        old_s = grid.current_state()
        r = grid.move(a)
        s = grid.current_state()
        '''if old_s == s:
            states_and_rewards.append((s, None, -100))
            break'''
        if grid.game_over():
            states_and_rewards.append((s, None, r))
            break
        else:
            a = random_action(policy[s])
            states_and_rewards.append((s, a, r))
    
    G = 0
    states_and_returns = []
    first = True
    for s, a, r in states_and_rewards[::-1]:
        if not first:
            states_and_returns.append((s, a, G))
        else:
            first = False
        G = r + GAMMA * G
    return states_and_returns[::-1]


grid = negative_grid(step_cost=-0.1)
print('Rewards:')
print_values(grid.rewards, grid)

states = grid.all_states()
Q = {(s, a): 0 for s in states for a in ALL_POSSIBLE_ACTIONS}
returns = {(s, a): [] for s in states for a in ALL_POSSIBLE_ACTIONS}
policy = {(2,0): 'R',
         (1,0): 'D',
         (0,0): 'R',
         (0,1): 'R',
         (0,2): 'R',
         (1,2): 'D',
         (2,1): 'R',
         (2,2): 'R',
         (2,3): 'U'}
print('Random policy:')
print_policy(policy, grid)
grid.policy = policy
print()

for i in tqdm.tqdm(range(10000)):
    # policy evaluation
    states_and_returns = play_game(grid, policy)
    seen_states_actions = set()
    for s, a, g in states_and_returns:
        if not (s, a) in seen_states_actions:
            returns[(s, a)].append(g)
            Q[(s, a)] = np.mean(returns[(s, a)])
        seen_states_actions.add((s, a))
    
    # policy improvement
    for s in policy.keys():
        max_a = policy[s]
        for a in grid.actions[s]:
            if Q[(s, a)] > Q[(s, max_a)]:
                max_a = a
        policy[s] = max_a

print('Policy')
print_policy(policy, grid)

V = {s: 0 for s, a in Q}
for s, a in Q:
    v = Q[(s, a)]
    V[s] = v if v > V[s] else V[s]
    
print('\nValue function:')
print_values(V, grid)

Rewards:
-----------------------
-0.10 -0.10 -0.10  1.00 
-----------------------
-0.10  0.00 -0.10 -1.00 
-----------------------
-0.10 -0.10 -0.10 -0.10 
-----------------------
Random policy:
---------------
 R   R   R      
---------------
 D       D      
---------------
 R   R   R   U  
---------------



100%|███████████████████████████████████| 10000/10000 [00:10<00:00, 991.38it/s]


Policy
---------------
 R   R   R      
---------------
 U       U      
---------------
 U   R   U   L  
---------------

Value function:
-----------------------
 0.58  0.78  1.00  0.00 
-----------------------
 0.41  0.00  0.78  0.00 
-----------------------
 0.25  0.35  0.54  0.32 
-----------------------
