# [Curso de aprendizagem por reforço](https://www.udemy.com/course/inteligencia-artificial-empresas-negocios/): Q-Learning

![caso_1.jpg](caso_1.jpg)

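## State Transition Matrix - Markov Chain

A `1` in row X, column Y means the agent can move directly from location X to location Y; a `0` means there is no direct connection between them.

|  | A| B| C| D| E| F| G| H| I| J| K| L|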
|--|--|--|--|--|--|--|--|--|--|--|--|--|
| A| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|
| B| 1| 0| 1| 0| 0| 1| 0| 0| 0| 0| 0| 0|
| C| 0| 1| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0|
| D| 0| 0| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0|
| E| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| 0| 0|
| F| 0| 1| 0| 0| 0| 0| 0| 0| 0| 1| 0| 0|
| G| 0| 0| 1| 0| 0| 0| 0| 1| 0| 0| 0| 0|
| H| 0| 0| 0| 1| 0| 0| 1| 0| 0| 0| 0| 1|
| I| 0| 0| 0| 0| 1| 0| 0| 0| 0| 1| 0| 0|
| J| 0| 0| 0| 0| 0| 1| 0| 0| 1| 0| 1| 0|
| K| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| 1|
| L| 0| 0| 0| 0| 0| 0| 0| 1| 0| 0| 1| 0|

In [64]:
import numpy as np
from typing import Union
from collections import namedtuple

# Minimal stand-in for a "space" object: `n` is its size and `sample` is an
# optional zero-argument callable that draws one element.
Space = namedtuple('Space', ['n', 'sample'])


class Env:
    """Deterministic navigation environment over the locations 'A'..'L'.

    States and actions share the same index set (0..11): taking action ``a``
    moves the agent to state ``a``. A move is allowed only when the reward
    matrix entry ``R[current_state, a]`` is non-zero, and that entry is the
    reward returned for the move.

    ``reset()`` places a reward of 100 on the ``end -> end`` self-transition,
    which makes "staying" at the end location a legal action. An episode
    terminates only when that self-transition is taken.
    """

    def __init__(self, start_location: str, end_location: str, debug=False):
        """Build the environment and reset it to ``start_location``.

        Args:
            start_location: Letter of the location the agent starts from.
            end_location: Letter of the goal location.
            debug: When True, ``step`` prints the outcome of every move.

        Raises:
            AssertionError: If an argument has the wrong type.
            KeyError: If a location letter is not one of 'A'..'L'.
        """
        assert isinstance(start_location, str), "start_location must be str"
        assert isinstance(end_location, str), "end_location must be str"
        assert isinstance(debug, bool), "debug must be bool"
        self.debug = debug

        # Letter <-> index lookup tables ('A' -> 0, ..., 'L' -> 11).
        self.location_to_state = {location: state for state, location in enumerate('ABCDEFGHIJKL')}
        self.state_to_location = {state: location for location, state in self.location_to_state.items()}

        n_locations = len(self.location_to_state)
        # sample() draws uniformly among the moves allowed from the current state.
        self.action_space = Space(n=n_locations, sample=lambda: np.random.choice(self._possible_states()))
        self.state_space = Space(n=n_locations, sample=None)

        self.start = self.location_to_state[start_location]
        self.end = self.location_to_state[end_location]

        # Base reward matrix: row = current state, column = next state,
        # 1 = the two locations are directly connected.
        self.original_R = np.array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                    [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                                    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
                                    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
                                    [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
                                    [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
                                    [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
                                    [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],
                                    [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
                                    [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                                    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]])
        self.reset()

    def step(self, action: Union[int, str]):
        """Move the agent to ``action`` and report the outcome.

        Args:
            action: Target location, given either as its letter or its index.

        Returns:
            Tuple ``(next_state, reward, term)``. ``term`` is truthy only when
            the agent was already at the end state and chose to stay there.

        Raises:
            AssertionError: If ``action`` is unknown or is not reachable from
                the current state.
        """
        assert action in self.location_to_state or action in self.state_to_location,\
            f"Invalid action: {action}"
        if isinstance(action, str):
            action = self.location_to_state[action]

        possible_states = self._possible_states()
        assert action in possible_states, f"action not possible. Must choose: {possible_states}"
        next_state: int = action
        reward = self.R[self.curr_state, action]
        # Must be evaluated before curr_state is overwritten below.
        term = action == self.end and self.curr_state == self.end
        self.curr_state = next_state

        if self.debug:
            print("next_state:", next_state, ", reward:", reward, ", term:", term)
            print("possible next_state:", self._states_to_locations(next_state))
        return next_state, reward, term

    def _possible_states(self):
        """Return the indices of the states reachable from the current state."""
        return np.flatnonzero(self.R[self.curr_state, :])

    def _states_to_locations(self, state: int):
        """Return the letters of the locations reachable from ``state``."""
        reachable = np.flatnonzero(self.R[state, :])
        return [self.state_to_location[neighbor] for neighbor in reachable]

    def reset(self):
        """Restore the reward matrix, return to the start state and return it."""
        self.R = np.copy(self.original_R)
        # Reward for staying at the goal; this also makes end -> end a legal move.
        self.R[self.end, self.end] = 100
        self.curr_state = self.start
        return self.curr_state

In [65]:
# Discount factor: weight given to the best next-state Q-value in the update rule.
GAMMA = 0.75
# Learning rate: fraction of the temporal-difference error added to the old Q-value.
ALPHA = 0.9
# Multiplicative decay applied to epsilon once per epoch; `route` also uses it as
# the starting epsilon. At 0.99995 over 1000 epochs epsilon only drops to about
# 0.95, so training stays almost entirely exploratory.
REDUCE_EPSILON = 0.99995
# Number of training episodes run by `route`.
EPOCHS = 1000

In [66]:
def action_selection_epsilon_greedy(env: "Env", q_table: np.ndarray, state: int, epsilon: float):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` the action comes from
    ``env.action_space.sample()``. Otherwise the action with the highest
    Q-value is chosen, restricted to the moves the environment allows from
    ``state`` (the non-zero entries of ``env.R[state]``). Without that
    restriction an untrained, all-zero Q-row would always yield action 0,
    which the environment rejects whenever 0 is not reachable from ``state``.

    Args:
        env: Environment exposing ``action_space.sample()`` and the reward
            matrix ``R``.
        q_table: Q-values indexed as ``[state, action]``.
        state: Index of the current state.
        epsilon: Exploration probability; any value <= 0 forces the greedy
            choice.

    Returns:
        The index of the selected action.
    """
    if np.random.rand() < epsilon:
        return env.action_space.sample()

    q_row = q_table[state, :]
    valid_actions = np.flatnonzero(env.R[state, :])
    if valid_actions.size == 0:
        # No move is allowed from here; fall back to the unrestricted argmax.
        return np.argmax(q_row)
    # argmax keeps the first maximum, so ties resolve to the lowest valid index.
    return valid_actions[np.argmax(q_row[valid_actions])]

def compute_next_q_value(old_q_value: float, reward: float, optimal_q_value: float):
    """Return the Q-value after one temporal-difference update.

    The target is the reward plus the discounted (``GAMMA``) best Q-value of
    the next state; the old value is moved towards that target by the
    learning rate ``ALPHA``.
    """
    td_target = reward + GAMMA * optimal_q_value
    td_error = td_target - old_q_value
    return old_q_value + ALPHA * td_error

def reduce_epsilon(epsilon: float):
    """Return ``epsilon`` scaled down by the decay factor ``REDUCE_EPSILON``."""
    decayed_epsilon = epsilon * REDUCE_EPSILON
    return decayed_epsilon

In [68]:
def route(start_location: str, end_location: str, debug=False) -> list:
    """Learn a Q-table for the given goal and return the greedy path to it.

    Runs ``EPOCHS`` training episodes with an epsilon-greedy policy, each one
    starting at ``start_location`` and ending when the environment reports
    termination. It then follows the greedy policy from ``start_location``
    until ``end_location`` is reached.

    Args:
        start_location: Letter of the starting location.
        end_location: Letter of the destination.
        debug: When True, print the learned Q-table. The environment's own
            per-step tracing stays off regardless of this flag.

    Returns:
        The visited location letters in order, from ``start_location`` to
        ``end_location`` inclusive.

    Raises:
        RuntimeError: If the greedy policy revisits a location before
            reaching ``end_location``. The policy and the environment are
            both deterministic, so it would otherwise loop forever.
    """
    # NOTE(review): exploration starts at the decay factor (0.99995), not 1.0 —
    # presumably a shorthand for "almost always explore"; confirm.
    epsilon = REDUCE_EPSILON
    env = Env(start_location, end_location, debug=False)
    q_table = np.zeros((env.state_space.n, env.action_space.n))

    # --- Training ---
    for _ in range(EPOCHS):
        state = env.reset()
        term = False
        while not term:
            action = action_selection_epsilon_greedy(env, q_table, state, epsilon)
            next_state, reward, term = env.step(action)
            old_q_value = q_table[state, action]
            next_optimal_q_value = np.max(q_table[next_state, :])
            q_table[state, action] = compute_next_q_value(old_q_value, reward, next_optimal_q_value)
            state = next_state
        epsilon = reduce_epsilon(epsilon)

    if debug:
        print("Q-Table")
        print(q_table)

    # --- Greedy path extraction ---
    curr_loc = start_location
    path = [curr_loc]
    visited = {curr_loc}
    state = env.reset()
    while curr_loc != end_location:
        # A negative epsilon disables exploration, so the choice is purely greedy.
        action = action_selection_epsilon_greedy(env, q_table, state, epsilon=-1)
        state, _, _ = env.step(action)
        curr_loc = env.state_to_location[state]
        if curr_loc in visited:
            raise RuntimeError(
                f"greedy policy loops back to {curr_loc} before reaching {end_location}: "
                f"{path + [curr_loc]}"
            )
        visited.add(curr_loc)
        path.append(curr_loc)
    return path

def best_route(start_location: str, intermediate_location: str, end_location: str) -> list:
    """Return a path from start to end that is forced through an intermediate stop.

    The path is the route to ``intermediate_location`` followed by the route
    from there to ``end_location``. The intermediate location would otherwise
    appear twice at the join, so it is dropped from the end of the first leg.
    """
    first_leg = route(start_location, intermediate_location)
    second_leg = route(intermediate_location, end_location)
    return first_leg[:-1] + second_leg

# Demo: every call below trains its own Q-table(s) before printing the path found.
print('Route: ')
# Direct route from A to L.
print(route('A', 'L'))
# Route from E to G that is forced to pass through K.
print(best_route('E', 'K', 'G'))

Route: 
['A', 'B', 'C', 'G', 'H', 'L']
['E', 'I', 'J', 'K', 'L', 'H', 'G']
