## Imports

In [None]:
#importando os módulos básicos
import pandas as pd
import numpy as np

## Helpers

In [None]:
def get_possible_next_states(R, state):
    """Return the indices of the states reachable from ``state``.

    A transition ``state -> j`` counts as possible when ``R[state, j]`` is
    non-negative (zero included).

    Parameters
    ----------
    R : numpy.ndarray
        2-D reward matrix, indexed as ``R[state, next_state]``.
    state : int
        Row of ``R`` to inspect.

    Returns
    -------
    numpy.ndarray
        1-D integer array with the column indices whose reward is ``>= 0``.
    """
    allowed_mask = R[state, :] >= 0
    return np.flatnonzero(allowed_mask)

In [None]:
class Q_Learning():
    """Tabular Q-learning agent driven by a reward matrix.

    The Q table is indexed as ``Q[state, next_state]``: an action is
    identified with the state it leads to. Each step applies the update

        Q[s, s'] = R[s, s'] + gamma * max(Q[s', a] for every a allowed from s')

    where the allowed moves are the non-negative entries of the reward row
    (see ``get_possible_next_states``).
    """

    def __init__(self, states, actions, R, goal_state, gamma):
        """Store the problem definition and create a zero-filled Q table.

        Parameters
        ----------
        states : sequence
            State identifiers; an episode starts from one drawn at random.
        actions : sequence
            Action identifiers; only its length is used (number of Q columns).
        R : array-like
            2-D reward matrix, ``R[state, next_state]``.
        goal_state : int
            State that ends an episode once it is reached.
        gamma : float
            Factor applied to the best future Q value in the update.
        """
        self.states = states
        self.actions = actions
        # asarray accepts nested lists too and does not copy an existing ndarray.
        self.R = np.asarray(R)
        self.goal_state = goal_state
        self.gamma = gamma
        self.Q = np.zeros((len(states), len(actions)))

    def get_Qmatrix(self):
        """Return the Q table as a labelled DataFrame of integers.

        Rows are named ``state_<i>`` and columns ``action_<j>``. The values
        are cast with ``astype(int)``, so fractional parts are dropped.
        """
        n_states, n_actions = self.Q.shape
        Qdf = pd.DataFrame(self.Q,
                           index=['state_'+str(i) for i in range(n_states)],
                           columns=['action_'+str(i) for i in range(n_actions)]).astype(int)
        return Qdf

    def run_episode(self):
        """Run one episode of random exploration, updating ``self.Q`` in place.

        The episode starts at a random state and keeps taking uniformly random
        allowed moves until the move lands on ``goal_state``.
        """
        state = np.random.choice(self.states, size = 1)[0]
        next_state = None
        while next_state != self.goal_state:
            # Use the agent's own reward matrix, not a notebook-level global.
            possible_next_states = get_possible_next_states(self.R, state)
            next_state = np.random.choice(possible_next_states, size = 1)[0]
            # Best value obtainable from the next state among its allowed moves.
            M = self.Q[next_state, get_possible_next_states(self.R, next_state)].max()
            self.Q[state, next_state] = self.R[state, next_state] + self.gamma * M
            state = next_state

    def train_agent(self, num_episodes):
        """Run ``num_episodes`` episodes; Q values accumulate across calls."""
        for _ in range(num_episodes):
            self.run_episode()
            
        

## Objetivos

Comece treinando o agente com poucos episódios e vá interpretando os resultados à medida que a quantidade de episódios aumenta.

Crie uma função que recebe dois parâmetros: a matriz Q e um estado inicial. Retorne qual é o caminho que o agente sugere, a partir desse estado inicial.

#### Criando o Agente

In [None]:
# Problem definition passed to Q_Learning below.
# The Q table has one row per state and one column per action.
states = [0,1,2,3,4,5,6,7]
actions = [0,1,2,3,4,5,6,7]
# An episode ends as soon as the agent moves into this state.
goal_state = 7
# Factor that multiplies the best future Q value in the update rule.
gamma = 0.5

In [None]:
# Reward matrix: R[i][j] is the reward for moving from state i to state j.
# Negative entries are never selected by get_possible_next_states (it keeps
# only values >= 0), so -1 marks a move that is not allowed.
R = [[-1,80,5,100,-1,-1,-1,-1],
     [30,-1,-1,-1,80,100,-1,-1],
     [100,-1,-1,-1,-1,-1,-1,-1],
     [-1,-1,30,-1,-1,-1,100,-1],
     [-1,-1,5,-1,-1,100,-1,-1],
     [-1,30,-1,-1,-1,-1,80,100],
     [-1,-1,-1,-1,30,-1,-1,100],
     [-1,-1,-1,-1,-1,-1,-1,100]]

# Convert to an ndarray so that R[state, :] indexing works.
R = np.array(R)
print("reward matrix: matriz de recompensas")
# Last expression of the cell: display R with labelled rows and columns.
pd.DataFrame(R, 
             index=['state_'+str(i) for i in range(R.shape[0])],
             columns=['state_'+str(i) for i in range(R.shape[0])])

reward matrix: matriz de recompensas


Unnamed: 0,state_0,state_1,state_2,state_3,state_4,state_5,state_6,state_7
state_0,-1,80,5,100,-1,-1,-1,-1
state_1,30,-1,-1,-1,80,100,-1,-1
state_2,100,-1,-1,-1,-1,-1,-1,-1
state_3,-1,-1,30,-1,-1,-1,100,-1
state_4,-1,-1,5,-1,-1,100,-1,-1
state_5,-1,30,-1,-1,-1,-1,80,100
state_6,-1,-1,-1,-1,30,-1,-1,100
state_7,-1,-1,-1,-1,-1,-1,-1,100


In [None]:
# Instantiate the agent with a fresh, zero-filled Q table.
agent = Q_Learning(
    states= states,
    actions=actions,
    R = R,
    goal_state = goal_state,
    gamma = gamma
)

In [None]:
# Train for a single episode (Q values accumulate on the same agent).
agent.train_agent(num_episodes = 1)

# Despite their names, these hold the Q rows of state 0 and state 1.
state = agent.Q[0]
next_state = agent.Q[1]
# Integer-cast, labelled view of the whole Q table.
matrix = agent.get_Qmatrix()

print(state, next_state)
print()
print(matrix)

[  0.  80.  75. 115.   0.   0.   0.   0.] [ 0.  0.  0.  0. 80.  0.  0.  0.]

         action_0  action_1  action_2  action_3  action_4  action_5  action_6  \
state_0         0        80        75       115         0         0         0   
state_1         0         0         0         0        80         0         0   
state_2       140         0         0         0         0         0         0   
state_3         0         0        30         0         0         0       100   
state_4         0         0        68         0         0       156         0   
state_5         0         0         0         0         0         0       112   
state_6         0         0         0         0        80         0         0   
state_7         0         0         0         0         0         0         0   

         action_7  
state_0         0  
state_1         0  
state_2         0  
state_3         0  
state_4         0  
state_5       100  
state_6         0  
state_7         0  


In [None]:
# One more episode on the same agent (Q values keep accumulating).
agent.train_agent(num_episodes = 1)

# Q rows of state 0 and state 1, plus the integer-cast Q table.
state = agent.Q[0]
next_state = agent.Q[1]
matrix = agent.get_Qmatrix()

print(state, next_state)
print()
print(matrix)

[  0.         159.0234375   94.75585938 150.           0.
   0.           0.           0.        ] [109.51171875   0.           0.           0.         158.046875
 156.09375      0.           0.        ]

         action_0  action_1  action_2  action_3  action_4  action_5  action_6  \
state_0         0       159        94       150         0         0         0   
state_1       109         0         0         0       158       156         0   
state_2       179         0         0         0         0         0         0   
state_3         0         0       108         0         0         0       100   
state_4         0         0        75         0         0       156         0   
state_5         0        70         0         0         0         0       112   
state_6         0         0         0         0        80         0         0   
state_7         0         0         0         0         0         0         0   

         action_7  
state_0         0  
state_1         0  
state

In [None]:
# 5 more episodes on the same agent.
agent.train_agent(num_episodes = 5)

# Q rows of state 0 and state 1, plus the integer-cast Q table.
state = agent.Q[0]
next_state = agent.Q[1]
matrix = agent.get_Qmatrix()

print(state, next_state)
print()
print(matrix)

[  0.         159.0234375   94.9961853  177.01171875   0.
   0.           0.           0.        ] [118.50585938   0.           0.           0.         158.046875
 156.09375      0.           0.        ]

         action_0  action_1  action_2  action_3  action_4  action_5  action_6  \
state_0         0       159        94       177         0         0         0   
state_1       118         0         0         0       158       156         0   
state_2       179         0         0         0         0         0         0   
state_3         0         0       119         0         0         0       154   
state_4         0         0        94         0         0       156         0   
state_5         0        70         0         0         0         0       112   
state_6         0         0         0         0       108         0         0   
state_7         0         0         0         0         0         0         0   

         action_7  
state_0         0  
state_1         0  
state

In [None]:
# A few more episodes (10) on the same agent.
agent.train_agent(num_episodes = 10)

# Q rows of state 0 and state 1, plus the integer-cast Q table.
state = agent.Q[0]
next_state = agent.Q[1]
matrix = agent.get_Qmatrix()

print(state, next_state)
print()
print(matrix)

[  0.       178.4375   104.609375 198.4375     0.         0.
   0.         0.      ] [129.21875   0.        0.        0.      176.875   196.875     0.
   0.     ]

         action_0  action_1  action_2  action_3  action_4  action_5  action_6  \
state_0         0       178       104       198         0         0         0   
state_1       129         0         0         0       176       196         0   
state_2       199         0         0         0         0         0         0   
state_3         0         0       129         0         0         0       196   
state_4         0         0       104         0         0       196         0   
state_5         0       128         0         0         0         0       176   
state_6         0         0         0         0       128         0         0   
state_7         0         0         0         0         0         0         0   

         action_7  
state_0         0  
state_1         0  
state_2         0  
state_3         0  
state_

In [None]:
# Many more episodes (1000) on the same agent.
agent.train_agent(num_episodes = 1000)

# Q rows of state 0 and state 1, plus the integer-cast Q table.
state = agent.Q[0]
next_state = agent.Q[1]
matrix = agent.get_Qmatrix()

print(state, next_state)
print()
print(matrix)

[  0. 180. 105. 200.   0.   0.   0.   0.] [130.   0.   0.   0. 180. 200.   0.   0.]

         action_0  action_1  action_2  action_3  action_4  action_5  action_6  \
state_0         0       180       105       200         0         0         0   
state_1       130         0         0         0       180       200         0   
state_2       200         0         0         0         0         0         0   
state_3         0         0       130         0         0         0       200   
state_4         0         0       105         0         0       200         0   
state_5         0       130         0         0         0         0       180   
state_6         0         0         0         0       130         0         0   
state_7         0         0         0         0         0         0         0   

         action_7  
state_0         0  
state_1         0  
state_2         0  
state_3         0  
state_4         0  
state_5       200  
state_6       200  
state_7       200  


In [None]:
# Snapshot of the current Q table as a labelled, integer-cast DataFrame;
# `matrix` is the input used by melhor_caminho in the next section.
matrix = agent.get_Qmatrix()
matrix

Unnamed: 0,action_0,action_1,action_2,action_3,action_4,action_5,action_6,action_7
state_0,0,159,94,177,0,0,0,0
state_1,118,0,0,0,158,156,0,0
state_2,179,0,0,0,0,0,0,0
state_3,0,0,119,0,0,0,154,0
state_4,0,0,94,0,0,156,0,0
state_5,0,70,0,0,0,0,112,100
state_6,0,0,0,0,108,0,0,150
state_7,0,0,0,0,0,0,0,100


#### Função Melhor Caminho

In [None]:
%%time
def melhor_caminho(Q, estado_inicial):
    """Follow the greedy policy encoded in ``Q`` starting at ``estado_inicial``.

    At each step the action with the largest Q value in the current state's
    row is taken, and the action index is used as the next state. The walk
    stops when the best action points back to the current state.

    Parameters
    ----------
    Q : pandas.DataFrame or array-like
        Q table with one row per state. Columns may be labelled
        ``'action_<j>'`` (as produced by ``Q_Learning.get_Qmatrix``) or be
        plain integer positions (e.g. a raw numpy array).
    estado_inicial : int
        State where the walk starts.

    Returns
    -------
    list of str
        The visited states after ``estado_inicial``, in order, as strings.

    Raises
    ------
    ValueError
        If the greedy policy revisits a state, which means it would loop
        forever and never stop (typically an under-trained Q table).
    """
    # Build the DataFrame once instead of on every step.
    q_table = pd.DataFrame(Q)
    state = int(estado_inicial)
    path = []
    visited = {state}

    while True:
        best_action = q_table.iloc[state].idxmax()
        # 'action_3' -> 3; integer column labels are used as they are.
        if isinstance(best_action, str):
            next_state = int(best_action.split('_')[1])
        else:
            next_state = int(best_action)

        # The best move is to stay put: the walk is over.
        if next_state == state:
            break

        # The policy is deterministic, so a revisited state means an endless cycle.
        if next_state in visited:
            raise ValueError(
                "greedy policy cycles back to state {} after path {}; "
                "the agent probably needs more training".format(next_state, path)
            )

        path.append(str(next_state))
        visited.add(next_state)
        state = next_state

    return path

# Greedy path suggested by the current Q table, starting from state 2.
melhor_caminho(matrix, 2)

CPU times: user 2.61 ms, sys: 0 ns, total: 2.61 ms
Wall time: 2.5 ms


['0', '3', '6', '7']

#### Verificando quantos episódios leva para o nosso agente aprender o melhor caminho

In [None]:
# Re-instantiate the agent so that training restarts from a zero Q table.

# Drop the previously trained agent before creating the new one.
del agent

agent = Q_Learning(
    states= states,
    actions=actions,
    R = R,
    goal_state = goal_state,
    gamma = gamma
)

In [None]:
# Train the fresh agent for 5 episodes.

agent.train_agent(num_episodes = 5)

# Q rows of state 0 and state 1, plus the integer-cast Q table.
state = agent.Q[0]
next_state = agent.Q[1]
matrix = agent.get_Qmatrix()

print(state, next_state)
print()
print(matrix)

[  0.    162.5    95.625 178.125   0.      0.      0.      0.   ] [111.25   0.     0.     0.   162.5  165.     0.     0.  ]

         action_0  action_1  action_2  action_3  action_4  action_5  action_6  \
state_0         0       162        95       178         0         0         0   
state_1       111         0         0         0       162       165         0   
state_2       189         0         0         0         0         0         0   
state_3         0         0       124         0         0         0       156   
state_4         0         0        91         0         0       165         0   
state_5         0       112         0         0         0         0       130   
state_6         0         0         0         0       112         0         0   
state_7         0         0         0         0         0         0         0   

         action_7  
state_0         0  
state_1         0  
state_2         0  
state_3         0  
state_4         0  
state_5       100  
state

In [None]:
# This call fails to produce a path: the agent has not learned the way yet.
melhor_caminho(matrix, 1)


KeyboardInterrupt: ignored

In [None]:
# Train for 5 more episodes (10 in total for this agent).
agent.train_agent(num_episodes = 5)

# Q rows of state 0 and state 1, plus the integer-cast Q table.
state = agent.Q[0]
next_state = agent.Q[1]
matrix = agent.get_Qmatrix()

print(state, next_state)
print()
print(matrix)

[  0.    162.5    95.625 178.125   0.      0.      0.      0.   ] [111.25   0.     0.     0.   162.5  165.     0.     0.  ]

         action_0  action_1  action_2  action_3  action_4  action_5  action_6  \
state_0         0       162        95       178         0         0         0   
state_1       111         0         0         0       162       165         0   
state_2       189         0         0         0         0         0         0   
state_3         0         0       124         0         0         0       156   
state_4         0         0        99         0         0       165         0   
state_5         0       112         0         0         0         0       136   
state_6         0         0         0         0       112         0         0   
state_7         0         0         0         0         0         0         0   

         action_7  
state_0         0  
state_1         0  
state_2         0  
state_3         0  
state_4         0  
state_5       100  
state

In [None]:
# esse código quebra por que o agente ainda não sabe o caminho
%%time
melhor_caminho(matrix, 1)


KeyboardInterrupt: ignored

In [None]:
# Train for 5 more episodes (15 in total for this agent).
agent.train_agent(num_episodes = 5)

# Q rows of state 0 and state 1, plus the integer-cast Q table.
state = agent.Q[0]
next_state = agent.Q[1]
matrix = agent.get_Qmatrix()

print(state, next_state)
print()
print(matrix)

[  0.     164.0625 101.875  187.5      0.       0.       0.       0.    ] [123.75    0.      0.      0.    167.5   168.125   0.      0.   ]

         action_0  action_1  action_2  action_3  action_4  action_5  action_6  \
state_0         0       164       101       187         0         0         0   
state_1       123         0         0         0       167       168         0   
state_2       193         0         0         0         0         0         0   
state_3         0         0       126         0         0         0       175   
state_4         0         0       101         0         0       175         0   
state_5         0       114         0         0         0         0       136   
state_6         0         0         0         0       117         0         0   
state_7         0         0         0         0         0         0         0   

         action_7  
state_0         0  
state_1         0  
state_2         0  
state_3         0  
state_4         0  
state_5  

In [None]:
%%time
# After 15 episodes the agent has learned a way out.
melhor_caminho(matrix, 1)


CPU times: user 1.11 ms, sys: 0 ns, total: 1.11 ms
Wall time: 1.12 ms


['5', '7']