In [1]:
import numpy as np
from scipy.special import softmax
# import pandas as pd

from src.hunting import Hunting

# Definição do Estimador:

$\hat{T}_m(s,a,k) = \hat{T}_m(s,a,k) + \Delta \hat{T}_m(k) , \; \forall k \in S$ 

$
    \Delta \hat{T}_m(k) = 
        \begin{cases}
            \frac{1- \hat{T}_m(s,a,k) }{N_m(s,a)+1} , & \text{if } k = s'\\
            \\
            \frac{0- \hat{T}_m(s,a,k) }{N_m(s,a)+1}, & \text{if } k \neq s'
        \end{cases} 
$  



$\hat{R}_m(s,a,s') = \hat{R}_m(s,a,s') + \Delta \hat{R}_m $ 

$\Delta \hat{R}_m = \frac{r- \hat{R}_m(s,a,s') }{N_m(s,a)+1}$  


### Qualidade Instantânea por estimador

$e^T_{m} = 1-2(Z_T \sum_{k \in S}\Delta\hat{T}_m(k)^2)$ 

$e^R_{m} = 1-2(Z_R \Delta\hat{R}_m^2)$ 


$Z_T= \frac{1}{2}(N_m(s,a)+1)^2$ \
$Z_R= (R_{\max}-R_{\min})^{-1} = (0 - (-1))^{-1} = 1$  


In [125]:
class Estimator:
    """Tabular incremental estimator with a capped visit counter.

    Attributes:
        M: upper bound applied to every entry of the visit counter ``n``.
        n: visit counts, zero-initialised with shape ``inputs``.
        v: estimated values, zero-initialised with shape ``inputs``.
        e: instantaneous quality computed by the latest ``train`` call.
    """

    def __init__(self, M=1e6, *inputs):
        """Allocate the count and value tables.

        Args:
            M: cap for each visit counter.
            *inputs: table dimensions, forwarded to ``np.zeros``.
        """
        self.M = M
        self.n = np.zeros(inputs)
        self.v = np.zeros(inputs)
        self.e = 0

    def train(self, value=1, *inputs):
        """Move ``v[inputs]`` towards ``value`` and refresh the quality ``e``.

        The step size is ``1 / (N + 1)`` where ``N`` is the sum of the visit
        counts over the last index (``n[inputs[:-1]]``) *before* this sample
        is counted, so the very first sample sets the estimate to ``value``.

        Args:
            value: observed target.
            *inputs: full index of the table entry being updated.
        """
        # Read the count before incrementing it; incrementing first turns the
        # denominator into N + 2 and biases every estimate towards zero.
        denom = np.sum(self.n[inputs[:-1]]) + 1
        delta = (value - self.v[inputs]) / denom
        self.v[inputs] += delta
        self.n[inputs] = min(self.n[inputs] + 1, self.M)
        norm = 1  # Z_R = 1 / (R_max - R_min) with rewards in [-1, 0]
        # The quality is defined on the squared update, which keeps e <= 1
        # regardless of the sign of delta.
        self.e = 1 - 2 * norm * delta ** 2

    def forward(self, *inputs):
        """Return the current estimate(s) stored at ``inputs``."""
        return self.v[inputs]
    
# The first positional argument is the counter cap M; the table shape follows it.
# Passing the tuple alone would bind it to M and allocate 0-d tables.
est = Estimator(1e6, 5, 3)

In [126]:
class Estimator_T(Estimator):
    """Transition estimator: each ``v[s, a]`` row is updated as a whole."""

    def train(self, value=1, *inputs):
        """Update the whole successor row ``v[s, a, :]`` for one transition.

        Every successor ``k`` moves towards ``value`` when ``k == s_`` and
        towards 0 otherwise, with step ``1 / (N(s, a) + 1)`` where ``N(s, a)``
        is the visit count of the pair before this transition is counted.
        With ``value=1`` a row that has been trained at least once sums to 1.

        Args:
            value: target for the observed successor (1 for a probability).
            *inputs: the index triple ``(s, a, s_)``.
        """
        s, a, s_ = inputs

        # Count read before the increment, so the first transition yields a
        # one-hot row instead of a row summing to 1/2.
        denom = np.sum(self.n[s, a]) + 1

        target = np.zeros_like(self.v[s, a])
        target[s_] = value
        delta = (target - self.v[s, a]) / denom
        self.v[s, a] += delta

        self.n[inputs] = min(self.n[inputs] + 1, self.M)

        norm = (1 / 2) * denom ** 2  # Z_T
        self.e = 1 - 2 * norm * np.sum(delta ** 2)

# Modelo
#### Qualidade Instantânea do modelo:  


$E_m = E_m + \rho(e_m - E_m)$

$\rho = 1$

$e_m = c_m(s,a) (\Omega e^R_m + (1-\Omega)e^T_m)$

$c_m(s,a) = \frac{N_m(s,a)}{M}$  
$\Omega = 0$



In [127]:
class Model:
    """Pool of (transition, reward) estimator pairs with quality-based selection.

    ``self.M`` holds the estimator pairs and ``self.E`` the quality of each
    pair at the same index. ``self.t`` / ``self.r`` point at the pair selected
    last; its index is ``self.m_cur``.
    """

    def __init__(self, S, A, Emin=0.5):
        """Create the pool with a single fresh model.

        Args:
            S: sized collection of states (only ``len(S)`` is used).
            A: sized collection of actions (only ``len(A)`` is used).
            Emin: quality threshold below which ``learn`` spawns a new model.
        """
        self.S = S
        self.A = A
        self.M = []
        self.E = []  # filled by new_model so that it always matches self.M
        self.Emin = Emin
        self.new_model()

    def new_model(self, M=1e6):
        """Append a fresh estimator pair, make it current, and give it quality 0."""
        self.t = Estimator_T(M, len(self.S), len(self.A), len(self.S))
        self.r = Estimator(M, len(self.S), len(self.A), len(self.S))
        self.m_cur = len(self.M)
        self.M.append((self.t, self.r))
        # Keep one quality entry per model; without it the new model is never
        # scored and can never be selected again.
        self.E.append(0)

    def calculate_e(self, s, a, i, omega=0):
        """Instantaneous quality of model ``i`` at ``(s, a)``.

        Confidence ``N_i(s, a) / M`` times the ``omega``-weighted mix of the
        reward and transition estimator qualities.
        """
        t, r = self.M[i]
        cm = np.sum(t.n[s, a]) / t.M
        return cm * (omega * r.e + (1 - omega) * t.e)

    def calculate_E(self, s, a, i, rho=1):
        """Quality trace of model ``i`` moved towards its instantaneous quality."""
        e = self.calculate_e(s, a, i)
        return self.E[i] + rho * (e - self.E[i])

    def _select(self, s, a):
        """Re-score every model at ``(s, a)`` and activate the best one."""
        self.E = [self.calculate_E(s, a, i) for i in range(len(self.M))]
        self.m_cur = int(np.argmax(self.E))
        self.t, self.r = self.M[self.m_cur]

    def learn(self, s, a, s_, r):
        """Select (or spawn) a model and train it on one real transition."""
        self._select(s, a)
        # NOTE(review): with the default M=1e6 the confidence N/M stays close
        # to zero, so with Emin=0.5 this branch looks like it fires on every
        # call - confirm the intended M / Emin values.
        if self.E[self.m_cur] < self.Emin:
            self.new_model()

        self.t.train(1, s, a, s_)
        self.r.train(r, s, a, s_)

    def predict(self, s, a): # simulate
        """Sample a successor state and its estimated reward for ``(s, a)``.

        The successor is drawn from the selected model's transition row,
        normalised to sum to 1. A row with no positive mass falls back to a
        uniform draw.

        Returns:
            Tuple ``(s_, r)``.
        """
        self._select(s, a)
        # T-hat already estimates probabilities; a softmax over values in
        # [0, 1] would flatten them into a near-uniform distribution.
        weights = np.clip(np.asarray(self.t.forward(s, a), dtype=float), 0.0, None)
        total = weights.sum()
        if total > 0:
            probs = weights / total
        else:
            probs = np.full(len(self.S), 1.0 / len(self.S))
        s_ = np.random.choice(len(self.S), p=probs)
        r = self.r.forward(s, a, s_)
        return s_, r

    def T(self, s, a, s_): # predict_T
        """Transition estimate of the active model for ``(s, a, s_)``."""
        return self.t.forward(s, a, s_)

    def R(self, s, a, s_):
        """Reward estimate of the active model for ``(s, a, s_)``."""
        return self.r.forward(s, a, s_)

# Stand-alone instance with 10 states and 4 actions.
model = Model(S=range(10), A=range(4))

# Algoritmo de Controle (RL):
 - Dyna Architecture

In [128]:
class Dyna:
    """Planning-only Q-learning over transitions simulated by ``model``.

    Attributes:
        Q: action-value table of shape ``(len(model.S), len(model.A))``.
        hist: per ``(state, action)`` count of pairs passed to ``run``.
    """

    def __init__(self, model, n=100, alpha=.9, gamma=.9, epsilon=.1):
        """Store the hyper-parameters and allocate the tables.

        Args:
            model: object exposing ``S``, ``A`` and ``predict(s, a) -> (s_, r)``.
            n: number of simulated updates per ``run`` call.
            alpha: learning rate of the Q update.
            gamma: discount factor.
            epsilon: stored only; not read by this class.
        """
        self.model = model
        self.n = n
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        # len() accepts both ranges/lists and 1-D arrays; .size only exists on
        # arrays and would reject the plain ranges that Model works with.
        n_states, n_actions = len(self.model.S), len(self.model.A)
        self.Q = np.zeros((n_states, n_actions))
        self.hist = np.zeros((n_states, n_actions))

    def run(self, s, a):
        """Record the visited pair and perform ``n`` simulated Q updates.

        Each update draws a previously visited state, then an action already
        taken in that state, asks the model for ``(s_, r)`` and applies the
        one-step Q-learning rule.
        """
        self.hist[s, a] += 1

        visited_states = np.flatnonzero(self.hist.sum(axis=1) > 0)
        for state in np.random.choice(visited_states, self.n):
            taken_actions = np.flatnonzero(self.hist[state] > 0)
            action = np.random.choice(taken_actions)
            next_state, reward = self.model.predict(state, action)
            td_target = reward + self.gamma * np.max(self.Q[next_state])
            self.Q[state, action] += self.alpha * (td_target - self.Q[state, action])

    def v(self):
        """State values: the row-wise maximum of ``Q``."""
        return np.max(self.Q, axis=1)

    def control(self):
        """Greedy policy: the row-wise argmax of ``Q``."""
        return np.argmax(self.Q, axis=1)

# Agente

In [129]:
class Agent:
    """Couples a learned ``Model`` with a ``Dyna`` planner."""

    def __init__(self, S, A, n=100, alpha=.9 ,gamma=.9, epsilon=.1):
        """Build the model and the planner.

        Args:
            S: state collection, forwarded to ``Model``.
            A: action collection, forwarded to ``Model``.
            n: simulated updates per real step, forwarded to ``Dyna``.
            alpha: learning rate, forwarded to ``Dyna``.
            gamma: discount factor, forwarded to ``Dyna``.
            epsilon: probability with which ``act`` picks a random action.
        """
        self.n = n
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.model = Model(S, A)
        self.rl = Dyna(self.model, self.n, self.alpha, self.gamma, self.epsilon)

    def learn(self, s, a, s_, r):
        """Train the model on one real transition, then run the planner."""
        self.model.learn(s, a, s_, r)
        self.rl.run(s, a)

    def act(self, s):
        """Epsilon-greedy action for state ``s``.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the greedy action from the planner. A purely
        greedy agent keeps returning action 0 for as long as Q is tied.
        """
        if np.random.random() < self.epsilon:
            return np.random.randint(len(self.model.A))
        return self.get_policy()[s]

    def evaluate(self, s):
        """Value of state ``s`` under the current Q table."""
        return self.get_v()[s]

    def get_v(self):
        """State values for every state, as computed by the planner."""
        return self.rl.v()

    def get_policy(self):
        """Greedy action for every state (no exploration)."""
        return self.rl.control()
                

# Simulação

In [130]:
def generate_episode(env, agent, size_limit=100):
    """Run one episode, training the agent online after every step.

    The environment is reset once. Each iteration reads the current state from
    the first element of ``env.last()``, asks the agent for an action, applies
    it with ``env.step(state, action)`` and feeds the resulting
    ``(s, a, s_, r)`` tuple to ``agent.learn``. The loop stops when the third
    element returned by ``env.step`` is truthy or after ``size_limit`` steps.

    Returns:
        List of the ``(s, a, s_, r)`` tuples, in order.
    """
    env.reset()
    transitions = []
    for _step in range(size_limit):
        state, _, _, _, _ = env.last()
        action = agent.act(state)
        next_state, reward, done, _, _ = env.step(state, action)

        transition = (state, action, next_state, reward)
        agent.learn(state, action, next_state, reward)
        transitions.append(transition)
        if done:
            break
    return transitions

def experiment(env, agent, max_iterations=1000, episode_sizes=100):
    """Run ``max_iterations`` episodes and return the length of each one.

    Every episode is produced by ``generate_episode`` with ``episode_sizes``
    as its step limit; the same ``env`` and ``agent`` are reused throughout.
    """
    return [
        len(generate_episode(env, agent, episode_sizes))
        for _ in range(max_iterations)
    ]



# Experimentação

In [131]:
# Seed the global NumPy RNG: the planner and the model sampling both draw from
# np.random, so without a seed every run of the notebook gives different results.
SEED = 0
np.random.seed(SEED)

env = Hunting(5,5)

n = 100             # simulated (planning) updates per real step
alpha = 0.9         # learning rate
epsilon = 0.1       # exploration parameter handed to the agent
gamma = 0.9         # discount factor
num_episodes = 100
episode_size = 12   # maximum number of steps per episode

agent = Agent(env.S, env.A, n, alpha, gamma, epsilon)

In [132]:
# Length (number of recorded transitions) of each of the num_episodes episodes.
exp = experiment(env, agent, max_iterations=num_episodes, episode_sizes=episode_size)

In [136]:
# First grid: state values rounded to two decimals.
state_values = np.round(agent.get_v(), 2)
env.plot(list(state_values))
# Second grid: greedy action per state; the True flag presumably switches
# env.plot to arrow rendering (see the output below) - confirm in Hunting.plot.
greedy_policy = agent.get_policy()
env.plot(list(greedy_policy), True)

 _____________________________ 
| 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
|_____|_____|_____|_____|_____|
| 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
|_____|_____|_____|_____|_____|
| 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
|_____|_____|_____|_____|_____|
| 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
|_____|_____|_____|_____|_____|
| 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
|_____|_____|_____|_____|_____|

 _____________________________ 
|  ↑  |  ↑  |  ↑  |  ↑  |  ↑  |
|_____|_____|_____|_____|_____|
|  ↑  |  ↑  |  ↑  |  ↑  |  ↑  |
|_____|_____|_____|_____|_____|
|  ↑  |  ↑  |  ↑  |  ↑  |  ↑  |
|_____|_____|_____|_____|_____|
|  ↑  |  ↑  |  ↑  |  ↑  |  ↑  |
|_____|_____|_____|_____|_____|
|  ↑  |  ↑  |  ↑  |  ↑  |  ↑  |
|_____|_____|_____|_____|_____|



In [137]:
# Transition estimate of the currently active model for s=0, a=0, s'=5.
agent.model.T(s=0, a=0, s_=5)

0.0