[Artigo: CDRL](https://drive.google.com/drive/u/0/folders/1NR9Ty7_5lV8Wpn0mNB2PUx1bozJZ-hVm)

In [17]:
import numpy as np
from scipy.special import softmax
# import pandas as pd



# Definição do Esimador:

$\hat{T}_m(s,a,k) = \hat{T}_m(s,a,k) + \Delta \hat{T}_m(k) , \; \forall k \in S$ 

$
    \Delta \hat{T}_m(k) = 
        \begin{cases}
            \frac{1- \hat{T}_m(s,a,k) }{N_m(s,a)+1} , & \text{if } k = s'\\
            \\
            \frac{0- \hat{T}_m(s,a,k) }{N_m(s,a)+1}, & \text{if } k \neq s'
        \end{cases} 
$  



$\hat{R}_m(s,a,s') = \hat{R}_m(s,a,s') + \Delta \hat{R}_m $ 

$\Delta \hat{R}_m = \frac{r- \hat{R}_m(s,a) }{N_m(s,a)+1}$  


### Qualidade Instantanea por estimador

$e^T_{m} = 1-2(Z_T \sum_{k \in S}\Delta\hat{T}_m(k)^2)$ 

$e^R_{m} = 1-2(Z_R \Delta\hat{R}_m^2)$ 


$Z_T= \frac{1}{2}(N(s,a)+1)^2$ \
$Z_R= (R_{\max}-R_{\min})^{-1} = (0 - (-1))^{-1} = 1$  


In [18]:
class Estimator:
    def __init__(self, M=1e6, *inputs):
        self.M = M
        self.n = np.zeros(inputs)     
        self.v = np.zeros(inputs) 
        self.e = 0
    def train(self, value=1, *inputs):
        self.n[inputs] = min(self.n[inputs]+1, self.M)
        delta = (value-self.v[inputs])/(np.sum(self.n[inputs[:-1]])+1)
        self.v[inputs] += delta
        norm=1
        self.e = 1-2*norm*delta
    def forward(self, *inputs):
        return self.v[inputs]
    
est = Estimator((5,3))

In [19]:
class Estimator_T(Estimator):    
    def train(self, value=1, *inputs):
        S = self.n.shape[0]
        s,a,s_ = inputs

        self.n[inputs] = min(self.n[inputs]+1, self.M)
        sum_delta = 0
        for k in range(S):
            delta = (
                (value-self.v[s,a,k])/(np.sum(self.n[s,a])+1) 
                if k==s_ else 
                (0-self.v[s,a,k])/(np.sum(self.n[s,a])+1)
            )
            self.v[s,a,k] += delta
            sum_delta += delta**2
        norm = (1/2)*(np.sum(self.n[s,a])+1)**2    
        self.e = 1-2*norm*sum_delta

# Modelo
#### Qualidade Instantanea modelo:  


$E_m = E_m + \rho(e_m - E_m)$

$\rho = 1$

$e_m = c_m(s,a) (\Omega e^R_m + (1-\Omega)e^T_m)$

$c_m(s,a) = \frac{N_m(s,a)}{M}$  
$\Omega = 0$



In [20]:
class Model:
    def __init__(self, S,A, Emin=0.01, Omega=1, M=1e2):
        self.S = S
        self.A = A
        self.Models = []
        self.E = []
        self.Emin = Emin
        self.Omega = Omega
        self.M = M
        self.new_model()

    def new_model(self, M=1e2):
        self.t = Estimator_T(self.M, len(self.S), len(self.A) ,len(self.S))
        self.r = Estimator(self.M, len(self.S), len(self.A) ,len(self.S))
        self.m_cur = len(self.Models)
        self.Models.append((self.t, self.r))
        self.E.append(0)

    def calculate_e(self, s,a,i):
        t, r = self.Models[i] 
        cm = np.sum(t.n[s,a])/t.M
        return cm*(self.Omega*r.e + (1-self.Omega)*t.e)
    def calculate_E(self, s,a,i, rho=1):
        e = self.calculate_e(s,a,i)
        return self.E[i] + rho*(e-self.E[i])

    def learn(self, s,a,s_,r, log=False):
        self.E = [self.calculate_E(s,a,i) for i,_ in enumerate(self.Models)]
        self.m_cur = np.argmax(self.E)
        self.t, self.r = self.Models[self.m_cur]
        if log:
            print('E: ', self.E, self.E[self.m_cur] < self.Emin)
        if self.E[self.m_cur] < self.Emin:
            self.new_model()

        self.t.train(1, s,a,s_)
        self.r.train(r, s,a,s_)
    
    def predict(self, s, a): # simulate
        self.E = [self.calculate_E(s,a,i) for i,_ in enumerate(self.E)]
        self.m_cur = np.argmax(self.E)
        self.t, self.r = self.Models[self.m_cur]
        s_ = np.random.choice(len(self.S), p=softmax(self.t.forward(s,a)))
        r = self.r.forward(s,a,s_)
        return s_,r
        
    def T(self, s,a,s_): # predict_T
        return self.t.forward(s,a,s_)
    def R(self, s,a,s_):
        return self.r.forward(s,a,s_)

model = Model(range(10), range(4))

# Simulação

In [35]:
d1 = np.random.choice(2, 10, p=[.2, .8])
print(d1)

d2 = np.random.choice(2, 10, p=[.5, .5])
print(d2)

d = np.concatenate([d1,d2])
print(d)

[1 1 1 0 1 0 1 1 0 1]
[1 0 0 0 0 1 0 0 0 1]
[1 1 1 0 1 0 1 1 0 1 1 0 0 0 0 1 0 0 0 1]


In [21]:
c1 = .7
n = 100
data = np.random.choice(2, 100, p=[1-c1, c1])

print(data.sum()/data.size)
data

0.74


array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0])

In [36]:
est = Estimator(1e6, 1)
for p in data:
    est.train(p, 0)
    # print(est.e)

est.forward(0)

0.7326732673267325

In [37]:
agent = Model([0], [0])
for p in data:
    agent.learn(0,0,0,p, log=True)

agent.predict(0,0)[1]

E:  [0.0] True
E:  [0.0, 0.0] True
E:  [0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01] False
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668, 0.0, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668, 0.0, 0.0, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] True
E:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006666666666666668, 

0.6941176470588231