[Artigo: CDRL](https://drive.google.com/drive/u/0/folders/1NR9Ty7_5lV8Wpn0mNB2PUx1bozJZ-hVm)

In [2]:
import numpy as np
from scipy.special import softmax
# import pandas as pd

# Definição do Esimador:

$\hat{T}_m(s,a,k) = \hat{T}_m(s,a,k) + \Delta \hat{T}_m(k) , \; \forall k \in S$ 

$
    \Delta \hat{T}_m(k) = 
        \begin{cases}
            \frac{1- \hat{T}_m(s,a,k) }{N_m(s,a)+1} , & \text{if } k = s'\\
            \\
            \frac{0- \hat{T}_m(s,a,k) }{N_m(s,a)+1}, & \text{if } k \neq s'
        \end{cases} 
$  



$\hat{R}_m(s,a,s') = \hat{R}_m(s,a,s') + \Delta \hat{R}_m $ 

$\Delta \hat{R}_m = \frac{r- \hat{R}_m(s,a) }{N_m(s,a)+1}$  


### Qualidade Instantanea por estimador

$e^T_{m} = 1-2(Z_T \sum_{k \in S}\Delta\hat{T}_m(k)^2)$ 

$e^R_{m} = 1-2(Z_R \Delta\hat{R}_m^2)$ 


$Z_T= \frac{1}{2}(N(s,a)+1)^2$ \
$Z_R= (R_{\max}-R_{\min})^{-1} = (0 - (-1))^{-1} = 1$  


In [3]:
class Estimator:
    def __init__(self, M=1e6, *inputs):
        self.M = M
        self.n = np.zeros(inputs)     
        self.v = np.zeros(inputs) 
        # self.e = 0

    def e(self, value=1, *inputs):
        return 1-2 * (self.z(*inputs) * self.delta(value, *inputs)**2)

    def z(self, *inputs):
        return 1

    def delta(self, value, *inputs):
        return (value-self.v[inputs]) / (np.sum(self.n[inputs[:-1]])+1)

    def train(self, value=1, *inputs):
        self.n[inputs] = min(self.n[inputs]+1, self.M)
        delta = self.delta(value, *inputs)
        self.v[inputs] += delta
        
    def forward(self, *inputs):
        return self.v[inputs]
    
    
    
est = Estimator((5,3))

In [4]:
class Estimator_T(Estimator):    
    def e(self, value=1, *inputs):
        return 1-2*  self.z(*inputs)  * np.sum(self.delta(value, *inputs)**2)
    
    def z(self, *inputs):
        s,a,_ = inputs
        return (1/2)*(np.sum(self.n[s,a])+1)**2    

    def delta(self, value=1, *inputs):
        S = self.n.shape[0]
        s,a,s_ = inputs
        d = lambda k: (
                (value-self.v[s,a,k])/(np.sum(self.n[s,a])+1) 
                    if k==s_ else 
                (0-self.v[s,a,k])/(np.sum(self.n[s,a])+1)
            )
        return np.array([d(k) for k in range(S)])

    
    def train(self, value=1, *inputs):
        S = self.n.shape[0]
        s,a,s_ = inputs

        self.n[inputs] = min(self.n[inputs]+1, self.M)
        delta = self.delta(value, *inputs)
        for k in range(S):
            self.v[s,a,k] += delta[k]

# Modelo
#### Qualidade Instantanea modelo:  


$E_m = E_m + \rho(e_m - E_m)$

$\rho = 1$

$e_m = c_m(s,a) (\Omega e^R_m + (1-\Omega)e^T_m)$

$c_m(s,a) = \frac{N_m(s,a)}{M}$  
$\Omega = 0$



In [5]:
class Model:
    def __init__(self, S,A, Omega=.5, M=1e2, rho=.9):
        self.S = S
        self.A = A
        self.Omega = Omega
        self.M = M
        self.rho = rho
        self.N = 0

        self._E = 0 

        self.t = Estimator_T(self.M, len(self.S), len(self.A) ,len(self.S))
        self.r = Estimator(self.M, len(self.S), len(self.A) ,len(self.S))

    def e(self, s,a,s_,r):
        nm = min(np.sum(self.t.n[s,a])+1, self.t.M)
        cm = nm/self.t.M
        return cm*(self.Omega * self.r.e(r, s,a,s_) + (1-self.Omega) * self.t.e(1, s,a,s_))
    
    def E(self, s,a,s_,r):
        self._E += self.rho*(self.e(s,a,s_,r) - self._E)
        return self._E

    def learn(self, s,a,s_,r):
        self.N += 1 
        self.t.train(1, s,a,s_)
        self.r.train(r, s,a,s_)
    
    def simulate(self, s, a): 
        s_ = np.random.choice(len(self.S), p=softmax(self.t.forward(s,a)))
        r = self.r.forward(s,a,s_)
        return s_,r
        
    def T(self, s,a,s_): # predict_T
        return self.t.forward(s,a,s_)
    def R(self, s,a,s_):
        return self.r.forward(s,a,s_)

model = Model(range(10), range(4))

In [6]:
class RLCD:
    def __init__(self, S,A, Emin=-0.01, Omega=0, M=1e2):
        self.S = S
        self.A = A
        self.Models = []
        self.current_model = None
        self.Emin = Emin
        self.Omega = Omega
        self.M = M
        self.new_model()

    def new_model(self, M=1e2):
        self.current_model = Model(self.S, self.A, self.Omega, self.M)
        self.Models.append(self.current_model)

    def learn(self, s,a,s_,r, log=False):
        E = [m.E(s,a,s_,r) for m in self.Models]
        self.current_model = self.Models[ np.argmax(E) ]
        new_model = self.current_model._E < self.Emin
        
        if log:
            print('E: ', E, new_model)
        if new_model:
            self.new_model()

        self.current_model.learn(s,a,s_,r)
    
    def simulate(self, s, a):
        sims = [m.simulate(s,a) for m in self.Models]
        E = [m.E(s,a,s_,r) for m, (s_,r) in zip(self.Models, sims)]
        return sims[ np.argmax(E) ]


model = RLCD(range(10), range(4))

# Simulação

In [7]:
d1 = np.random.choice(2, 10, p=[.2, .8])
print(d1)

d2 = np.random.choice(2, 10, p=[.5, .5])
print(d2)

d = np.concatenate([d1,d2])
print(d)

[1 1 1 1 1 1 0 1 1 1]
[0 0 1 1 0 1 0 0 0 1]
[1 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1]


In [8]:
c1 = .7
n = 100
data = np.random.choice(2, n, p=[1-c1, c1])

print(data.sum()/data.size)
data

0.53


array([0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0])

In [11]:
est = Estimator_T(1e6, 2, 1, 2)
s = None
for s_ in data:
    if s is None:
        pass
    else:
        est.train(1, s,0,s_)
        print(s, s_, est.e(1,s,0,s_))
    s = s_

est.forward(0, 0, 1)

0 0 0.75
0 1 0.4444444444444444
1 1 0.75
1 0 0.4444444444444444
0 0 0.6875
0 1 0.48
1 0 0.6875
0 0 0.6388888888888888
0 1 0.4897959183673469
1 1 0.48
1 0 0.6388888888888888
0 0 0.609375
0 1 0.49382716049382713
1 0 0.7346938775510203
0 0 0.5900000000000001
0 0 0.6611570247933884
0 0 0.7152777777777777
0 1 0.33136094674556216
1 0 0.796875
0 1 0.4234693877551021
1 1 0.24691358024691357
1 0 0.75
0 0 0.6222222222222222
0 1 0.43359375
1 0 0.7933884297520661
0 1 0.4982698961937716
1 1 0.2152777777777778
1 1 0.33136094674556194
1 1 0.423469387755102
1 0 0.6222222222222222
0 1 0.5524691358024691
1 1 0.4335937499999999
1 0 0.6089965397923877
0 0 0.49861495844875336
0 1 0.5475
1 0 0.6512345679012348
0 1 0.5895691609977325
1 0 0.6869806094182827
0 1 0.6260330578512399
1 0 0.7175
0 0 0.40831758034026444
0 1 0.6163194444444444
1 0 0.743764172335601
0 0 0.4159999999999999
0 0 0.4600591715976331
0 1 0.5706447187928669
1 0 0.7665289256198348
0 1 0.6007653061224489
1 1 0.20415879017013217
1 0 0.74826388

0.5319148936170213

In [12]:
model = RLCD([0,1], [0])

s = None
for s_ in data:
    if s is None:
        pass
    else:
        model.learn(s,0,s_,0, log=True)
    s = s_

model.simulate(0,0)[0]

E:  [0.0] False
E:  [-0.0045000000000000005] False
E:  [-0.0004499999999999999] False
E:  [-0.004545] False
E:  [0.0115455] False
E:  [0.00790455] False
E:  [0.012790455] False
E:  [0.0228790455] False
E:  [0.018787904549999998] False
E:  [0.008628790454999999] False
E:  [0.0224628790455] False
E:  [0.03310343076169286] False
E:  [0.02918534307616929] False
E:  [0.03741853430761693] False
E:  [0.04374185343076169] False
E:  [0.057474185343076176] False
E:  [0.07120196398885306] False
E:  [0.030370196398885302] False
E:  [0.04932273392560282] False
E:  [0.043701504161791055] False
E:  [0.007745150416179104] False
E:  [0.0567745150416179] False
E:  [0.07703459436130465] False
E:  [0.05570345943613044] False
E:  [0.07307034594361306] False
E:  [0.06974453459436131] False
E:  [0.013519908004890668] False
E:  [0.024601990800489068] False
E:  [0.04122942984927966] False
E:  [0.07548008584207083] False
E:  [0.08378330270185413] False
E:  [0.056378330270185406] False
E:  [0.08607533302701856] 

0.0

In [42]:
n = 1000
sum([model.simulate(0,0)[0] for _ in range(n)])/n

0.524