In [26]:
import numpy as np
import scipy as sp
# import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from src.river import River

# Definição do Estimador:


In [27]:
class Estimator(nn.Module):
    """Two-layer feed-forward regressor: Linear -> ReLU -> Linear.

    The network receives one scalar per entry of ``inputs`` and produces one
    scalar per entry of ``outputs``; only the *lengths* of those sequences
    determine the layer sizes.

    Args:
        inputs: sequence whose length is the number of input features.
        outputs: sequence whose length is the number of output features.
            ``sum(outputs) - 1`` is used as the upper clamp in
            ``decode_output``.
        hidden_size: width of the hidden layer.
    """

    def __init__(self, inputs, outputs, hidden_size=10):
        super().__init__()
        self.inputs = inputs
        self.outputs = outputs

        self.relu = nn.ReLU()
        self.sig = nn.Sigmoid()
        self.fc1 = nn.Linear(len(inputs), hidden_size)
        self.fc3 = nn.Linear(hidden_size, len(outputs))

    def encode_inputs(self, *data):
        """Pack the positional scalars into a 1-D float tensor."""
        return torch.tensor(data).float()

    def decode_output(self, data):
        """Clamp ``data`` into ``[0, sum(self.outputs) - 1]``."""
        return torch.clamp(data, min=0.0, max=sum(self.outputs)-1)

    def train(self, loss=True, optimizer=None, *, mode=None):
        """Run one optimisation step, or switch training mode.

        This method shares its name with ``nn.Module.train(mode)``, which
        PyTorch itself calls (``eval()`` is ``self.train(False)``). To keep
        both usages working it dispatches on its arguments:

        * ``train(loss, optimizer)`` zeroes the gradients, back-propagates
          ``loss`` and applies one optimizer step. Returns ``None``.
        * ``train()``, ``train(flag)`` or ``train(mode=flag)`` delegate to
          ``nn.Module.train`` and return ``self``.

        Args:
            loss: scalar loss tensor when ``optimizer`` is given; otherwise
                the boolean training-mode flag.
            optimizer: optimizer that owns this module's parameters.
            mode: keyword-only training-mode flag, accepted for
                compatibility with ``nn.Module.train(mode=...)``.
        """
        if mode is not None:
            return super().train(mode)
        if optimizer is None:
            # Standard nn.Module behaviour; rejects non-boolean flags.
            return super().train(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def forward(self, *inputs):
        """Return the raw (unclamped) network output for the given scalars."""
        x = self.encode_inputs(*inputs)
        x = self.relu(self.fc1(x))
        return self.fc3(x)

# Modelo
#### Estimadores: 
 
$ \{ \hat{T}, \hat{R} \} $ 

In [28]:
class Model:
    """Learned environment model: a transition estimator plus a reward table.

    * Transitions: ``self.estimator`` regresses the next-state index from
      the pair ``(s, a)`` and is trained with SGD on an MSE loss.
    * Rewards: ``self.r[s, a, s_]`` stores the most recently observed reward
      for each transition (zero until observed).

    Args:
        S: sized collection of states; only ``len(S)`` is used here.
        A: sized collection of actions; only ``len(A)`` is used here.
        learning_rate: SGD learning rate for the transition estimator.
    """

    def __init__(self, S,A, learning_rate=0.01):
        self.S = S
        self.A = A

        self.estimator = Estimator([len(S),len(A)], [len(S)])

        self.optimizer = optim.SGD(self.estimator.parameters(), lr=learning_rate)
        self.loss_function = nn.MSELoss()
        # Raw estimator output of the most recent forward pass.
        self.estimatives = None

        self.r = np.zeros((len(S),len(A),len(S)))

    def loss(self, *data):
        """Loss between the stored estimate and the observed next state.

        Args:
            *data: the transition ``(s, a, s_, r)``; only ``s_`` (index 2)
                is used as the regression target.

        Raises:
            RuntimeError: if no estimate has been produced yet.
        """
        if self.estimatives is None:
            raise RuntimeError("no estimate available: call predict() or learn() first")
        target = torch.tensor([float(data[2])])
        return self.loss_function(self.estimatives, target)

    def learn(self, s,a,s_,r):
        """Fit the model on one observed transition and return the loss value.

        The forward pass is recomputed here so the gradient always belongs
        to ``(s, a)`` and a fresh graph exists on every call. This no longer
        depends on a preceding ``predict(..., register=True)``.
        """
        # Learn the transition model.
        self.estimatives = self.estimator(s, a)
        loss = self.loss(s, a, s_, r)
        self.estimator.train(loss, self.optimizer)
        # The graph was consumed by backward(); keep only the value.
        self.estimatives = self.estimatives.detach()

        # Learn the reward model.
        self.r[s,a,s_] = r
        return loss.item()

    def predict(self, s, a, register=False):
        """Predict ``(next_state, reward)`` for taking action ``a`` in ``s``.

        Args:
            s: state index.
            a: action index.
            register: when true the forward pass is run with autograd
                tracking, otherwise under ``torch.no_grad()``.

        Returns:
            Tuple of the predicted next-state index (raw output clamped to
            ``[0, len(S) - 1]`` and rounded) and the stored reward for that
            transition.
        """
        if register:
            raw = self.estimator(s, a)
        else:
            with torch.no_grad():
                raw = self.estimator(s, a)

        self.estimatives = raw
        # Clamping at 0 already covers what a ReLU would do.
        bounded = torch.clamp(raw, min=0.0, max=len(self.S)-1)
        s_ = torch.round(bounded).int().item()
        return s_, self.r[s,a, s_]

# model = Model(range(10), range(4))

# Algoritmo de Controle (RL):
 - Value Iteration

In [29]:
class QLearning:
    """Tabular Q-learning driven by simulated transitions from ``model``.

    Args:
        model: object exposing ``S``, ``A`` (sized collections) and
            ``predict(s, a) -> (next_state, reward)``.
        gamma: discount factor.
        alpha: step size of the Q update.
        max_iter: number of simulated steps per ``predict`` call.

    Attributes:
        Q: array of shape ``(len(S), len(A))`` with action values.
        pi: greedy action index per row of ``Q`` (set by ``control``).
    """

    def __init__(self, model, gamma=.9, alpha=.9, max_iter=10):
        self.model = model
        self.gamma = gamma
        self.alpha = alpha
        self.max_iter = max_iter
        # len() rather than .size so plain ranges/lists work too.
        self.Q = np.zeros((len(self.model.S), len(self.model.A)))
        self.control()

    def max_random(self, x):
        """Return the index of a maximum of ``x``, breaking ties at random."""
        x = np.asarray(x)
        return np.random.choice(np.flatnonzero(x == x.max()))

    def predict(self):
        """Update ``Q`` along a simulated greedy trajectory of ``max_iter`` steps.

        The trajectory starts in a state drawn uniformly from ``model.S``.
        Greedy column indices of ``Q`` are passed to the model as actions,
        so actions are assumed to be the integers ``0 .. len(A) - 1``.
        """
        s = np.random.choice(self.model.S)
        for _ in range(self.max_iter):
            a = self.max_random(self.Q[s])
            s_, r = self.model.predict(s, a)

            best_next = np.max([self.Q[s_, a_] for a_ in self.model.A])
            delta = r + self.gamma * best_next - self.Q[s, a]
            self.Q[s, a] += self.alpha * delta

            s = s_

    def control(self):
        """Recompute the greedy policy ``pi`` from the current ``Q``."""
        self.pi = [self.max_random(row) for row in self.Q]

# Agente

In [30]:
class Agent:
    """Model-based agent: learns a model online and plans on it with Q-learning.

    Args:
        S: sized collection of states.
        A: sized collection of actions.
        epsilon: model-error threshold; a pair ``(s, a)`` whose recorded
            error exceeds it triggers additional model training.
        gamma: discount factor forwarded to ``QLearning``.
        alpha: Q-update step size forwarded to ``QLearning``.
        max_iter: cap on model-training iterations per ``learn`` call, also
            forwarded to ``QLearning``.
        learning_rate: learning rate forwarded to ``Model``.

    Attributes:
        err: array ``(len(S), len(A))`` with the latest model error per
            pair, initialised to ones.
        guess: next state predicted by the model during the last ``act``.
    """

    def __init__(self, S, A, epsilon=0.3, gamma=.9, alpha=.9, max_iter=10, learning_rate=0.01):
        self.model = Model(S, A, learning_rate=learning_rate)
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.rl = QLearning(self.model, gamma, alpha, max_iter)
        # len() rather than .size, matching how Model sizes its tables.
        self.err = np.ones((len(S), len(A)))
        self.guess = None

    def learn(self, s,a,s_,r):
        """Incorporate one observed transition, then run a planning step."""
        if self.err[s, a] > self.epsilon:
            # Model is still inaccurate for (s, a): train until the loss
            # falls below epsilon or the iteration budget is exhausted.
            for _ in range(self.max_iter):
                self.model.predict(s, a, register=True)
                self.err[s, a] = self.model.learn(s, a, s_, r)
                if self.err[s, a] < self.epsilon:
                    break
        else:
            # Score the model's current prediction for this exact pair.
            # Recomputed here so learn() does not rely on a previous act().
            predicted = self.model.predict(s, a)[0]
            self.err[s, a] = (predicted - s_) ** 2

        self.rl.predict()
        self.rl.control()

    def act(self, s):
        """Sample a policy, return its action for ``s`` and store the model's guess."""
        pi = self.policy()
        a = pi[s]
        self.guess = self.model.predict(s, a)[0]
        return a

    def v(self):
        """State values: the maximum of each row of ``Q``."""
        return [np.max(row) for row in self.rl.Q]

    def q(self):
        """Return the action-value table ``Q``."""
        return self.rl.Q

    def policy(self):
        """Sample one action index per state.

        Per state, a softmax over ``Q`` is multiplied by the model error of
        each action and passed through a second softmax; the result is the
        sampling distribution.
        """
        # Explicit submodule import: `import scipy` alone does not expose
        # `scipy.special` on every SciPy version.
        from scipy.special import softmax

        vals = softmax(self.rl.Q, axis=1)
        weights = softmax(self.err * vals, axis=1)
        n_actions = weights.shape[1]
        return [np.random.choice(n_actions, p=w) for w in weights]
                

# Simulação

In [31]:
def generate_episode(env, agent, size_limit=100):
    """Run one episode of at most ``size_limit`` steps, learning after each.

    The environment is reset first. On every step the current state is
    taken from the first element of ``env.last()``, the agent chooses an
    action, ``env.step(state, action)`` is applied, and the resulting
    transition is handed to ``agent.learn``. The loop stops early as soon
    as the third value returned by ``env.step`` is truthy.

    Returns:
        List of ``(s, a, s_, r)`` tuples in the order they occurred.
    """
    env.reset()
    transitions = []
    for _ in range(size_limit):
        state, _, _, _, _ = env.last()
        action = agent.act(state)
        next_state, reward, finished, _, _ = env.step(state, action)

        transition = (state, action, next_state, reward)
        agent.learn(*transition)
        transitions.append(transition)

        if finished:
            break
    return transitions

def experiment(env, agent, max_iterations=1000, episode_sizes=100):
    """Run ``max_iterations`` consecutive episodes with the same agent.

    Each episode is produced by ``generate_episode`` with a step limit of
    ``episode_sizes``.

    Returns:
        List with one entry per episode, each being that episode's list of
        transitions.
    """
    return [
        generate_episode(env, agent, episode_sizes)
        for _ in range(max_iterations)
    ]



# Experimentação

In [32]:
# Seed both RNGs in use (NumPy for action sampling, PyTorch for the
# estimator's weight initialisation) so a fresh run is repeatable.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

env = River()

# Hyperparameters.
# This cell previously set learning_rate = 0.9 without passing it to Agent,
# so training silently used Agent's default of 0.01. The value below is the
# one that was actually in effect, and it is now forwarded explicitly.
learning_rate = 0.01
gamma = 0.9
alpha = 0.9
epsilon = 0.2
max_iter = 10
num_episodes = 100
episode_size = 12

agent = Agent(env.S, env.A, epsilon, gamma, alpha, max_iter, learning_rate=learning_rate)
# Predicted successor of state 0 for every action, before any training.
print([agent.model.predict(0,a)[0] for a in env.A])

[0, 0, 0, 0]


In [39]:
# Print the model's predicted successor of state 0 for every action, run
# the experiment (which trains the agent), then print the same predictions
# again for comparison. The collected episodes are kept in `exp`.
print([agent.model.predict(0, action)[0] for action in env.A])
exp = experiment(env, agent, max_iterations=num_episodes, episode_sizes=episode_size)
print([agent.model.predict(0, action)[0] for action in env.A])

[3, 2, 1, 0]
[2, 1, 0, 0]


In [34]:
# Predicted next state for every (state, action) pair:
# one row per state in env.S, one column per action in env.A.
print(np.array([
    [agent.model.predict(state, action)[0] for action in env.A]
    for state in env.S
]))

[[3 0 0 0]
 [3 1 0 0]
 [3 3 0 0]
 [3 3 2 0]
 [3 3 3 0]
 [3 3 3 2]
 [3 3 3 3]
 [3 3 3 3]
 [3 3 3 3]
 [3 3 3 3]]


In [35]:
# np.round(agent.q(),1)

In [36]:
# env.plot([max(s) for s in np.round(agent.q(), 1)])
# State values (row-wise maximum of Q) rounded to one decimal place.
env.plot(list(np.round(agent.v(), decimals=1)))
# One policy sample. NOTE(review): the second argument presumably makes
# River.plot render actions as arrows; confirm against src.river.
env.plot(agent.policy(), True)

 _____________________________ 
| -0.0| -0.0| -0.0| -0.0| -0.0|
|_____|_____|_____|_____|_____|
| -0.1| -0.0| -0.0| -0.0| -0.0|
|_____|_____|_____|_____|_____|

 _____________________________ 
|  ↓  |  ↓  |  →  |  ↑  |  ↓  |
|_____|_____|_____|_____|_____|
|  ↓  |  ↑  |  ↑  |  ↑  |  ↓  |
|_____|_____|_____|_____|_____|

