In [1]:
import numpy as np
# import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from src.river import River

# Definição do Esimador:


In [2]:
class Estimator(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=10):
        super(Estimator, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        # self.softmax = nn.Softmax(dim=-1)

    def train(self, loss, optimizer):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        # x = self.softmax(x)
        return x

class EstimatorT(Estimator):
    def __init__(self, input_size, output_size, hidden_size=10):
        super(EstimatorT, self).__init__(input_size, output_size, hidden_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = super(EstimatorT, self).forward(x)
        x = self.softmax(x)
        return x

# Modelo
#### Estimadores: 
 
$ \{ \hat{T}, \hat{R} \} $ 

In [3]:
class Model:
    def __init__(self, S,A, learning_rate=0.01):
        self.S = S
        self.A = A

        self.t_estimator = EstimatorT(len(S)+len(A), len(S))
        
        self.build_t()

        self.optimizer_t = optim.SGD(self.t_estimator.parameters(), lr=learning_rate)
        self.loss_t = nn.NLLLoss()
        self.estimatives_t = None
        
        self.r = np.zeros((len(S),len(A),len(S)))

    def build_input(self, s, a):
        return torch.cat([
            nn.functional.one_hot(torch.tensor([s]), num_classes=len(self.S)),
            nn.functional.one_hot(torch.tensor([a]), num_classes=len(self.A))
        ], dim=1).float()
    
    def build_t(self):
        with torch.no_grad():
            p = lambda s,a : self.t_estimator.forward(self.build_input(s,a))
            self.t = np.array([[p(s,a)[0].tolist() for a in self.A] for s in self.S]) 


    def learn(self, s,a,s_,r):
        # Learn T model
        obs = torch.tensor((s,a,s_,r)).float().unsqueeze(0)
        obs_t = obs[:,2]
        pred_t = self.estimatives_t
        lt = self.loss_t(pred_t, obs_t.long())
        self.t_estimator.train(lt, self.optimizer_t)
        self.estimatives_t = None

        self.build_t()
        
        # Learn R model
        self.r[s,a,s_] = r
    
    def predict(self, s, a, register=False):
        def estimate_s():
            x = self.build_input(s,a)
            p = self.t_estimator.forward(x)
            distribution = torch.distributions.Categorical(p)
            s_ = distribution.sample()
            # log_prob = distribution.log_prob(s_)
            return s_, torch.log(p)

        if register:
            s_, log_probs = estimate_s()
            self.estimatives_t = log_probs
        else:
            with torch.no_grad():
                s_, _ = estimate_s()

        return s_.item()
    
    def T(self, s,a,s_):
        return self.t[s,a,s_]
    
    def R(self, s,a,s_):
        return self.r[s,a,s_]

# model = Model(range(10), range(4))

# Algorítimo de Controle (RL):
 - Value Interation

In [4]:
class Value_Iteration:
    def __init__(self, model, gamma=.9, epsilon=.1):
        self.model = model
        self.gamma = gamma
        self.epsilon = epsilon
        self.v = np.zeros(self.model.S.size)
        self.control()

    def bellman(self, s,a):
        return sum([self.model.T(s,a,s_)*(self.model.R(s,a,s_)+self.gamma*self.v[s_]) for s_ in self.model.S])
    
    def run(self):
        while True:
            v = self.v.copy()
            for s in self.model.S:
                self.v[s] = np.max([self.bellman(s,a) for a in self.model.A])
            if np.linalg.norm(v-self.v, ord=np.inf) < (self.epsilon*(1-self.gamma))/(2*self.gamma):
                break

    def control(self):
        self.pi = [np.argmax([np.sum([self.model.T(s,a,s_)*(self.model.R(s,a,s_) + self.gamma*self.v[s_]) for s_ in self.model.S]) for a in self.model.A]) for s in self.model.S]

# Agente

In [5]:
class Agent:
    def __init__(self, S, A, gamma=.9, epsilon=.1, learning_rate=0.01):
        self.gamma = gamma
        self.epsilon = epsilon
        self.model = Model(S, A, learning_rate=learning_rate)
        self.rl = Value_Iteration(self.model, self.gamma, self.epsilon)
    
    def learn(self, s,a,s_,r):
        self.model.learn(s,a,s_,r)
        self.rl.run()
        self.rl.control()

    def act(self, s):
        pi = self.get_policy()
        self.model.predict(s, pi[s], register=True)
        return pi[s]
    def evaluate(self, s):
        v = self.get_v()
        return v[s]
    
    def get_v(self):
        return self.rl.v
    def get_policy(self):
        return self.rl.pi
                

# Simulação

In [6]:
def generate_episode(env,agent, size_limit=100):
    data = []
    env.reset()
    for t in range(size_limit):
        # print('t: ',t)
        s, _, _, _, _ = env.last()
        a = agent.act(s)
        s_, r, end, _, _ = env.step(s,a)
        
        step = (s,a,s_,r) 
        agent.learn(*step) 
        data.append(step)
        if end: break
    return data

def experiment(env, agent, max_iterations=1000, episode_sizes=100):
    data = []
    for i in range(max_iterations):
        epi = generate_episode(env, agent, episode_sizes)
        # print(f'{i}: {len(epi)}')
        data.append(len(epi))    
    return data



# Experimentação

In [7]:
env = River()

learning_rate=0.1
epsilon = 0.1
gamma = 0.9
num_episodes = 100
episode_size = 12

agent = Agent(env.S, env.A, gamma, epsilon)

In [8]:
s,a,s_=0,0,0
# p = [agent.model.T(s,a,s_) for a in env.A]
p = [np.round(agent.model.T(s,a,s_),2) for s_ in env.S]
print(agent.model.predict(s,a))
print(p)
sum(p)

6
[0.09, 0.13, 0.07, 0.07, 0.07, 0.1, 0.15, 0.12, 0.08, 0.12]


1.0

In [9]:
exp = experiment(env, agent, num_episodes, episode_size)
# exp

In [10]:
s,a,s_=0,0,0
# p = [agent.model.T(s,a,s_) for a in env.A]
p = [np.round(agent.model.T(s,a,s_),2) for s_ in env.S]
print(agent.model.predict(s,a))
print(p)
sum(p)

5
[0.16, 0.22, 0.05, 0.03, 0.03, 0.27, 0.09, 0.06, 0.05, 0.04]


1.0

In [11]:
env.plot(list(np.round(agent.get_v(), 1)))
env.plot(agent.get_policy(), True)

 _____________________________ 
| -2.1| -2.1| -2.0| -2.0| -2.0|
|_____|_____|_____|_____|_____|
| -2.2| -2.4| -2.3| -2.3| -1.9|
|_____|_____|_____|_____|_____|

 _____________________________ 
|  ←  |  →  |  ↑  |  →  |  ↑  |
|_____|_____|_____|_____|_____|
|  ↓  |  →  |  ↑  |  ↑  |  ↓  |
|_____|_____|_____|_____|_____|

