In [1]:
import numpy as np
# import pandas as pd

from src.river import River

# Definição do Estimador:

Frequência Relativa: 

$\hat{T}(s,a,s') = \frac{N_{s,a,s'} }{N_{s,a}+1}$ 

$\hat{R}(s,a,s') = \frac{\sum r_{s,a,s'} }{N_{s,a}+1}$

In [2]:
class Estimator:
    """Tabular, count-based estimator over a discrete index space.

    Two arrays of identical shape are maintained: ``n`` holds how many times
    each index tuple was observed and ``v`` holds the sum of the values
    observed there. Works for tables of any dimensionality.
    """

    def __init__(self, *inputs):
        """Allocate zero-filled tables; ``inputs`` are the sizes of each axis."""
        self.n = np.zeros(inputs)
        self.v = np.zeros(inputs)

    def train(self, value=1, *inputs):
        """Record one observation of ``value`` at the index tuple ``inputs``."""
        self.n[inputs] += 1
        self.v[inputs] += value

    def forward(self, *inputs):
        """Return the smoothed estimate stored at the index tuple ``inputs``.

        The result is ``v / (n + 1) * n / (N + 1)``, where ``v`` and ``n`` are
        the accumulated value and count at ``inputs`` and ``N`` is the total
        count over the last axis for the same leading indices.

        NOTE(review): this carries an extra ``n / (n + 1)`` factor compared
        with the ``v / (N + 1)`` formula in the notebook's markdown -- confirm
        which one is intended.
        """
        avg_value = self.v[inputs] / (self.n[inputs] + 1)
        # Total count across the last axis for the leading indices. Summing
        # only the relevant slice over axis -1 (instead of the whole table
        # over a hard-coded axis=2) supports tables that are not 3-D and
        # avoids re-reducing the full array on every query.
        total = self.n[inputs[:-1]].sum(axis=-1)
        amount = self.n[inputs] / (total + 1)
        return avg_value * amount

# Quick instantiation check with a 10x4 table; `model` is rebound to a Model
# instance in a later cell.
model = Estimator(10, 4)

# Modelo
#### Estimadores:  
$\{ \hat{T}, \hat{R} \}$ 

In [3]:
class Model:
    """Learned environment model backed by two ``Estimator`` tables.

    ``t`` accumulates a constant 1 per observed transition and ``r``
    accumulates the observed reward; both are indexed by ``(s, a, s_)``.
    """

    def __init__(self, S, A):
        """Store the state/action sets and allocate the two tables."""
        self.S, self.A = S, A
        table_shape = (len(S), len(A), len(S))
        self.t = Estimator(*table_shape)
        self.r = Estimator(*table_shape)

    def learn(self, s, a, s_, r):
        """Feed one observed transition ``(s, a, s_)`` with reward ``r`` to both tables."""
        transition = (s, a, s_)
        self.t.train(1, *transition)
        self.r.train(r, *transition)

    def T(self, s, a, s_):
        """Transition estimate for ``(s, a, s_)``, as returned by ``t.forward``."""
        estimate = self.t.forward(s, a, s_)
        return estimate

    def R(self, s, a, s_):
        """Reward estimate for ``(s, a, s_)``, as returned by ``r.forward``."""
        estimate = self.r.forward(s, a, s_)
        return estimate

# Example instantiation with 10 states and 4 actions. The Agent class builds
# its own Model, so this instance is not the one trained in the cells shown.
model = Model(range(10), range(4))

# Algoritmo de Controle (RL):
 - Value Iteration

In [4]:
class Value_Iteration:
    """Value iteration over a tabular model.

    ``model`` must expose ``S`` and ``A`` (sized iterables of integer state and
    action indices) and the callables ``T(s, a, s_)`` and ``R(s, a, s_)``.
    The current value estimates are kept in ``self.v``, indexed by state.
    """

    def __init__(self, model, gamma=.9, epsilon=.1):
        """Store the model and hyper-parameters and zero-initialise ``v``.

        ``gamma`` is the discount applied to next-state values and ``epsilon``
        scales the convergence threshold used by ``run``.
        """
        self.model = model
        self.gamma = gamma
        self.epsilon = epsilon
        # len() works for ranges, lists and 1-D arrays alike, and matches how
        # Model sizes its tables; ``.size`` only exists on numpy arrays.
        self.v = np.zeros(len(self.model.S))

    def bellman(self, s, a):
        """Return the one-step backup for taking action ``a`` in state ``s``.

        Sums ``T(s, a, s_) * (R(s, a, s_) + gamma * v[s_])`` over all ``s_``.
        """
        return sum(
            self.model.T(s, a, s_) * (self.model.R(s, a, s_) + self.gamma * self.v[s_])
            for s_ in self.model.S
        )

    def run(self):
        """Sweep all states in place until the largest change is below the threshold.

        The threshold is ``epsilon * (1 - gamma) / (2 * gamma)``. With
        ``gamma == 0`` that expression would divide by zero; a single sweep is
        already exact in that case (the backup no longer depends on ``v``), so
        the threshold is treated as infinite.
        """
        if self.gamma:
            threshold = (self.epsilon * (1 - self.gamma)) / (2 * self.gamma)
        else:
            threshold = float("inf")
        while True:
            previous = self.v.copy()
            for s in self.model.S:
                self.v[s] = np.max([self.bellman(s, a) for a in self.model.A])
            if np.linalg.norm(previous - self.v, ord=np.inf) < threshold:
                break

    def control(self):
        """Return, for every state in ``S`` order, the position in ``A`` of the
        action with the highest ``bellman`` backup under the current ``v``."""
        return [
            np.argmax([self.bellman(s, a) for a in self.model.A])
            for s in self.model.S
        ]

# Agente

In [5]:
class Agent:
    """Model-based agent: learns a ``Model`` online and re-plans with ``Value_Iteration``."""

    def __init__(self, S, A, gamma=.9, epsilon=.1):
        """Build the model over ``S`` x ``A`` and the planner that reads from it."""
        self.gamma, self.epsilon = gamma, epsilon
        self.model = Model(S, A)
        self.rl = Value_Iteration(self.model, self.gamma, self.epsilon)

    def learn(self, s, a, s_, r):
        """Add one observed transition to the model, then re-run the planner."""
        self.model.learn(s, a, s_, r)
        self.rl.run()

    def act(self, s):
        """Return the current policy's action for state ``s``."""
        return self.get_policy()[s]

    def evaluate(self, s):
        """Return the current value estimate of state ``s``."""
        return self.get_v()[s]

    def get_v(self):
        """Return the planner's value array (the live object, not a copy)."""
        return self.rl.v

    def get_policy(self):
        """Return the per-state action list computed by the planner's ``control``."""
        return self.rl.control()
                

# Simulação

In [6]:
def generate_episode(env, agent, size_limit=100):
    """Run one episode of at most ``size_limit`` steps, training the agent as it goes.

    The environment is reset first. On every step the current state is read
    from ``env.last()``, the agent picks an action, the environment is
    stepped, and the agent learns from the resulting transition. The episode
    stops early when the environment reports its end flag.

    Returns the list of ``(s, a, s_, r)`` tuples in the order they occurred.
    """
    env.reset()
    transitions = []
    for _step in range(size_limit):
        state, _, _, _, _ = env.last()
        action = agent.act(state)
        next_state, reward, done, _, _ = env.step(state, action)

        transition = (state, action, next_state, reward)
        agent.learn(state, action, next_state, reward)
        transitions.append(transition)
        if done:
            break
    return transitions

def experiment(env, agent, max_iterations=1000, episode_sizes=100):
    """Run ``max_iterations`` consecutive episodes with the same env and agent.

    Each episode is produced by ``generate_episode`` with a step cap of
    ``episode_sizes``. Returns a list with the number of steps of each
    episode, in run order.
    """
    return [
        len(generate_episode(env, agent, episode_sizes))
        for _ in range(max_iterations)
    ]



# Experimentação

In [28]:
# Environment under study; it supplies the state set `S` and action set `A`.
env = River()

epsilon = 0.1       # scales the value-iteration convergence threshold
gamma = 0.9         # discount factor used in the Bellman backup
num_episodes = 100  # number of episodes run by experiment()
episode_size = 12   # maximum number of steps per episode

# Fresh agent with an empty model over the environment's states and actions.
agent = Agent(env.S, env.A, gamma, epsilon)

In [31]:
# Train the agent; `exp` holds the number of steps taken in each episode.
exp = experiment(env, agent, num_episodes, episode_size)
# exp

In [32]:
# Show the learned state values (rounded to 2 decimals), then the policy.
env.plot(list(np.round(agent.get_v(), 2)))
# NOTE(review): the second argument presumably switches River.plot to arrow/policy rendering -- confirm.
env.plot(agent.get_policy(), True)

 _____________________________ 
|-3.89|-3.28| -2.6|-1.83|-0.97|
|_____|_____|_____|_____|_____|
|-4.42|-3.85|-3.13|-2.23| 0.0 |
|_____|_____|_____|_____|_____|

 _____________________________ 
|  →  |  →  |  →  |  →  |  ↓  |
|_____|_____|_____|_____|_____|
|  ↑  |  →  |  →  |  →  |  ↑  |
|_____|_____|_____|_____|_____|



In [33]:
# Inspect the learned transition estimate for s=0, a=0, s'=5.
agent.model.T(0,0,5)

0.9864408348373362