In [19]:
import numpy as np
import gymnasium as gym

In [20]:
# Definir um ambiente customizado para controle de glicemia
class DiabetesEnv(gym.Env):
    """Toy tabular environment for blood-glucose control.

    The state is a discrete glucose level in ``0..9`` (10 levels) and three
    actions are available: 0 = do nothing, 1 = take insulin, 2 = exercise.

    Randomness is drawn from NumPy's global generator (``np.random``), so
    seeding it (directly or via ``reset(seed=...)``) makes runs reproducible.

    Note:
        ``step`` returns the 4-tuple ``(state, reward, done, info)`` that the
        training/evaluation loops in this notebook unpack, not the 5-tuple
        ``(obs, reward, terminated, truncated, info)`` of the Gymnasium API.
    """

    def __init__(self):
        super().__init__()
        # Action and observation spaces
        self.action_space = gym.spaces.Discrete(3)  # [0: do nothing, 1: take insulin, 2: exercise]
        self.observation_space = gym.spaces.Discrete(10)  # Example: 10 glucose levels

        self.state = 5  # Initial state (mid-range glucose level)
        self.done = False

    def step(self, action):
        """Apply ``action`` and advance the simulation by one step.

        Args:
            action: 0 (do nothing), 1 (take insulin) or 2 (exercise). Any
                other value leaves the glucose level unchanged.

        Returns:
            tuple: ``(state, reward, done, info)``. ``reward`` is ``1`` when
            the new level lies in ``3..7`` and ``-1`` otherwise; ``done`` is
            True once the level reaches 0 or the highest level; ``info`` is
            an empty dict.
        """
        # Simulate the change in glucose level caused by the action.
        # np.random.randint excludes its upper bound.
        if action == 0:  # Do nothing
            self.state += np.random.randint(-1, 2)  # small fluctuation: -1, 0 or +1
        elif action == 1:  # Take insulin
            self.state -= np.random.randint(1, 3)  # glucose drops by 1 or 2
        elif action == 2:  # Exercise
            self.state -= np.random.randint(1, 2)  # glucose drops by exactly 1

        # Keep the level inside the observation space on both sides, so the
        # returned state is always a valid row index of a Q-table even if
        # step() is called again after the episode has ended.
        highest = int(self.observation_space.n) - 1
        self.state = min(max(self.state, 0), highest)

        # Penalise glucose levels that are too low or too high.
        if self.state < 3 or self.state > 7:
            reward = -1
        else:
            reward = 1  # Stable glucose

        # The episode ends when the level hits either end of the range.
        self.done = bool(self.state <= 0 or self.state >= highest)

        return self.state, reward, self.done, {}

    def reset(self, *, seed=None, options=None):
        """Start a new episode at the mid-range glucose level.

        Args:
            seed: Optional seed. When given, it also seeds NumPy's global
                generator, which is the source of randomness used by ``step``.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            tuple: ``(state, info)`` with ``state == 5`` and an empty ``info``.
        """
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed)
        self.state = 5  # Reset the glucose level
        self.done = False
        return self.state, {}

def eps_greedy(Q, s, eps=0.1):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    A uniform sample in ``[0, 1)`` is drawn from NumPy's global generator;
    if it is below ``eps`` a random action index in ``range(Q.shape[1])`` is
    returned, otherwise the greedy action for ``s`` is returned.

    Args:
        Q: Action-value table; its second dimension is the number of actions.
        s: Index of the current state (row of ``Q``).
        eps: Exploration probability.

    Returns:
        The index of the selected action.
    """
    explore = np.random.uniform(0, 1) < eps
    if not explore:
        return greedy(Q, s)
    n_actions = Q.shape[1]
    return np.random.randint(n_actions)

def greedy(Q, s):
    """Return the index of the highest-valued action for state ``s``.

    Ties are resolved by ``np.argmax``, i.e. the first maximal index wins.

    Args:
        Q: Action-value table indexed as ``Q[state]``.
        s: Index of the state to look up.
    """
    action_values = Q[s]
    return np.argmax(action_values)

def run_episodes(env, Q, num_episodes=100):
    """Evaluate the greedy policy of ``Q`` and return its mean episode reward.

    The environment is reset once up front and again after every finished
    episode. Actions are always chosen greedily (no exploration).

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns ``(state, reward, done, info)``.
        Q: Action-value table used to pick the greedy action.
        num_episodes: Number of episodes to play.

    Returns:
        ``np.mean`` of the per-episode reward totals.
    """
    episode_returns = []
    obs, _info = env.reset()

    for _episode in range(num_episodes):
        episode_return = 0
        finished = False
        while not finished:
            obs, reward, finished, _info = env.step(greedy(Q, obs))
            episode_return += reward
        # Episode over: get a fresh start state, then record the total.
        obs, _info = env.reset()
        episode_returns.append(episode_return)

    return np.mean(episode_returns)

In [21]:
# NOTE(review): an identical eps_greedy is defined in an earlier cell; this
# re-definition shadows it, so one of the two copies can be dropped.
def eps_greedy(Q, s, eps=0.1):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    A uniform sample in ``[0, 1)`` is drawn from NumPy's global generator;
    if it is below ``eps`` a random action index in ``range(Q.shape[1])`` is
    returned, otherwise the greedy action for ``s`` is returned.

    Args:
        Q: Action-value table; its second dimension is the number of actions.
        s: Index of the current state (row of ``Q``).
        eps: Exploration probability.

    Returns:
        The index of the selected action.
    """
    explore = np.random.uniform(0, 1) < eps
    if not explore:
        return greedy(Q, s)
    n_actions = Q.shape[1]
    return np.random.randint(n_actions)

# NOTE(review): an identical greedy is defined in an earlier cell; this
# re-definition shadows it, so one of the two copies can be dropped.
def greedy(Q, s):
    """Return the index of the highest-valued action for state ``s``.

    Ties are resolved by ``np.argmax``, i.e. the first maximal index wins.

    Args:
        Q: Action-value table indexed as ``Q[state]``.
        s: Index of the state to look up.
    """
    action_values = Q[s]
    return np.argmax(action_values)

In [22]:
# NOTE(review): an identical run_episodes is defined in an earlier cell; this
# re-definition shadows it, so one of the two copies can be dropped.
def run_episodes(env, Q, num_episodes=100):
    """Evaluate the greedy policy of ``Q`` and return its mean episode reward.

    The environment is reset once up front and again after every finished
    episode. Actions are always chosen greedily (no exploration).

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns ``(state, reward, done, info)``.
        Q: Action-value table used to pick the greedy action.
        num_episodes: Number of episodes to play.

    Returns:
        ``np.mean`` of the per-episode reward totals.
    """
    episode_returns = []
    obs, _info = env.reset()

    for _episode in range(num_episodes):
        episode_return = 0
        finished = False
        while not finished:
            obs, reward, finished, _info = env.step(greedy(Q, obs))
            episode_return += reward
        # Episode over: get a fresh start state, then record the total.
        obs, _info = env.reset()
        episode_returns.append(episode_return)

    return np.mean(episode_returns)

In [23]:
def Q_learning(env, lr=0.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.0001):
    """Train a tabular Q-learning agent on ``env`` and return its Q-table.

    Every 300 episodes the current greedy policy is evaluated over 100
    episodes with ``run_episodes`` and a progress line is printed.

    Args:
        env: Environment exposing ``action_space.n``, ``observation_space.n``,
            ``reset() -> (state, info)`` and
            ``step(action) -> (state, reward, done, info)``.
        lr: Learning rate of the Q-value update.
        num_episodes: Number of training episodes.
        eps: Initial exploration probability for the epsilon-greedy policy.
        gamma: Discount factor.
        eps_decay: Amount subtracted from ``eps`` at the start of each episode
            while ``eps`` is above the 0.01 floor.

    Returns:
        numpy.ndarray: The learned Q-table of shape ``(n_states, n_actions)``.
    """
    nA = env.action_space.n
    nS = env.observation_space.n
    eps_floor = 0.01

    Q = np.zeros((nS, nA))

    for ep in range(num_episodes):
        state, _ = env.reset()
        done = False
        # Linear decay that stops exactly at the floor: without the clamp a
        # decay step could push eps below 0.01 (or below zero for a large
        # eps_decay). A caller-supplied eps at or below the floor is kept as is.
        if eps > eps_floor:
            eps = max(eps - eps_decay, eps_floor)

        while not done:
            action = eps_greedy(Q, state, eps)
            next_state, rew, done, _ = env.step(action)

            # Q-learning update. A terminal transition has no future return,
            # so the bootstrap term is dropped when the episode has ended.
            bootstrap = 0.0 if done else gamma * np.max(Q[next_state])
            Q[state, action] += lr * (rew + bootstrap - Q[state, action])

            state = next_state

        # Periodically report how the greedy policy performs.
        if (ep % 300) == 0:
            test_rew = run_episodes(env, Q, 100)
            print(f"Episode:{ep:5d}  Eps:{eps:.4f}  Rew:{test_rew:.4f}")

    return Q

In [24]:
if __name__ == '__main__':
    # Seed NumPy's global random generator so the run is reproducible.
    np.random.seed(42)
    env = DiabetesEnv()
    Q_qlearning = Q_learning(
        env,
        lr=0.1,
        num_episodes=5000,
        eps=0.5,
        gamma=0.95,
        eps_decay=0.0001,
    )
    # Show the learned action-value table (rows: glucose levels, columns: actions).
    print(Q_qlearning)

Episode:    0  Eps:0.4999  Rew:6.6800
Episode:  300  Eps:0.4699  Rew:19.0600
Episode:  600  Eps:0.4399  Rew:33.5900
Episode:  900  Eps:0.4099  Rew:30.1300
Episode: 1200  Eps:0.3799  Rew:33.9400
Episode: 1500  Eps:0.3499  Rew:33.6400
Episode: 1800  Eps:0.3199  Rew:35.6000
Episode: 2100  Eps:0.2899  Rew:39.7000
Episode: 2400  Eps:0.2599  Rew:35.8300
Episode: 2700  Eps:0.2299  Rew:42.1700
Episode: 3000  Eps:0.1999  Rew:40.5400
Episode: 3300  Eps:0.1699  Rew:36.3900
Episode: 3600  Eps:0.1399  Rew:39.4000
Episode: 3900  Eps:0.1099  Rew:33.4900
Episode: 4200  Eps:0.0799  Rew:19.6200
Episode: 4500  Eps:0.0499  Rew:27.3200
Episode: 4800  Eps:0.0199  Rew:38.2500
[[ 0.          0.          0.        ]
 [-1.38262463 -1.         -1.        ]
 [ 0.33708921 -1.57402799 -1.94729216]
 [ 4.88045743 -1.10838011  0.25137046]
 [ 9.38365397  2.0302528   5.87368261]
 [12.52525146  9.30185604  9.51110331]
 [13.84780889 11.2573141  12.58669453]
 [13.09458182 13.35575307 14.20964877]
 [ 1.61919981 13.70422941 