In [1]:
import random
import time

import numpy as np

from gym import Env
from gym.spaces import Discrete, Box

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

from py4j.java_gateway import JavaGateway


In [2]:
# Connect to the training server

# JavaGateway() is created with no arguments, so whatever connection defaults
# py4j uses apply here.
# NOTE(review): presumably the Java training server must already be running
# before this cell is executed — confirm.
gateway = JavaGateway()
# Java-side entry point object; the rest of the notebook calls its ranges()
# and novoEpisodio() methods.
minitruco_java = gateway.entry_point


In [3]:
# Custom gym Env that wraps the training server

class MinitrucoEnv(Env):
    """Gym environment that plays episodes on the Java training server.

    An action is the index (0, 1 or 2) of the card in hand to play. An
    observation is the whitespace-separated numeric state string reported by
    the current episode, parsed into a float32 vector. Its first three
    elements are the cards in hand, and a value of -1 in one of those slots
    marks that slot as unplayable.

    Rewards are sparse: +1.0 when the server reports "EQUIPE 1 VENCEU",
    -1.0 when it reports "EQUIPE 2 VENCEU", and 0.0 on every other step.
    NOTE(review): presumably "EQUIPE 1" is the agent's team — confirm.
    """

    def __init__(self):
        # Actions we can take: play card 0, card 1 or card 2
        self.action_space = Discrete(3)

        # Observation space: the first 3 elements are the cards in hand,
        # the rest are the values of the cards on the table, scores, etc.
        # Details in SituacaoJogo.java#ranges and #toObservacao
        low, high = np.array(minitruco_java.ranges()).transpose()
        self.observation_space = Box(
            low=np.asarray(low, dtype=np.float32),
            high=np.asarray(high, dtype=np.float32),
            dtype=np.float32,
        )

        # Handle of the episode currently running on the server (None until reset()).
        self.episodio = None
        # Last observation handed to the agent (None until reset()).
        self.state = None

    def _get_obs(self):
        """Fetch the current episode state and parse it into a float32 vector.

        Must only be called while the episode is still running: the terminal
        "EQUIPE n VENCEU" strings are not numeric and would fail to parse.
        """
        values = self.episodio.estado().split()
        # Return an ndarray with the dtype declared by observation_space
        # rather than a plain Python list.
        return np.array([float(val) for val in values], dtype=np.float32)

    def step(self, action):
        """Play the card at index `action` and advance the episode.

        Returns:
            A classic gym 4-tuple ``(observation, reward, done, info)``.
            When the episode ends, the observation returned is the last one
            seen before the final move, since the terminal state string
            carries no numeric observation.

        Raises:
            RuntimeError: if called before the first ``reset()``.
        """
        if self.state is None or self.episodio is None:
            raise RuntimeError("reset() must be called before step()")

        action = int(action)
        last_observation = self.state
        if last_observation[action] == -1:
            # Invalid move (should not happen in Keras because of the masking,
            # but if it does, e.g. during the initial random test, just ignore
            # it and stay in the same state)
            return last_observation, 0.0, False, {}

        self.episodio.executa(action)
        estado_str = self.episodio.estado()
        if estado_str == "EQUIPE 1 VENCEU":
            return last_observation, 1.0, True, {}
        if estado_str == "EQUIPE 2 VENCEU":
            return last_observation, -1.0, True, {}

        self.state = self._get_obs()
        return self.state, 0.0, False, {}

    def render(self, mode='human'):
        """No-op; `mode` is accepted only so gym-style render(mode=...) calls work."""
        pass

    def reset(self, seed=None, options=None):
        """Finish the running episode (if any), start a new one and return its first observation.

        `seed` and `options` are accepted for signature compatibility and are
        not used.
        """
        if self.episodio is not None:
            self.episodio.finaliza()

        self.episodio = minitruco_java.novoEpisodio()
        self.state = self._get_obs()

        return self.state


env = MinitrucoEnv()




In [4]:
# Run a few episodes with a "play a random card" policy to sanity-check the
# environment and get a baseline average reward.

episodes = 50
total_reward = 0
for episode in range(1, episodes+1):
    state = env.reset()
    terminated = False
    score = 0
    # perf_counter() is a monotonic clock meant for measuring elapsed time,
    # unlike time.time(), which can jump if the system clock is adjusted.
    start = time.perf_counter()

    while not terminated:
        # Sampling may pick an empty card slot; env.step() ignores such moves
        # and returns the same state with zero reward.
        action = env.action_space.sample()
        n_state, reward, terminated, info = env.step(action)
        score += reward
    total_reward += score
    print('Episode:{} Score:{} Time:{}s'.format(episode, score, time.perf_counter() - start))
print("Average reward per episode {}".format(total_reward/episodes))


Episode:1 Score:-1.0 Time:0.48358702659606934s
Episode:2 Score:-1.0 Time:0.47749781608581543s
Episode:3 Score:1.0 Time:0.4682190418243408s
Episode:4 Score:1.0 Time:0.516963005065918s
Episode:5 Score:-1.0 Time:0.5109500885009766s
Episode:6 Score:-1.0 Time:0.5222787857055664s
Episode:7 Score:1.0 Time:0.4657111167907715s
Episode:8 Score:1.0 Time:0.5382301807403564s
Episode:9 Score:1.0 Time:0.4107491970062256s
Episode:10 Score:-1.0 Time:0.3027799129486084s
Episode:11 Score:-1.0 Time:0.40374207496643066s
Episode:12 Score:1.0 Time:0.473527193069458s
Episode:13 Score:1.0 Time:0.4430389404296875s
Episode:14 Score:-1.0 Time:0.45159912109375s
Episode:15 Score:1.0 Time:0.503068208694458s
Episode:16 Score:-1.0 Time:0.39084696769714355s
Episode:17 Score:-1.0 Time:0.4611020088195801s
Episode:18 Score:1.0 Time:0.5063929557800293s
Episode:19 Score:-1.0 Time:0.5153799057006836s
Episode:20 Score:-1.0 Time:0.4199531078338623s
Episode:21 Score:-1.0 Time:0.4751548767089844s
Episode:22 Score:-1.0 Time:0.412

In [5]:
# Shape tuple of the observation Box and the number of discrete actions;
# both are used below to size the network's input and output layers.
states = env.observation_space.shape
actions = env.action_space.n

def build_model(states, actions):
    """Build the fully connected network that estimates one Q-value per action.

    Args:
        states: shape tuple of a single observation; only ``states[0]`` (the
            observation length) is used.
        actions: number of discrete actions, i.e. the size of the output layer.

    Returns:
        An uncompiled ``Sequential`` model: Flatten -> Dense(24, relu) ->
        Dense(24, relu) -> Dense(actions, linear).
    """
    model = Sequential()
    # The input carries a leading axis of size 1 that Flatten collapses.
    # NOTE(review): presumably this axis is the agent memory's
    # window_length (1 in this notebook) — confirm if that value changes.
    model.add(Flatten(input_shape=(1, states[0])))
    # No input_shape here: only the first layer of a Sequential model defines
    # the input, and the previous value (`states`) did not match it anyway.
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    # Linear output: Q-values are unbounded regression targets.
    model.add(Dense(actions, activation='linear'))
    return model

# Build the network for this environment's observation and action sizes.
model = build_model(states, actions)

# Print the per-layer output shapes and parameter counts as a sanity check.
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 27)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                672       
_________________________________________________________________
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 75        
Total params: 1,347
Trainable params: 1,347
Non-trainable params: 0
_________________________________________________________________


In [6]:
class MaskingDQNAgent(DQNAgent):
    """DQNAgent variant that rules out invalid actions when computing Q-values.

    An action is invalid when the matching slot of the observation holds -1.
    Invalid actions get a Q-value of -inf; valid actions keep the value
    predicted by the network unchanged.
    """

    def compute_q_values(self, state):
        """Return the network's Q-values for `state`, with invalid actions set to -inf."""
        q_values = super().compute_q_values(state)
        valid = np.array([self._is_action_valid(state, action)
                          for action in range(self.nb_actions)])
        # Leave valid entries exactly as predicted. The previous additive mask
        # used 1 (not 0) for valid actions, shifting every valid Q-value by +1.
        return np.where(valid, q_values, -np.inf)

    def _is_action_valid(self, state, action):
        """Tell whether `action` refers to a playable slot in the first observation of `state`."""
        # `state` is indexed as a sequence of observations and only element 0
        # is inspected.
        # NOTE(review): this is sufficient while the agent memory uses
        # window_length=1; revisit if a longer window is configured.
        return state[0][action] != -1

def build_agent(model, actions):
    """Assemble the masking DQN agent around `model`.

    Args:
        model: the Keras network that predicts one Q-value per action.
        actions: number of discrete actions.

    Returns:
        An uncompiled ``MaskingDQNAgent`` using a Boltzmann exploration policy
        and a 50000-entry replay memory with a window of one observation.
    """
    # NOTE(review): nb_steps_warmup=10 and target_model_update=1e-2 are passed
    # straight through to DQNAgent; see the keras-rl docs for their meaning.
    return MaskingDQNAgent(
        model=model,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_actions=actions,
        nb_steps_warmup=10,
        target_model_update=1e-2,
    )

dqn = build_agent(model, actions)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# The original comment here read "the original was 50K steps, let's go slowly".
# NOTE(review): nb_steps is still 50000, so either that note is stale or the
# step count was meant to be lowered — confirm.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)


2023-10-26 18:28:49.504254: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-26 18:28:49.525630: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f7f7bb0b680 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-10-26 18:28:49.525651: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


Training for 50000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
    6/10000 [..............................] - ETA: 2:56 - reward: 0.0000e+00



197 episodes - episode_reward: -0.411 [-1.000, 1.000] - loss: 0.046 - mae: 1.259 - mean_q: 2.201

Interval 2 (10000 steps performed)
198 episodes - episode_reward: -0.232 [-1.000, 1.000] - loss: 0.014 - mae: 0.827 - mean_q: 1.346

Interval 3 (20000 steps performed)
198 episodes - episode_reward: -0.354 [-1.000, 1.000] - loss: 0.010 - mae: 0.585 - mean_q: 0.919

Interval 4 (30000 steps performed)
199 episodes - episode_reward: -0.367 [-1.000, 1.000] - loss: 0.010 - mae: 0.551 - mean_q: 0.822

Interval 5 (40000 steps performed)
 1417/10000 [===>..........................] - ETA: 3:03 - reward: -0.0014

In [None]:
# Evaluate the trained agent over 50 episodes with rendering disabled.
scores = dqn.test(env, nb_episodes=50, visualize=False)
# Report the mean of the 'episode_reward' entries recorded in the returned history.
print("Average reward per episode: {}".format(np.mean(scores.history['episode_reward'])))


Testing for 50 episodes ...
Episode 1: reward: 1.000, steps: 55


Episode 2: reward: 1.000, steps: 52
Episode 3: reward: -1.000, steps: 43
Episode 4: reward: -1.000, steps: 60
Episode 5: reward: -1.000, steps: 49
Episode 6: reward: -1.000, steps: 55
Episode 7: reward: 1.000, steps: 54
Episode 8: reward: 1.000, steps: 51
Episode 9: reward: -1.000, steps: 49
Episode 10: reward: 1.000, steps: 44
Episode 11: reward: -1.000, steps: 49
Episode 12: reward: -1.000, steps: 42
Episode 13: reward: -1.000, steps: 50
Episode 14: reward: -1.000, steps: 51
Episode 15: reward: 1.000, steps: 49
Episode 16: reward: -1.000, steps: 40
Episode 17: reward: -1.000, steps: 60
Episode 18: reward: -1.000, steps: 42
Episode 19: reward: -1.000, steps: 59
Episode 20: reward: -1.000, steps: 50
Episode 21: reward: -1.000, steps: 39
Episode 22: reward: 1.000, steps: 54
Episode 23: reward: -1.000, steps: 53
Episode 24: reward: -1.000, steps: 44
Episode 25: reward: -1.000, steps: 50
Episode 26: reward: 1.000, steps: 53
Episode 27: reward: 1.000, steps: 55
Episode 28: reward: -1.000, 