In [1]:
import random
import time

import numpy as np
from gym import Env
from gym.spaces import Discrete, Box

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

from py4j.java_gateway import JavaGateway


In [2]:
# Connect to the training server through py4j.
# `minitruco_java` is the gateway's entry point object; the environment below
# calls `ranges()` and `novoEpisodio()` on it.

gateway = JavaGateway()
minitruco_java = gateway.entry_point


In [3]:
# Custom gym Env that wraps the training server

class MinitrucoEnv(Env):
    """Gym environment whose episodes are played on the Java training server.

    An episode is the object returned by ``minitruco_java.novoEpisodio()``.
    Its ``estado()`` method returns either a whitespace-separated list of
    numbers (the observation) or one of the terminal strings
    ``"EQUIPE 1 VENCEU"`` / ``"EQUIPE 2 VENCEU"``; ``executa(i)`` plays the
    card held in hand slot ``i``.
    """

    # Number of leading observation entries that hold the cards in hand.
    HAND_SIZE = 3
    # Observation entries that step() treats as the scores of team 1 and
    # team 2 (the layout is defined on the Java side, see
    # SituacaoJogo.java#toObservacao).
    TEAM1_SCORE_IDX = 13
    TEAM2_SCORE_IDX = 14

    def __init__(self):
        # Actions we can take: play a card of value 0 up to a card of value 14
        self.action_space = Discrete(15)

        # Observation space: the first 3 elements are the cards in hand, the
        # rest are the values of the cards on the table, scores, etc.
        # Details in SituacaoJogo.java#ranges and #toObservacao
        low, high = np.array(minitruco_java.ranges()).transpose()
        self.observation_space = Box(low=np.array(low), high=np.array(high), dtype=np.float32)

        self.episodio = None  # current server-side episode, created by reset()
        self.state = None     # last observation handed to the agent

    def _get_obs(self):
        """Read the episode's state string and parse it into a float32 vector.

        The dtype matches the one declared for ``observation_space``.
        """
        values = self.episodio.estado().split()
        return np.array([float(val) for val in values], dtype=np.float32)

    def step(self, action):
        """Play the card whose value equals ``action``.

        Args:
            action: card value to play (anything ``int()`` accepts).

        Returns:
            ``(observation, reward, done, info)``:

            * if no hand slot holds ``action``: the unchanged observation,
              reward 0.0, not done (the play is ignored);
            * if the server reports ``"EQUIPE 1 VENCEU"`` / ``"EQUIPE 2 VENCEU"``:
              the previous observation, reward +1.0 / -1.0, done;
            * otherwise: the new observation, and as reward the change in
              team 1's score minus the change in team 2's score.

        Raises:
            RuntimeError: if called before ``reset()``.
        """
        if self.state is None:
            raise RuntimeError("reset() must be called before step()")

        action = int(action)
        previous = self.state

        # The action is a card value; the server wants the hand slot index.
        card_index = next(
            (i for i in range(self.HAND_SIZE) if previous[i] == action), -1)
        if card_index == -1:
            # Invalid play (should not happen under Keras because of the
            # masking, but if it does, e.g. during the initial random test,
            # just ignore it and stay in the same state)
            return previous, 0.0, False, {}

        self.episodio.executa(card_index)
        status = self.episodio.estado()
        if status == "EQUIPE 1 VENCEU":
            return previous, 1.0, True, {}
        if status == "EQUIPE 2 VENCEU":
            return previous, -1.0, True, {}

        self.state = self._get_obs()
        team1_gain = self.state[self.TEAM1_SCORE_IDX] - previous[self.TEAM1_SCORE_IDX]
        team2_gain = self.state[self.TEAM2_SCORE_IDX] - previous[self.TEAM2_SCORE_IDX]

        # Positive when team 1 gained points, negative when team 2 did.
        # Cast so the reward is a plain Python float, like the terminal rewards.
        reward = float(team1_gain - team2_gain)

        return self.state, reward, False, {}

    def render(self, mode='human'):
        """No-op; accepts ``mode`` so ``env.render(mode=...)`` calls do not fail."""
        pass

    def reset(self, seed=None, options=None):
        """Finish the current episode (if any), start a new one, return its observation.

        ``seed`` and ``options`` are accepted but not used.
        """
        if self.episodio is not None:
            self.episodio.finaliza()

        self.episodio = minitruco_java.novoEpisodio()
        self.state = self._get_obs()

        return self.state


env = MinitrucoEnv()




In [4]:
# Run a few episodes with a "play a random card" policy as a baseline.
# Sampled actions that match no card in hand are ignored by env.step
# (zero reward, same state), so the loop simply keeps sampling.
# `time` is imported in the notebook's import cell.

episodes = 50
total_reward = 0
for episode in range(1, episodes+1):
    state = env.reset()
    terminated = False
    score = 0
    start = time.time()

    while not terminated:
        action = env.action_space.sample()
        n_state, reward, terminated, info = env.step(action)
        score += reward
    total_reward += score
    print('Episode:{} Score:{} Time:{}s'.format(episode, score, time.time() - start))
print("Average reward per episode {}".format(total_reward/episodes))


Episode:1 Score:1.0 Time:0.44654011726379395s
Episode:2 Score:-1.0 Time:0.46459007263183594s
Episode:3 Score:-1.0 Time:0.45887279510498047s
Episode:4 Score:1.0 Time:0.44832515716552734s
Episode:5 Score:-1.0 Time:0.5011961460113525s
Episode:6 Score:1.0 Time:0.48083996772766113s
Episode:7 Score:-1.0 Time:0.45574307441711426s
Episode:8 Score:-1.0 Time:0.40296292304992676s
Episode:9 Score:-1.0 Time:0.49876880645751953s
Episode:10 Score:-9.0 Time:0.47785305976867676s
Episode:11 Score:1.0 Time:0.44848012924194336s
Episode:12 Score:-1.0 Time:0.4601871967315674s
Episode:13 Score:1.0 Time:0.386016845703125s
Episode:14 Score:-1.0 Time:0.45891785621643066s
Episode:15 Score:-1.0 Time:0.46082401275634766s
Episode:16 Score:1.0 Time:0.45673179626464844s
Episode:17 Score:-1.0 Time:0.49923205375671387s
Episode:18 Score:-1.0 Time:0.43932223320007324s
Episode:19 Score:-1.0 Time:0.4599759578704834s
Episode:20 Score:1.0 Time:0.48914003372192383s
Episode:21 Score:-16.0 Time:0.37054896354675293s
Episode:22 S

In [5]:
# Network dimensions come from the environment: the shape of one observation
# and the number of discrete actions.
states = env.observation_space.shape
actions = env.action_space.n

def build_model(states, actions, window_length=1, hidden_units=24):
    """Build the feed-forward Q-network.

    Args:
        states: shape tuple of a single observation
            (e.g. ``env.observation_space.shape``).
        actions: number of discrete actions; size of the linear output layer.
        window_length: number of stacked observations per network input.
            Must match the ``window_length`` of the agent's memory.
        hidden_units: width of each of the two hidden ReLU layers.

    Returns:
        An uncompiled ``Sequential`` model that flattens an input of shape
        ``(window_length, *states)`` and outputs ``actions`` values.
    """
    model = Sequential()
    # Only the first layer declares the input shape; later layers infer theirs.
    model.add(Flatten(input_shape=(window_length,) + tuple(states)))
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

# Build the Q-network and print its layer / parameter summary.
model = build_model(states, actions)

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 17)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                432       
_________________________________________________________________
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 15)                375       
Total params: 1,407
Trainable params: 1,407
Non-trainable params: 0
_________________________________________________________________


In [6]:
class MaskingDQNAgent(DQNAgent):
    """DQNAgent whose Q-values exclude cards that are not in hand.

    An action is a card value; it is valid only when one of the first
    ``HAND_SIZE`` entries of the observation equals it. Invalid actions get a
    Q-value of ``-inf`` so the action-selection policy does not prefer them.

    NOTE(review): this override only affects callers of ``compute_q_values``.
    Confirm whether the training targets computed by ``DQNAgent`` also go
    through it; if they do not, unmasked invalid actions can still enter the
    bootstrapped maximum.
    """

    # Number of leading observation entries that hold the cards in hand.
    HAND_SIZE = 3

    def compute_q_values(self, state):
        """Return the model's Q-values for ``state`` with invalid actions set to -inf."""
        q_values = super().compute_q_values(state)
        # Additive mask: 0 leaves a valid action's Q-value untouched,
        # -inf rules an invalid action out.
        mask = np.array([0.0 if self._is_action_valid(state, action) else -np.inf
                         for action in range(self.nb_actions)])
        return q_values + mask

    def _is_action_valid(self, state, action):
        """True when ``action`` equals one of the cards in hand.

        ``state`` is the window of observations passed to ``compute_q_values``;
        only its first observation is inspected.
        """
        return any(card == action for card in state[0][:self.HAND_SIZE])

def build_agent(model, actions, memory_limit=50000, window_length=1,
                nb_steps_warmup=1000, target_model_update=1e-2):
    """Create the masking DQN agent with a Boltzmann exploration policy.

    Args:
        model: Q-network whose output has one value per action.
        actions: number of discrete actions.
        memory_limit: capacity of the replay memory.
        window_length: observations stacked per state; must match the
            model's input shape.
        nb_steps_warmup: forwarded to the agent as ``nb_steps_warmup``.
        target_model_update: forwarded to the agent as ``target_model_update``.

    Returns:
        An uncompiled ``MaskingDQNAgent``.
    """
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=memory_limit, window_length=window_length)
    return MaskingDQNAgent(model=model, memory=memory, policy=policy,
                           nb_actions=actions, nb_steps_warmup=nb_steps_warmup,
                           target_model_update=target_model_update)

dqn = build_agent(model, actions)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Original was 50K steps
# NOTE(review): in the recorded run of this cell the loss and mean_q grew at
# every interval and training stopped with "ValueError: probabilities contain
# NaN". The Q-values are diverging; stabilise training (e.g. bounded error /
# gradient clipping, and checking that invalid actions cannot enter the
# training targets) before relying on a run this long.
dqn.fit(env, nb_steps=1_000_000, visualize=False, verbose=1)


2023-11-23 15:55:30.148507: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-23 15:55:30.162679: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f910d2185a0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-11-23 15:55:30.162695: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


Training for 1000000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
198 episodes - episode_reward: -1.152 [-16.000, 1.000] - loss: 183038.838 - mae: 1504.938 - mean_q: 2015.875

Interval 2 (10000 steps performed)
197 episodes - episode_reward: -1.005 [-16.000, 1.000] - loss: 523983159296.000 - mae: 1957453.375 - mean_q: 2668181.250

Interval 3 (20000 steps performed)
197 episodes - episode_reward: -1.005 [-15.000, 1.000] - loss: 107973061902336.000 - mae: 36252508.000 - mean_q: 49008188.000

Interval 4 (30000 steps performed)
198 episodes - episode_reward: -1.162 [-15.000, 1.000] - loss: 1534724153540608.000 - mae: 148422464.000 - mean_q: 198731248.000

Interval 5 (40000 steps performed)
200 episodes - episode_reward: -1.260 [-16.000, 1.000] - loss: 9274128250961920.000 - mae: 372014208.000 - mean_q: 495839776.000

Interval 6 (50000 steps performed)
202 episodes - episode_rewa

ValueError: probabilities contain NaN

In [None]:
# Run the agent for 50 test episodes and report the mean episode reward.
scores = dqn.test(env, nb_episodes=50, visualize=False)
print("Average reward per episode: {}".format(np.mean(scores.history['episode_reward'])))


Testing for 50 episodes ...
Episode 1: reward: -10.000, steps: 45
Episode 2: reward: 1.000, steps: 56
Episode 3: reward: 1.000, steps: 51
Episode 4: reward: -1.000, steps: 40
Episode 5: reward: -1.000, steps: 56
Episode 6: reward: 1.000, steps: 54
Episode 7: reward: 1.000, steps: 59
Episode 8: reward: -1.000, steps: 51
Episode 9: reward: -1.000, steps: 53
Episode 10: reward: -1.000, steps: 54
Episode 11: reward: -1.000, steps: 56
Episode 12: reward: -1.000, steps: 51
Episode 13: reward: -1.000, steps: 46
Episode 14: reward: -1.000, steps: 55
Episode 15: reward: -1.000, steps: 43
Episode 16: reward: -1.000, steps: 46
Episode 17: reward: -1.000, steps: 57
Episode 18: reward: -7.000, steps: 54
Episode 19: reward: 1.000, steps: 60
Episode 20: reward: -1.000, steps: 44
Episode 21: reward: -1.000, steps: 38
Episode 22: reward: 1.000, steps: 52
Episode 23: reward: -1.000, steps: 53
Episode 24: reward: -1.000, steps: 56
Episode 25: reward: 1.000, steps: 49
Episode 26: reward: -1.000, steps: 48