In [1]:
from gym import Env
from gym.spaces import Discrete, Box

import numpy as np
import random

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

from py4j.java_gateway import JavaGateway


In [2]:
# Conecta no servidor de treinamento

gateway = JavaGateway()
minitruco_java = gateway.entry_point


In [3]:
# Env customizado do gym que encapsupla o servidor

class MinitrucoEnv(Env):
    def __init__(self):
        # Ações que podemos tomar: jogar a carta 0, a carta 1 ou a carta 2
        self.action_space = Discrete(3)

        # Estados possíveis
        # - posições: 1=inferior, 2=direita, 3=superior, 4=esquerda
        # - equipes: 1=posições 1 e 3; 2=posições 2 e 4
        # - cartas: podem valer -1 (null), 0 (fechada) ou um valor de 1
        #           a 14, conforme o valor relativo delas (cartas normais de 1
        #           a 10, manilhas de 11 a 14)
        # - rodadas: 1 a 3
        # - resultado da rodada: a equipe que venceu (1 ou 2), 3 para empate ou -1 para rodada não conlcuída
        # - booleanos (ex.: podeFechada) são 0 ou 1
        # TBD posJogadorPedindoAumento (acho que não zera depois do aumento)
        # TBD tento mineiro (talvez só varie as recompensas, mas é preciso especificar)
        # TBD baralho limpo (provavelmente só vamos excluir o range 1-4)
        # self.observation_space = Tuple((
        #     self._Discrete(4, start=1),    # posJogador
        #     self._Discrete(2),             # baralhoSujo
        #     self._Discrete(2),             # podeFechada
        #     self._Discrete(3, start=1),    # numRodadaAtual
        #     self._Discrete(5, start=-1),   # resultadoRodada1
        #     self._Discrete(5, start=-1),   # resultadoRodada2
        #     self._Discrete(12, start=1),   # valorMao
        #     self._Discrete(13, start=0),   # valorProximaAposta
        #     self._Discrete(5, start=0),    # posJogadorPedindoAumento
        #     self._Discrete(4, start=1),    # posJogadorQueAbriuRodada
        #     self._Discrete(24, start=0),   # pontosEquipe1
        #     self._Discrete(24, start=0),   # pontosEquipe2
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada1Pos1
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada1Pos2
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada1Pos3
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada1Pos4
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada2Pos1
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada2Pos2
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada2Pos3
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada2Pos4
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada3Pos1
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada3Pos2
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada3Pos3
        #     self._Discrete(16, start=-1),  # cartaJogadaRodada3Pos4
        #     self._Discrete(16, start=-1),  # carta1Jogador
        #     self._Discrete(16, start=-1),  # carta2Jogador
        #     self._Discrete(16, start=-1)   # carta3Jogador
        # ))

        # Vamos começar apenas com as suas cartas
        low = [-1, -1, -1]
        high = [14, 14, 14]

        self.observation_space = Box(low=np.array(low), high=np.array(high), dtype=np.float32)

        self.episodio = None
        self.state = None

    def _get_obs(self):
        values = self.episodio.estado().split()
        float_values = [float(val) for val in values]
        return np.array(float_values[24:27], dtype=np.float32)

    def step(self, action):
        action = int(action)
        last_observation = self.state
        cartaJogada = last_observation[action]
        if cartaJogada == -1:
            # Jogada inválida
            return last_observation, 0, False, {}

        self.episodio.executa(action)
        estado_str = self.episodio.estado()
        if estado_str == "EQUIPE 1 VENCEU":
            return last_observation, 1.0, True, {}
        if estado_str == "EQUIPE 2 VENCEU":
            return last_observation, -1.0, True, {}

        self.state = self._get_obs()

        reward = 0.0
        terminated = False

        return self.state, reward, terminated, {}

    def render(self):
        pass

    def reset(self, seed=None, options=None):
        if self.episodio is not None:
            self.episodio.finaliza()

#         super().reset(seed=seed) # required by check_env

        self.episodio = minitruco_java.novoEpisodio()
        self.state = self._get_obs()

        return self.state


env = MinitrucoEnv()




In [4]:
# Roda alguns episódios com política "jogue uma carta aleatoriamente"

import time

episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    terminated = False
    score = 0
    start = time.time()

    while not terminated:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{} Time:{}s'.format(episode, score, time.time() - start))


Episode:1 Score:-1.0 Time:0.5253078937530518s
Episode:2 Score:1.0 Time:0.48291897773742676s
Episode:3 Score:-1.0 Time:0.43973469734191895s
Episode:4 Score:-1.0 Time:0.4212837219238281s
Episode:5 Score:1.0 Time:0.4514920711517334s
Episode:6 Score:-1.0 Time:0.4573090076446533s
Episode:7 Score:-1.0 Time:0.3747367858886719s
Episode:8 Score:-1.0 Time:0.42905402183532715s
Episode:9 Score:-1.0 Time:0.47049903869628906s
Episode:10 Score:-1.0 Time:0.25701308250427246s


In [5]:
states = env.observation_space.shape
actions = env.action_space.n

def build_model(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states[0])))
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

model = build_model(states, actions)

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 3)                 0         
_________________________________________________________________
dense (Dense)                (None, 24)                96        
_________________________________________________________________
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 75        
Total params: 771
Trainable params: 771
Non-trainable params: 0
_________________________________________________________________


In [6]:
states[0]


3

In [8]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
# Original era 50K steps, vamos devagar
dqn.fit(env, nb_steps=500, visualize=False, verbose=2)


Training for 500 steps ...
[ 4.  8. 13.]
2
[ 4.  8. -1.]
0
[ 8. -1. -1.]
2
[ 8. -1. -1.]
2
[ 8. -1. -1.]
2
[ 8. -1. -1.]
0
[2. 1. 9.]
0
[ 1.  9. -1.]
0
[ 9. -1. -1.]
0
[ 3.  2. 12.]
0
[ 2. 12. -1.]
0
[9. 8. 4.]
1




[ 9.  4. -1.]
1
[ 5. 12. 10.]
0
[12. 10. -1.]
1
[12. -1. -1.]
2
[12. -1. -1.]
2
[12. -1. -1.]
2
[12. -1. -1.]
2
[12. -1. -1.]
2
[12. -1. -1.]
2
[12. -1. -1.]
1
[12. -1. -1.]
1
[12. -1. -1.]
2
[12. -1. -1.]
0
[12.  5.  2.]
0
[ 5.  2. -1.]
1
[ 5. -1. -1.]
0
[ 7.  1. 10.]
0
[ 1. 10. -1.]
2
[ 1. 10. -1.]
1
[ 1. -1. -1.]
1
[ 1. -1. -1.]
0
[ 6. 10.  9.]
2
[ 6. 10. -1.]
0
[10. -1. -1.]
2
[10. -1. -1.]
2
[10. -1. -1.]
2
[10. -1. -1.]
1
[10. -1. -1.]
2
[10. -1. -1.]
2
[10. -1. -1.]
2
[10. -1. -1.]
2
[10. -1. -1.]
1
[10. -1. -1.]
1
[10. -1. -1.]
0
[9. 5. 3.]
0
[ 5.  3. -1.]
0
[9. 3. 4.]
2
[ 9.  3. -1.]
0
[ 3. -1. -1.]
0
[9. 5. 5.]
0
[ 5.  5. -1.]
1
[ 5. -1. -1.]
0
[9. 4. 3.]
1
[ 9.  3. -1.]
1
[ 9. -1. -1.]
2
[ 9. -1. -1.]
2
[ 9. -1. -1.]
0
[13. 10.  9.]
2
[13. 10. -1.]
2
[13. 10. -1.]
2
[13. 10. -1.]
0
[10. -1. -1.]
2
[10. -1. -1.]
2
[10. -1. -1.]
0
[10. 10. 12.]
1
[10. 12. -1.]
0
[12. -1. -1.]
0
[ 1.  5. 12.]
1
[ 1. 12. -1.]
0
[1. 8. 8.]
0
[ 8.  8. -1.]
2
[ 8.  8. -1.]
2
[ 8.  8. -1.]
1
[ 8. -1

<tensorflow.python.keras.callbacks.History at 0x1217d9d10>

In [9]:
scores = dqn.test(env, nb_episodes=1, visualize=False)
print(np.mean(scores.history['episode_reward']))


Testing for 1 episodes ...
[4. 5. 7.]
0
[ 5.  7. -1.]
0
[4. 9. 9.]
0
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9.  9. -1.]
2
[ 9