In [1]:
import gym
import gym_watten
from gym_watten.envs.watten_env import Color, Value
import numpy as np
from time import sleep
import random

In [2]:
env = gym.make("Watten-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: Environment '<class 'gym_watten.envs.watten_env.WattenEnv'>' has deprecated methods. Compatibility code invoked.[0m


# Search

In [3]:
def search(obs):
    global sample_outputs, sample_inputs, next_index
    state = env.get_state()
    n = len(env.players[env.current_player].hand_cards)   
    p = 0
    local_index = next_index
    next_index += 1
    sample_inputs[local_index] = np.array(obs)
    current_player = env.current_player
    
    for i in range(n):
        card_id = env.players[env.current_player].hand_cards[i].id
        obs, rew, is_done, _ = env.step(card_id)
        
        if is_done:
            sample_outputs[local_index][card_id] = (rew[0] > 0)
        else:
            sample_outputs[local_index][card_id] = ((1 - search(obs)) if current_player != env.current_player else search(obs))
        env.set_state(state)
        p += sample_outputs[local_index][card_id]
        
    return sample_outputs[local_index].max()

In [4]:
def reset_samples(iterations):
    global sample_outputs, sample_inputs,next_index
    sample_outputs = np.full((30000*iterations, 32), 0, dtype=float)
    sample_inputs = np.zeros((30000*iterations, 32 * 2 + 4), dtype=int)
    next_index = 0
    
def postprocess_samples():
    global sample_outputs, sample_inputs,next_index
    sample_inputs = sample_inputs[:next_index]
    sample_outputs = sample_outputs[:next_index]
    
    #exp = np.exp(sample_outputs)
    #exp[np.where(sample_outputs == -1)] = 0
    #sample_outputs = exp / np.expand_dims(np.sum(exp, axis=-1), -1)
    
def generate_training_data(iterations=1):
    global sample_outputs, sample_inputs,next_index
    reset_samples(iterations)
    
    for i in range(iterations):
        obs = env.reset()
        search(obs)    
        
    postprocess_samples()  
    return sample_inputs, sample_outputs

# Play

In [108]:
def match(agents, render=False):
    obs, is_done = env.reset(), False    
    
    while not is_done:
        prediction = agents[env.current_player].predict(np.expand_dims(obs, 0))[0]
        prediction_valid = np.ma.masked_where(obs[:32] == 0, prediction)
        obs, rew, is_done, _ = env.step(np.argmax(prediction_valid))
        
        if render:
            env.render('human')
            sleep(1)
        
    return env.current_player

def compare(agent1, agent2):
    agents = [agent1, agent2]
    first_player_wins = 0
    
    for i in range(1000):
        start_player = random.randint(0, 1)
        winner = match([agents[start_player], agents[1 - start_player]])
        first_player_wins += ((winner == 0) == (start_player == 0))
    
    return first_player_wins / 1000      

In [118]:
match([model, model], True)

1

In [119]:
compare(model, model)

0.498

# Train

In [52]:
from keras.models import Sequential, clone_model
from keras.layers import Dense, Activation
import keras.backend as K
from IPython.display import clear_output
from keras import optimizers
import math

In [113]:
# For a single-input model with 2 classes (binary classification):
def mean_pred(y_true, y_pred):
    return K.mean(K.equal(K.gather(y_true, K.argmax(y_true, axis=-1)), K.gather(y_true, K.argmax(y_pred, axis=-1))))

def acc(y_true, y_pred):
    return K.mean(K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)))

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=68))
model.add(Dense(256, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(32, activation='sigmoid'))
adam = optimizers.Adam()
model.compile(optimizer=adam,
              loss='mean_squared_error',
              metrics=['accuracy', acc])


In [114]:
for i in range(10):
    prev_state = clone_model(model)
    generate_training_data(100)    
    p = np.random.permutation(len(sample_inputs))
    sample_inputs, sample_outputs = sample_inputs[p], sample_outputs[p]
    
    # Train the model, iterating on the data in batches of 32 samples
    model.fit(sample_inputs, sample_outputs, epochs=1, batch_size=64)
    
    print("New model wins " + str(compare(model, prev_state) * 100) + "%!")

Epoch 1/1
New model wins 71.89999999999999%!
Epoch 1/1
New model wins 72.3%!
Epoch 1/1
New model wins 68.0%!
Epoch 1/1
New model wins 71.3%!
Epoch 1/1
New model wins 71.2%!
Epoch 1/1
New model wins 65.10000000000001%!
Epoch 1/1
New model wins 70.5%!
Epoch 1/1
New model wins 72.3%!
Epoch 1/1
New model wins 72.39999999999999%!
Epoch 1/1
New model wins 70.8%!


In [115]:
model.save('model.h5')

In [7]:
model.load_weights('model.h5')

# Eval

In [None]:
generate_training_data(100)    
p = np.random.permutation(len(sample_inputs))
sample_inputs, sample_outputs = sample_inputs[p], sample_outputs[p]

In [310]:
model.evaluate(sample_inputs, sample_outputs)



[0.004186552935140255, 0.3855330858996649, 0.3855330858996649]

# Predict

In [8]:
def generate_input(hand_cards, table_card=None, own_tricks=0, opponent_tricks=0):
    obs = np.zeros((len(env.cards) * 2 + 4,))
    
    for card in hand_cards:
        obs[card.id] = 1
        
    if table_card is not None:
        obs[32 + table_card.id] = 1

    obs[-4] = (own_tricks == 1 or own_tricks == 3)
    obs[-3] = (own_tricks == 2 or own_tricks == 3)

    obs[-2] = (opponent_tricks == 1 or opponent_tricks == 3)
    obs[-1] = (opponent_tricks == 2 or opponent_tricks == 3)
    
    return obs

In [9]:
def card(color, value):
    for card in env.cards:
        if card.value == value and card.color == color:
            return card
        
    return None

In [10]:
def calc_correct_output_sample(hand_cards, table_card=None, own_tricks=0, opponent_tricks=0):
    global sample_outputs
    env.reset()
    env.cards_left = env.cards[:]
    random.shuffle(env.cards_left)
    
    for card in hand_cards:
        env.cards_left.remove(card)
    env.players[0].hand_cards = hand_cards[:]
        
    env.players[1].hand_cards = []
    for i in range(len(hand_cards) - (1 if table_card is not None else 0)):
        env.players[1].hand_cards.append(env.cards_left.pop())       
    
    env.players[0].tricks = own_tricks
    env.players[1].tricks = opponent_tricks
    env.table_card = table_card
    
    #obs, rew, is_done, _ = env.step(hand_cards[0].id)
   # print(rew, is_done)

    env.render('human')
    reset_samples(1)
    search(env.regenerate_obs())
    postprocess_samples()
        
    return sample_outputs[0]

In [11]:
def calc_correct_output(hand_cards, table_card=None, own_tricks=0, opponent_tricks=0):    
    correct_output = None
    for i in range(100):
        sample_outputs = calc_correct_output_sample(hand_cards, table_card, own_tricks, opponent_tricks)
        if correct_output is None:
            correct_output = sample_outputs
        else:
            correct_output += sample_outputs
    return correct_output / 100

In [12]:
def predict(hand_cards, table_card=None, own_tricks=0, opponent_tricks=0):
    input = generate_input(hand_cards, table_card, own_tricks, opponent_tricks)
    output = model.predict(np.expand_dims(input, 0))[0]
    correct_output = calc_correct_output(hand_cards, table_card, own_tricks, opponent_tricks)
    print(output)
    for card in hand_cards:
        print(str(card.color) + " " + str(card.value) + " => " + str(output[card.id]) + " (" + str(correct_output[card.id]) + ")")

In [13]:
predict([
    card(Color.EICHEL, Value.ACHT), 
    card(Color.EICHEL, Value.KOENIG),
    card(Color.HERZ, Value.SAU)
    ], None, 0, 2)


[5.75959403e-03 1.43175423e-01 5.59249427e-03 6.49866939e-04
 2.35658512e-02 9.19388514e-03 2.35199124e-01 1.05343026e-03
 1.97753645e-02 1.78710464e-03 1.46320984e-02 2.28701229e-03
 5.81715394e-05 3.54093849e-03 1.10634025e-02 4.99059772e-03
 2.45558605e-01 5.44387614e-03 9.99814365e-03 1.42739655e-03
 3.20930092e-04 2.36787274e-03 1.98115688e-03 9.53817507e-04
 9.92957852e-04 6.92562433e-04 3.14851701e-02 9.90882167e-04
 2.34370306e-03 7.74050690e-03 1.19732879e-02 4.65586130e-03]
Color.EICHEL Value.ACHT => 0.23519912 (0.38)
Color.EICHEL Value.KOENIG => 0.14317542 (0.38)
Color.HERZ Value.SAU => 0.2455586 (0.38)


In [14]:
predict([
    card(Color.EICHEL, Value.SAU), 
    card(Color.EICHEL, Value.KOENIG),
    card(Color.HERZ, Value.SAU),
    card(Color.HERZ, Value.ACHT)
    ],card(Color.EICHEL, Value.ZEHN), 1, 0)

[9.9907184e-01 9.8913771e-01 7.2538495e-09 3.3113597e-08 1.5161502e-05
 7.3705126e-09 5.1551261e-08 6.5132488e-05 9.8717465e-11 1.3444364e-05
 7.2913321e-07 2.0023890e-05 3.7138326e-09 4.8235627e-08 3.2651492e-08
 5.0104722e-05 1.1817461e-04 6.5148895e-05 6.6525336e-09 6.1969800e-08
 1.4236014e-05 2.0350743e-09 2.5213609e-02 6.2516587e-09 1.0997270e-06
 5.9662301e-07 2.6357413e-10 8.3598701e-09 1.4469356e-06 9.2596401e-06
 1.2046009e-10 4.0401463e-04]
Color.EICHEL Value.SAU => 0.99907184 (0.84)
Color.EICHEL Value.KOENIG => 0.9891377 (0.84)
Color.HERZ Value.SAU => 0.00011817461 (0.11)
Color.HERZ Value.ACHT => 0.025213609 (0.26)
