In [1]:
import tensorflow as tf

from matchers import constant, dataset, metrics, utils

In [2]:
# Number of character positions (time steps) per encoded name. It is passed to
# utils.convert_names_to_ids and is also the sequence length of the model's
# input layer and of the decoder's RepeatVector.
# NOTE(review): presumably longer names are truncated and shorter ones padded
# to this length inside utils.convert_names_to_ids -- confirm.
MAX_NAME_LENGTH = 30

### Load data

In [3]:
# Three collections are returned and used as follows in this notebook:
#   input_names    - query names that are embedded and matched in the evaluation
#   relevant_names - ground truth handed to metrics.mean_avg_precision_k
#   all_candidates - names the autoencoder is trained on and that form the
#                    pool from which candidates are retrieved
input_names, relevant_names, all_candidates = dataset.load_preprocess()

### Build token index mappings

In [4]:
# char_to_idx_map is what utils.convert_names_to_ids uses below to turn names
# into id sequences; idx_to_char_map is not used by any visible cell of this
# notebook (presumably the inverse lookup -- confirm in utils).
char_to_idx_map, idx_to_char_map = utils.build_token_idx_maps()

### Convert names to ids

In [5]:
# Encode every candidate name as an id sequence of MAX_NAME_LENGTH positions
# and one-hot it in a single expression, so X_all_candidates is only ever
# bound to the final one-hot tensor.
# NOTE(review): the "+ 1" on VOCAB_SIZE presumably reserves a slot for the
# padding id -- confirm in utils/constant.
X_all_candidates = utils.one_hot_encode(
    utils.convert_names_to_ids(all_candidates, char_to_idx_map, MAX_NAME_LENGTH),
    constant.VOCAB_SIZE + 1,
)

### Model

In [6]:
hidden_dim = 100

# Fix TensorFlow's global seed before any layer is created so that weight
# initialisation (and TF-driven shuffling during training) starts from the
# same state on every run of this cell. Without it each run trains a different
# model and the scores reported further down cannot be reproduced.
# NOTE: some GPU kernels are non-deterministic, so results may still vary
# slightly across hardware.
SEED = 42
tf.random.set_seed(SEED)

# Encoder
# Input: one one-hot vector of width VOCAB_SIZE + 1 for each of the
# MAX_NAME_LENGTH positions of a name.
encoder_input = tf.keras.layers.Input(shape=(MAX_NAME_LENGTH, constant.VOCAB_SIZE + 1))
# return_sequences=False keeps only the final state of each direction, so the
# whole name is summarised by a single vector of size 2 * hidden_dim. This
# vector is the name embedding used for matching after training.
encoder_output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_dim,
                                                                    return_sequences=False),
                                               name='encoder')(encoder_input)
# Decoder
# Feed the same embedding to the decoder at every time step ...
h = tf.keras.layers.RepeatVector(MAX_NAME_LENGTH)(encoder_output)
# ... let an LSTM unroll it back into a sequence of hidden states ...
h = tf.keras.layers.LSTM(hidden_dim, return_sequences=True)(h)
# ... and map every hidden state to a probability distribution over tokens.
decoder_output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(constant.VOCAB_SIZE + 1,
                                                                       activation='softmax'))(h)

# Model
# Trained to reconstruct its own one-hot input, hence categorical cross-entropy
# between the input and the per-position softmax output.
autoencoder = tf.keras.models.Model(encoder_input, decoder_output)
autoencoder.compile(loss='categorical_crossentropy', optimizer='adam')

# Model for just the encoder
# Used after the autoencoder is fully trained
# It shares its layers (and therefore its weights) with `autoencoder`, so it
# needs no training of its own.
encoder_model = tf.keras.models.Model(inputs=autoencoder.inputs,
                                      outputs=autoencoder.get_layer('encoder').output)

In [7]:
# Print each layer's output shape and parameter count as a sanity check of the
# architecture built above.
autoencoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30, 29)]          0         
_________________________________________________________________
encoder (Bidirectional)      (None, 200)               104000    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 30, 200)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 100)           120400    
_________________________________________________________________
time_distributed (TimeDistri (None, 30, 29)            2929      
Total params: 227,329
Trainable params: 227,329
Non-trainable params: 0
_________________________________________________________________


In [8]:
# An autoencoder learns to reproduce its own input, so the one-hot candidate
# names serve as both the features and the reconstruction targets.
autoencoder.fit(x=X_all_candidates,
                y=X_all_candidates,
                batch_size=512,
                epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1675c3610>

### Evaluation

In [9]:
# Encode the query names exactly like the candidates were encoded for
# training: id sequences of MAX_NAME_LENGTH positions, then one-hot vectors of
# width VOCAB_SIZE + 1. Done in one expression so X_input_names is only ever
# bound to the final one-hot tensor.
X_input_names = utils.one_hot_encode(
    utils.convert_names_to_ids(input_names, char_to_idx_map, MAX_NAME_LENGTH),
    constant.VOCAB_SIZE + 1,
)

In [10]:
# Run both sets of names through the encoder half of the trained autoencoder.
# Each name becomes one fixed-size vector (the output of the 'encoder' layer).
X_input_names_encoded = encoder_model.predict(x=X_input_names)
X_candidates_encoded = encoder_model.predict(x=X_all_candidates)

In [11]:
# For every encoded input name, retrieve 10 names from all_candidates by
# comparing its embedding with the candidate embeddings.
# The demo at the end of the notebook shows the result layout: an object array
# indexed as [input, rank, (name, score)], with scores in descending order.
# NOTE(review): the score looks like a similarity where higher is closer
# (presumably cosine) -- confirm in utils.get_candidates_batch.
candidates = utils.get_candidates_batch(X_input_names_encoded, 
                                        X_candidates_encoded, 
                                        all_candidates,
                                        num_candidates=10)

In [12]:
# Each retrieved entry is a [name, score] pair (see the demo output at the end
# of the notebook); index 0 on the last axis keeps the names and drops the
# scores, which the metric calls below do not take.
candidate_names = candidates[:, :, 0]

### mAP @ 1

In [13]:
# Score the retrieved names against the ground-truth relevant names with a
# cutoff of k=1.
# NOTE(review): the exact averaging/normalisation is defined in
# metrics.mean_avg_precision_k and is not visible here.
metrics.mean_avg_precision_k(relevant_names, candidate_names, 1)

0.5117225590367576

### mAP @ 3

In [14]:
# Same metric as for mAP @ 1, with the cutoff raised to k=3.
# NOTE(review): the exact averaging/normalisation is defined in
# metrics.mean_avg_precision_k and is not visible here.
metrics.mean_avg_precision_k(relevant_names, candidate_names, 3)

0.4140840293381946

### Demo

In [15]:
# Push a single hand-picked name through the same pipeline as the evaluation
# set: ids -> one-hot -> encoder embedding.
test_name = ['<schumacher>']
test_name_sequence = utils.convert_names_to_ids(
    test_name, char_to_idx_map, MAX_NAME_LENGTH
)
test_name_one_hot = utils.one_hot_encode(
    test_name_sequence, constant.VOCAB_SIZE + 1
)
test_name_embedding = encoder_model.predict(x=test_name_one_hot)

# Left as the last expression so the notebook displays the ten retrieved
# [name, score] pairs for the test name.
utils.get_candidates_batch(
    test_name_embedding,
    X_candidates_encoded,
    all_candidates,
    num_candidates=10,
)

array([[['<schumacker>', 0.9707528352737427],
        ['<stelmacher>', 0.9584547877311707],
        ['<schumaker>', 0.9551969766616821],
        ['<schurhamer>', 0.9541249871253967],
        ['<stillmacher>', 0.9501252770423889],
        ['<schacher>', 0.9501103162765503],
        ['<schmelcher>', 0.9477940797805786],
        ['<schumpert>', 0.9368616342544556],
        ['<schmicker>', 0.9283653497695923],
        ['<standacher>', 0.9281593561172485]]], dtype=object)