In [8]:
import datetime as datetime
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from inner_model_settings import InnerModelSettings
from char_level_rnn_with_attention import OuterModel

# Load the TensorBoard notebook extension
%load_ext tensorboard

np.set_printoptions(precision=4)

path = "data\\combinations\\"
true_data = pd.read_csv(path+"governors_true_match.csv",sep=";")
false_data = pd.read_csv(path+"governors_false_match.csv",sep=";")
combined_data = pd.concat([true_data,false_data])
names = sorted(set(list(combined_data.governor) + list(combined_data.combinations)))
words = sorted(set(word for name in list(map(str.split,names)) for word in name))
vocab = sorted(set(character for word in words for character in word))

governors_list = list(combined_data.governor)
combination_list = list(combined_data.combinations)
match = list(combined_data.match)

tk = Tokenizer(num_words=None, char_level=True, oov_token="UNK")
tk.fit_on_texts(governors_list+combination_list)

def preprocess_list(lst,tokenizer,max_len=None):
    return_seq = tokenizer.texts_to_sequences(lst)
    return np.array(pad_sequences(return_seq, maxlen=max_len,padding="post"),dtype="float32")

governor_seq = preprocess_list(governors_list,tk,30)
combination_seq = preprocess_list(combination_list,tk,30)
#features = zip(governor_seq,combination_seq)
match_seq = np.array(match)

#let's crate the training dataset and do the splits
data = tf.data.Dataset.from_tensor_slices(((governor_seq,combination_seq),match_seq)).shuffle(buffer_size=1).batch(1000)
train_ratio = .6
val_ratio = .2
test_ratio = .2

train_batches = int(len(data) * train_ratio)
val_batches = int(len(data) * val_ratio)
test_batches = int(len(data) * test_ratio)

train_data = data.take(train_batches)
test_data = data.skip(train_batches)
val_data = test_data.take(val_batches)
test_data = test_data.skip(test_batches)

settings = InnerModelSettings(
    input_embedding=129,
    n_embedding_dims = 512,
    n_gru = 20,
    n_dense = 40,
    n_units_attention=10
)

model = OuterModel(settings)

model.compile(
    loss= tf.losses.BinaryCrossentropy(), #  contrastive_loss#tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(.0005),#optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.fit(
    train_data,
    batch_size = 5000,
    epochs = 10,
    validation_data = val_data,
    verbose=2
)


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
129
Epoch 1/20
184/184 - 38s - loss: 0.4130 - accuracy: 0.7644 - val_loss: 1.7619 - val_accuracy: 0.0127
Epoch 2/20
184/184 - 22s - loss: 0.3976 - accuracy: 0.7720 - val_loss: 1.2197 - val_accuracy: 0.1271
Epoch 3/20
184/184 - 22s - loss: 0.3886 - accuracy: 0.7745 - val_loss: 1.1493 - val_accuracy: 0.1649
Epoch 4/20
184/184 - 23s - loss: 0.3834 - accuracy: 0.7764 - val_loss: 1.0827 - val_accuracy: 0.2050
Epoch 5/20
184/184 - 24s - loss: 0.3793 - accuracy: 0.7777 - val_loss: 1.0017 - val_accuracy: 0.2660
Epoch 6/20
184/184 - 24s - loss: 0.3758 - accuracy: 0.7788 - val_loss: 0.9234 - val_accuracy: 0.3398
Epoch 7/20
184/184 - 23s - loss: 0.3724 - accuracy: 0.7795 - val_loss: 0.8981 - val_accuracy: 0.3682
Epoch 8/20
184/184 - 24s - loss: 0.3690 - accuracy: 0.7805 - val_loss: 1.0070 - val_accuracy: 0.2498
Epoch 9/20
184/184 - 24s - loss: 0.3667 - accuracy: 0.7821 - val_loss: 1.1797 - val_accuracy: 0.12

KeyboardInterrupt: 

In [2]:
model.summary()

Model: "outer_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inner_model (InnerModel)     multiple                  28351     
_________________________________________________________________
distance_layer (DistanceLaye multiple                  0         
_________________________________________________________________
output_layer (Dense)         multiple                  0 (unused)
Total params: 28,351
Trainable params: 28,351
Non-trainable params: 0
_________________________________________________________________


In [2]:
def compare_representations(input_a, input_b, model, debug=False):
    outer_model = model
    outer_model((input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b))))

    if debug:
        print(f"Representation of A: {outer_model.repr_a}")
        print(f"Representation of B: {outer_model.repr_b}")
        print(f"Similarity: {outer_model.cosine_similarity}")

    return outer_model.cosine_similarity, (outer_model.repr_a,outer_model.repr_a)

print(f"Comparing '{governors_list[0]}' and '{combination_list[0]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[0],
    model,
    True
)


Comparing 'a.p. lutali' and 'a.p. lutali'
Representation of A: [[ 0.2741 -0.2612  0.6495 -0.1202  0.2013  0.3611  0.2211 -0.4409  0.1153
  -0.0034 -0.0481  0.3694  0.2681  0.515  -0.0618  0.4761  0.1355 -0.2049
  -0.2096  0.4616 -0.291   0.2378 -0.6431  0.5671  0.3075  0.0215 -0.3837
   0.4026 -0.3267 -0.2087 -0.3812  0.0501 -0.1531 -0.6593 -0.0126 -0.032
   0.3341  0.3676  0.2514 -0.7199]]
Representation of B: [[ 0.2741 -0.2612  0.6495 -0.1202  0.2013  0.3611  0.2211 -0.4409  0.1153
  -0.0034 -0.0481  0.3694  0.2681  0.515  -0.0618  0.4761  0.1355 -0.2049
  -0.2096  0.4616 -0.291   0.2378 -0.6431  0.5671  0.3075  0.0215 -0.3837
   0.4026 -0.3267 -0.2087 -0.3812  0.0501 -0.1531 -0.6593 -0.0126 -0.032
   0.3341  0.3676  0.2514 -0.7199]]
Similarity: [1.]


In [3]:

print(f"Comparing '{governors_list[0]}' and '{combination_list[1]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[1],
    model,
    True
)


Comparing 'a.p. lutali' and 'lutali a.p.'
Representation of A: [[ 0.2741 -0.2612  0.6495 -0.1202  0.2013  0.3611  0.2211 -0.4409  0.1153
  -0.0034 -0.0481  0.3694  0.2681  0.515  -0.0618  0.4761  0.1355 -0.2049
  -0.2096  0.4616 -0.291   0.2378 -0.6431  0.5671  0.3075  0.0215 -0.3837
   0.4026 -0.3267 -0.2087 -0.3812  0.0501 -0.1531 -0.6593 -0.0126 -0.032
   0.3341  0.3676  0.2514 -0.7199]]
Representation of B: [[ 0.2951 -0.2549  0.6355 -0.1027  0.1612  0.332   0.2141 -0.3889  0.0854
   0.0196 -0.0124  0.3481  0.301   0.4881 -0.0296  0.5126  0.079  -0.2122
  -0.1641  0.4921 -0.2613  0.2155 -0.5727  0.554   0.269   0.0165 -0.3845
   0.3364 -0.3155 -0.2353 -0.348   0.0062 -0.1459 -0.5769  0.0658  0.0145
   0.3806  0.4311  0.2408 -0.6873]]
Similarity: [0.9973]


In [None]:
def text_from_ids(ids,dict=tk.word_index):
    inv_dict = {v: k for k,v in tk.word_index.items()}
    char_list = []
    for id in ids:
        if id not in inv_dict:
            char = "_"
        else:
            char = inv_dict[id]
        char_list.append(char)

    return("".join(char_list))

print(text_from_ids(governor_seq[0]))
print(text_from_ids(combination_seq[1]))

print(governor_seq[0])
print(combination_seq[1])


In [4]:
my_test = ["Gregory Sharkov","Stéphane Borderies"]
my_test_seq = preprocess_list(my_test, tk)

print(f"Comparing '{my_test[0]}' and '{my_test[1]}'")
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model,
    True
)

Comparing 'Gregory Sharkov' and 'Stéphane Borderies'
Representation of A: [[-0.1536  0.0799 -1.1383  0.2715 -0.9136 -0.4483  0.0166  0.3749  0.3229
  -0.3422  0.2881  0.0792  0.1198 -0.5286 -0.345  -0.2182 -0.4786  0.6196
   0.2056 -0.1858  0.0871 -0.491   0.5584 -0.1221 -0.1908 -0.4439  0.8004
  -0.2948  0.1721  0.315   0.0208  0.1995  0.2161 -0.0061  1.1665  0.0081
  -0.0053 -0.6013 -0.3415  1.0976]]
Representation of B: [[-0.5546 -0.384   0.7774  0.6529 -0.7038  0.3191  0.1358  0.1885  0.1437
   0.0901 -0.1822  0.3427 -0.2471 -0.3413 -0.5186 -0.1489 -0.3424  0.3619
   0.0165  0.5991 -0.0037  0.1953 -0.8871  0.3389  0.1168  0.436  -0.1708
  -0.0234  0.2109 -0.195   0.1955 -0.7718 -0.4131 -0.5639  0.3299  0.651
   0.7119  0.461  -0.4876 -0.4191]]
Similarity: [0.4462]


In [12]:
def check_similarity(input_a, input_b, model):
    return model(input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b)))

for i in range(100):
    pred_similarity = check_similarity(governor_seq[i],combination_seq[i],model)
    print("******************")
    print(f"Comparing '{governors_list[i]}' and '{combination_list[i]}'")
    print(f"Similarity: predicted={pred_similarity}, true={match[i]}")

InvalidArgumentError: slice index 1 of dimension 0 out of bounds. [Op:StridedSlice] name: outer_model_1/strided_slice/

In [11]:
for i in range(10):
    print(governor_seq[i])

[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.