In [3]:
import datetime as datetime
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from inner_model_settings import InnerModelSettings
from char_level_rnn_with_attention import OuterModel

# Load the TensorBoard notebook extension
%load_ext tensorboard

np.set_printoptions(precision=4)

path = "data\\combinations\\"
true_data = pd.read_csv(path+"governors_true_match.csv",sep=";")
false_data = pd.read_csv(path+"governors_false_match.csv",sep=";")
combined_data = pd.concat([true_data,false_data])
names = sorted(set(list(combined_data.governor) + list(combined_data.combinations)))
words = sorted(set(word for name in list(map(str.split,names)) for word in name))
vocab = sorted(set(character for word in words for character in word))

governors_list = list(combined_data.governor)
combination_list = list(combined_data.combinations)
match = list(combined_data.match)

tk = Tokenizer(num_words=None, char_level=True, oov_token="UNK")
tk.fit_on_texts(governors_list+combination_list)

def preprocess_list(lst,tokenizer,max_len=None):
    return_seq = tokenizer.texts_to_sequences(lst)
    return np.array(pad_sequences(return_seq, maxlen=max_len,padding="post"),dtype="float32")

governor_seq = preprocess_list(governors_list,tk,30)
combination_seq = preprocess_list(combination_list,tk,30)
#features = zip(governor_seq,combination_seq)
match_seq = np.array(match)

#let's crate the training dataset and do the splits
data = tf.data.Dataset.from_tensor_slices(((governor_seq,combination_seq),match_seq)).shuffle(buffer_size=1).batch(1000)
train_ratio = .6
val_ratio = .2
test_ratio = .2

train_batches = int(len(data) * train_ratio)
val_batches = int(len(data) * val_ratio)
test_batches = int(len(data) * test_ratio)

train_data = data.take(train_batches)
test_data = data.skip(train_batches)
val_data = test_data.take(val_batches)
test_data = test_data.skip(test_batches)

settings = InnerModelSettings(
    input_embedding=129,
    n_embedding_dims = 512,
    n_gru = 20,
    n_dense = 40,
    n_units_attention=20
)

model = OuterModel(settings)

model.compile(
    loss= tf.losses.BinaryCrossentropy(), #  contrastive_loss#tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(.0005),#optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.fit(
    train_data,
    batch_size = 5000,
    epochs = 15,
    validation_data = val_data,
    verbose=2,
    callbacks=[tensorboard_callback]
)


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
129
Epoch 1/15
184/184 - 65s - loss: 0.4340 - accuracy: 0.7632 - val_loss: 1.4162 - val_accuracy: 0.0641
Epoch 2/15
184/184 - 38s - loss: 0.4071 - accuracy: 0.7714 - val_loss: 1.1890 - val_accuracy: 0.1494
Epoch 3/15
184/184 - 38s - loss: 0.3954 - accuracy: 0.7719 - val_loss: 1.1080 - val_accuracy: 0.2033
Epoch 4/15
184/184 - 38s - loss: 0.3895 - accuracy: 0.7743 - val_loss: 0.8815 - val_accuracy: 0.3926
Epoch 5/15
184/184 - 38s - loss: 0.3826 - accuracy: 0.7771 - val_loss: 0.8536 - val_accuracy: 0.4214
Epoch 6/15
184/184 - 38s - loss: 0.3766 - accuracy: 0.7797 - val_loss: 0.8383 - val_accuracy: 0.4384
Epoch 7/15
184/184 - 39s - loss: 0.3719 - accuracy: 0.7801 - val_loss: 0.8358 - val_accuracy: 0.4378
Epoch 8/15
184/184 - 38s - loss: 0.3681 - accuracy: 0.7824 - val_loss: 0.8525 - val_accuracy: 0.4152
Epoch 9/15
184/184 - 38s - loss: 0.3652 - accuracy: 0.7839 - val_loss: 0.8512 - val_accuracy: 0.41

<tensorflow.python.keras.callbacks.History at 0x29245f7be50>

In [4]:
model.summary()

Model: "outer_model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inner_model_2 (InnerModel)   multiple                  727893    
_________________________________________________________________
distance_layer_2 (DistanceLa multiple                  0         
_________________________________________________________________
output_layer (Dense)         multiple                  0 (unused)
Total params: 727,893
Trainable params: 727,813
Non-trainable params: 80
_________________________________________________________________


In [36]:
def compare_representations(input_a, input_b, model, debug=False):
    outer_model = model
    outer_model((input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b))))

    if debug:
        print(f"Representation of A: {outer_model.repr_a}")
        print(f"Representation of B: {outer_model.repr_b}")
        print(f"Similarity: {outer_model.cosine_similarity}")

    return outer_model.cosine_similarity, (outer_model.repr_a,outer_model.repr_a)

print(f"Comparing '{governors_list[0]}' and '{combination_list[0]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[0],
    model,
    True
)


Comparing 'a.p. lutali' and 'a.p. lutali'
[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
Representation of A: [[ 0.404   2.0065  2.9575 -0.1137 -1.0178  0.5598  0.3647 -1.7628  1.1018
   3.4374 -1.4547  1.1885  0.5285 -0.6939  1.3511 -1.2471  2.2164  0.7676
  -1.4943  0.7016  0.1359  1.3568 -2.3191 -2.0823  2.3083  0.0361  1.0941
   1.0824  0.7116 -0.2978 -0.9097 -0.2465 -0.1202  2.3228  0.0617  1.3254
  -0.2052 -0.8126  0.4394 -0.4793]]
Representation of B: [[ 0.404   2.0065  2.9575 -0.1137 -1.0178  0.5598  0.3647 -1.7628  1.1018
   3.4374 -1.4547  1.1885  0.5285 -0.6939  1.3511 -1.2471  2.2164  0.7676
  -1.4943  0.7016  0.1359  1.3568 -2.3191 -2.0823  2.3083  0.0361  1.0941
   1.0824  0.7116 -0.2978 -0.9097 -0.2465 -0.1202  2.3228  0.0617  1.3254
  -0.2052 -0.8126  0.4394 -0.4793]]
Simila

In [6]:

print(f"Comparing '{governors_list[0]}' and '{combination_list[1]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[1],
    model,
    True
)


Comparing 'a.p. lutali' and 'lutali a.p.'
Representation of A: [[ 0.404   2.0065  2.9575 -0.1137 -1.0178  0.5598  0.3647 -1.7628  1.1018
   3.4374 -1.4547  1.1885  0.5285 -0.6939  1.3511 -1.2471  2.2164  0.7676
  -1.4943  0.7016  0.1359  1.3568 -2.3191 -2.0823  2.3083  0.0361  1.0941
   1.0824  0.7116 -0.2978 -0.9097 -0.2465 -0.1202  2.3228  0.0617  1.3254
  -0.2052 -0.8126  0.4394 -0.4793]]
Representation of B: [[ 0.4329  1.9469  2.4533  0.2115 -0.8506  0.1278  0.28   -1.5503  1.0698
   2.2854 -1.0248  0.7821  0.4384 -0.8719  0.8447 -0.787   2.5411  0.5527
  -1.4668  1.2208 -0.0376  1.5357 -1.6888 -1.5673  1.6692 -0.1066  0.6947
   0.8363  0.3627 -0.777  -1.4921 -0.5233  0.0381  2.0118  0.3562  1.0246
  -0.3303 -0.9044  0.5201 -0.0217]]
Similarity: [0.9832]


In [10]:
def text_from_ids(ids,dict=tk.word_index):
    inv_dict = {v: k for k,v in tk.word_index.items()}
    char_list = []
    for id in ids:
        if id not in inv_dict:
            char = "_"
        else:
            char = inv_dict[id]
        char_list.append(char)

    return("".join(char_list))

print(text_from_ids(governor_seq[0]))
print(text_from_ids(combination_seq[1]))

print(governor_seq[0])
print(combination_seq[1])


a.p. lutali___________________
lutali a.p.___________________
[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 7. 20. 13.  4.  7.  9.  2.  4. 25. 23. 25.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [31]:
my_test = ["Ekaterina Sharkova","Ekaterina Charkova"]
my_test_seq = preprocess_list(my_test, tk)

print(f"Comparing '{my_test[0]}' and '{my_test[1]}'")
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model,
    True
)

Comparing 'Ekaterina Sharkova' and 'Ekaterina Charkova'
Representation of A: [[ 0.636   0.4515 -4.0612  2.2748  3.122  -2.1517 -0.3355  0.0622  0.5851
  -0.7621  1.2962  0.9377 -0.7083 -1.3391 -1.9118  2.1137  1.353  -0.0135
   0.7485 -0.5312 -0.3957  1.9793  0.6706 -0.5138 -0.5816  0.1241  0.8867
  -1.2174 -0.9282 -0.7831  0.9341 -0.3393  2.7399  0.7535 -0.0053  1.434
   0.0992 -1.1377 -1.5032  1.2273]]
Representation of B: [[ 0.0718  0.3868 -3.5062  1.8548  3.5418 -1.91    0.4717  1.1073  0.427
   1.5685  1.2931  2.3809 -0.6956 -2.8216 -0.9996  1.1498  0.3402  0.3267
   0.8637 -1.9577 -0.2696  1.545   1.1492 -0.8378  0.1516  0.8528  1.5849
  -1.0214 -1.8168  0.1694  1.1356  0.088   1.6198  0.7834 -1.4903  2.3448
   1.1858  0.1368 -2.0357 -0.6002]]
Similarity: [0.9048]


In [43]:
def check_similarity(input_a, input_b, model):
    input_a = np.array(input_a).reshape(1,len(input_a))
    input_b = np.array(input_b).reshape(1,len(input_b))
    model(input_a,input_b)

    return model.cosine_similarity

for i in range(100):
    pred_similarity = check_similarity(governor_seq[i],combination_seq[i],model)
    print("******************")
    print(f"Comparing '{governors_list[i]}' and '{combination_list[i]}'")
    print(f"Similarity: predicted={pred_similarity}, true={match[i]}")

[[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


InvalidArgumentError: slice index 1 of dimension 0 out of bounds. [Op:StridedSlice] name: outer_model_2/strided_slice/

In [11]:
for i in range(10):
    print(governor_seq[i])

[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.  5.  2.  8. 19. 14.  3.  5.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 4.  4.  6.  8.