In [122]:
import datetime as datetime
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from inner_model_settings import InnerModelSettings
from char_level_rnn_with_attention import OuterModel

# Load the TensorBoard notebook extension
%load_ext tensorboard

np.set_printoptions(precision=4)

path = "data\\combinations\\"
true_data = pd.read_csv(path+"governors_true_match.csv",sep=";")
false_data = pd.read_csv(path+"governors_false_match.csv",sep=";")
combined_data = pd.concat([true_data,false_data])
combined_data = combined_data.sample(frac=1,random_state=20210826)
names = sorted(set(list(combined_data.governor) + list(combined_data.combinations)))
words = sorted(set(word for name in list(map(str.split,names)) for word in name))
vocab = sorted(set(character for word in words for character in word))

governors_list = list(combined_data.governor)
combination_list = list(combined_data.combinations)
match = list(combined_data.match)

tk = Tokenizer(num_words=None, char_level=True, oov_token="UNK")
tk.fit_on_texts(governors_list+combination_list)

def preprocess_list(lst,tokenizer,max_len=None):
    return_seq = tokenizer.texts_to_sequences(lst)
    return np.array(pad_sequences(return_seq, maxlen=max_len,padding="post"),dtype="float32")

governor_seq = preprocess_list(governors_list,tk,30)
combination_seq = preprocess_list(combination_list,tk,30)
#features = zip(governor_seq,combination_seq)
match_seq = np.array(match)

#let's crate the training dataset and do the splits
data = tf.data.Dataset.from_tensor_slices(((governor_seq,combination_seq),match_seq)).shuffle(10).batch(1000)
train_ratio = .6
val_ratio = .2
test_ratio = .2

train_batches = int(len(data) * train_ratio)
val_batches = int(len(data) * val_ratio)
test_batches = int(len(data) * test_ratio)

train_data = data.take(train_batches)
test_data = data.skip(train_batches)
val_data = test_data.take(val_batches)
test_data = test_data.skip(test_batches)

settings = InnerModelSettings(
    input_embedding=129,
    n_embedding_dims = 512,
    n_gru = 20,
    n_dense = 40,
    n_units_attention=20
)

model = OuterModel(settings)

model.compile(
    loss= tf.losses.BinaryCrossentropy(), #  contrastive_loss#tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.fit(
    train_data,
    batch_size = 300,
    epochs = 20,
    validation_data = val_data,
    verbose=1,
    callbacks=[tensorboard_callback]
)


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
129
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x292867ef9a0>

In [100]:
model.summary()

Model: "outer_model_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inner_model_17 (InnerModel)  multiple                  727893    
_________________________________________________________________
distance_layer_17 (DistanceL multiple                  0         
_________________________________________________________________
output_layer (Dense)         multiple                  0 (unused)
Total params: 727,893
Trainable params: 727,813
Non-trainable params: 80
_________________________________________________________________


In [101]:
def compare_representations(input_a, input_b, model, debug=False):
    outer_model = model
    outer_model((input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b))))

    if debug:
        print(f"Representation of A: {outer_model.repr_a}")
        print(f"Representation of B: {outer_model.repr_b}")
        print(f"Similarity: {outer_model.cosine_similarity}")

    return outer_model.cosine_similarity, (outer_model.repr_a,outer_model.repr_a)

print(f"Comparing '{governors_list[0]}' and '{combination_list[0]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[0],
    model,
    True
)


Comparing 'john rettie mckernan' and 'john buchanan floyd jr.'
Representation of A: [[-1.3208  0.1307 -0.4465  1.1626 -3.148   0.8169  0.4244  0.612   3.259
  -0.5686  1.49   -0.3718 -0.6499  0.1009  0.0876  0.3508  0.2078  0.0964
  -2.7689 -0.6047 -0.3442  1.1286  0.5143  1.7568  0.5248 -0.3557  2.5756
  -1.7353  1.7695 -1.4955 -0.2446  1.2948  1.4299  1.9564 -0.3315  0.9405
   0.2129 -2.0651 -0.6799  0.6264]]
Representation of B: [[ 0.4076 -0.9757  0.7026 -0.7496  0.6603  0.8874  0.1077 -1.3964 -0.2076
  -0.751   0.4648 -1.6072 -1.0946  0.5787 -0.6976  2.2232  0.0885 -0.1202
  -0.5592 -0.1287  0.46    0.161  -1.0562 -0.6464 -0.3776 -0.0592 -0.9303
  -0.1258 -0.7951  2.3596  0.2762 -0.8175 -0.7944  0.8527 -0.9418  0.8725
  -0.7786 -0.9704 -0.1182  1.6667]]
Similarity: [0.4452]


In [102]:

print(f"Comparing '{governors_list[0]}' and '{combination_list[1]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[1],
    model,
    True
)


Comparing 'john rettie mckernan' and 'william prentice cooper'
Representation of A: [[-1.3208  0.1307 -0.4465  1.1626 -3.148   0.8169  0.4244  0.612   3.259
  -0.5686  1.49   -0.3718 -0.6499  0.1009  0.0876  0.3508  0.2078  0.0964
  -2.7689 -0.6047 -0.3442  1.1286  0.5143  1.7568  0.5248 -0.3557  2.5756
  -1.7353  1.7695 -1.4955 -0.2446  1.2948  1.4299  1.9564 -0.3315  0.9405
   0.2129 -2.0651 -0.6799  0.6264]]
Representation of B: [[ 1.3603 -1.4001  1.0532 -0.5196 -1.2627  1.4194 -0.2981 -1.8771 -0.503
  -0.6405 -0.1251 -0.3011 -0.0536 -1.149   0.3406  0.6015  0.9517  1.3658
  -0.8552  0.5402  0.4555 -0.0602  0.3811  1.4755 -0.5778 -0.5014 -1.9762
  -1.2807 -0.0241  0.0259  1.4159  1.0426  1.1341 -1.1876  0.4514  0.4223
  -1.0587 -1.3271  0.2407 -0.5279]]
Similarity: [0.5398]


In [10]:
def text_from_ids(ids,dict=tk.word_index):
    inv_dict = {v: k for k,v in tk.word_index.items()}
    char_list = []
    for id in ids:
        if id not in inv_dict:
            char = "_"
        else:
            char = inv_dict[id]
        char_list.append(char)

    return("".join(char_list))

print(text_from_ids(governor_seq[0]))
print(text_from_ids(combination_seq[1]))

print(governor_seq[0])
print(combination_seq[1])


a.p. lutali___________________
lutali a.p.___________________
[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 7. 20. 13.  4.  7.  9.  2.  4. 25. 23. 25.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [109]:
my_test = ["Ekaterina Sharkova","Cheikh Hadrami"]
my_test_seq = preprocess_list(my_test, tk)

print(f"Comparing '{my_test[0]}' and '{my_test[1]}'")
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model,
    True
)

Comparing 'Ekaterina Sharkova' and 'Cheikh Hadrami'
Representation of A: [[ 0.2232 -0.3767  0.5072 -0.3358  1.2884  0.4877  0.2491 -0.7119  0.4548
  -0.9594  1.3281 -1.0502  1.371  -0.682   1.7785  0.5327 -1.0162 -0.2418
  -1.6832 -0.3863 -0.1937  0.0553 -1.0644  0.4153  0.4825  0.5121  0.9026
  -1.9914 -0.0974 -0.6885 -0.4925  1.1071 -0.5015  0.8097 -1.1071 -0.021
  -1.0611 -0.7742 -0.387   0.0628]]
Representation of B: [[ 0.4911 -0.4785  0.8663 -0.4934 -0.2543  1.2706 -0.1465 -0.6643 -1.0524
  -0.1458 -0.312  -0.3207 -0.1141  0.279   0.539  -0.2365 -0.5454 -1.0229
  -0.9022  1.1737  0.5706  0.6029 -0.3093 -1.2241  0.8034  0.62    0.2492
   0.8321  0.4477 -0.0647  0.6715  0.019   0.1621  0.0676  0.3708 -0.8863
  -0.3696 -0.0912  0.4922 -0.141 ]]
Similarity: [0.5502]


In [111]:
def check_similarity(string_a, string_b, tokeniser, model, match=None, debug=False):
    input_seq = preprocess_list([string_a,string_b],tokeniser,30)
    input_seq = [x.reshape(1, len(x)) for x in input_seq]
    model((input_seq[0],input_seq[1]))

    if debug:
        print("********************************")
        print(f"Comparing: '{string_a}' and '{string_a}'")
        print(f"Predicted similarity = {model.cosine_similarity[0]}, true similarity = {match}")
    return np.round(model.cosine_similarity[0],4)

for i in range(50):
    check_similarity(governors_list[i],combination_list[i],tk, model,match[i],True)

# predictions = []
# for name_a, name_b, _match in zip(governors_list, combination_list, match):
#     predictions.append(check_similarity(name_a,name_b,tk,model,_match,False))

# print(predictions)


********************************
Comparing: 'john rettie mckernan' and 'john rettie mckernan'
Predicted similarity = 0.4451736807823181, true similarity = 0
********************************
Comparing: 'william j. janklow' and 'william j. janklow'
Predicted similarity = 0.48304665088653564, true similarity = 0
********************************
Comparing: 'william henry seward' and 'william henry seward'
Predicted similarity = 0.4278525412082672, true similarity = 0
********************************
Comparing: 'stevens thomson mason' and 'stevens thomson mason'
Predicted similarity = 0.9892182350158691, true similarity = 1
********************************
Comparing: 'william pinkney whyte' and 'william pinkney whyte'
Predicted similarity = 0.4097248315811157, true similarity = 0
********************************
Comparing: 'john larue helm' and 'john larue helm'
Predicted similarity = 0.4665018916130066, true similarity = 0
********************************
Comparing: 'william burton' and 'w

In [80]:
set((list(val_data.as_numpy_iterator())[0][-1]))

{0}

In [119]:
combined_data[combined_data.match==0].groupby(["combinations","governor"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,match
combinations,governor,Unnamed: 2_level_1
aaron ogden,aaron thomas bliss,1
aaron ogden,aaron venable brown,1
aaron ogden,david ogden watkins,1
aaron ogden,samuel aaron baker,1
aaron thomas bliss,aaron ogden,1
...,...,...
zenas perry moody,edward alysworth perry,1
zenas perry moody,madison starke perry,1
zenas perry moody,moody currier,1
zenas perry moody,oliver perry morton,1


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10,6))
plt.scatter(predictions,match)
plt.xlabel("Predicted similarity")
plt.ylabel("Actual similarity")
plt.show()
