In [1]:
import datetime as datetime
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from inner_model_settings import InnerModelSettings
from char_level_rnn_with_attention import OuterModel

# Load the TensorBoard notebook extension
%load_ext tensorboard

np.set_printoptions(precision=4)

path = "data\\combinations\\"
true_data = pd.read_csv(path+"governors_true_match.csv",sep=";")
false_data = pd.read_csv(path+"governors_false_match.csv",sep=";")
combined_data = pd.concat([true_data,false_data])
combined_data = combined_data.sample(frac=1,random_state=20210826)
names = sorted(set(list(combined_data.governor) + list(combined_data.combinations)))
words = sorted(set(word for name in list(map(str.split,names)) for word in name))
vocab = sorted(set(character for word in words for character in word))

governors_list = list(combined_data.governor)
combination_list = list(combined_data.combinations)
match = list(combined_data.match)

tk = Tokenizer(num_words=None, char_level=True, oov_token="UNK")
tk.fit_on_texts(governors_list+combination_list)

def preprocess_list(lst,tokenizer,max_len=None):
    return_seq = tokenizer.texts_to_sequences(lst)
    return np.array(pad_sequences(return_seq, maxlen=max_len,padding="post"),dtype="float32")

governor_seq = preprocess_list(governors_list,tk,30)
combination_seq = preprocess_list(combination_list,tk,30)
#features = zip(governor_seq,combination_seq)
match_seq = np.array(match)

#let's crate the training dataset and do the splits
data = tf.data.Dataset.from_tensor_slices(((governor_seq,combination_seq),match_seq)).shuffle(10).batch(1000)
train_ratio = .6
val_ratio = .2
test_ratio = .2

train_batches = int(len(data) * train_ratio)
val_batches = int(len(data) * val_ratio)
test_batches = int(len(data) * test_ratio)

train_data = data.take(train_batches)
test_data = data.skip(train_batches)
val_data = test_data.take(val_batches)
test_data = test_data.skip(test_batches)

settings = InnerModelSettings(
    input_embedding=129,
    n_embedding_dims = 512,
    n_gru = 20,
    n_dense = 40,
    n_units_attention=20
)

model = OuterModel(settings)

model.compile(
    loss= tf.losses.BinaryCrossentropy(), #  contrastive_loss#tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.fit(
    train_data,
    batch_size = 300,
    epochs = 20,
    validation_data = val_data,
    verbose=1,
    callbacks=[tensorboard_callback]
)


129
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1ae85446250>

In [2]:
model.summary()

Model: "outer_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inner_model (InnerModel)     multiple                  727893    
_________________________________________________________________
distance_layer (DistanceLaye multiple                  0         
_________________________________________________________________
output_layer (Dense)         multiple                  2         
Total params: 727,895
Trainable params: 727,815
Non-trainable params: 80
_________________________________________________________________


In [3]:
def compare_representations(input_a, input_b, model, debug=False):
    outer_model = model
    prediction = outer_model((input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b))))

    if debug:
        print(f"Representation of A: {outer_model.repr_a}")
        print(f"Representation of B: {outer_model.repr_b}")
        print(f"Similarity: {outer_model.cosine_similarity}")
        print(f"Prediction: {prediction}")


    return outer_model.cosine_similarity, (outer_model.repr_a,outer_model.repr_a)

print(f"Comparing '{governors_list[0]}' and '{combination_list[0]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[0],
    model,
    True
)


Comparing 'john rettie mckernan' and 'john buchanan floyd jr.'
Representation of A: [[ 0.8674 -6.1858  4.8685 -1.9147  1.6986  1.8509  1.8745 -1.0356 -0.1129
  -2.0776 -4.3491  1.0919 -0.6496 -0.3684  1.2532  1.2199  1.1634 -1.1604
   0.5337 -0.5276  0.1486 -1.9767  2.5101 -2.5289  0.5651 -1.8487  0.4152
   3.8067  1.7839 -1.9458 -0.4948 -1.1107 -1.1188 -0.4179  4.8243 -2.2412
  -1.6235  1.3415 -1.8619 -3.8958]]
Representation of B: [[ 3.3576 -0.7466  1.5876  3.8162  0.1318  0.1331 -0.2415 -0.6138  0.7842
   3.324  -0.268  -0.3962  0.5021 -1.6907 -0.5371 -0.9184  2.4868 -2.3214
   0.227   3.2165 -0.2596 -2.8404 -0.7695  0.8727 -2.0781 -0.3078  3.0882
  -1.6449  0.4179  3.5207 -0.1412  1.3492  0.6924  0.5781 -2.2259  3.1614
  -0.3006 -1.8464  2.4818 -3.4103]]
Similarity: [0.4449]
Prediction: [[0.2607]]


In [4]:

print(f"Comparing '{governors_list[0]}' and '{combination_list[1]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[1],
    model,
    True
)


Comparing 'john rettie mckernan' and 'william prentice cooper'
Representation of A: [[ 0.8674 -6.1858  4.8685 -1.9147  1.6986  1.8509  1.8745 -1.0356 -0.1129
  -2.0776 -4.3491  1.0919 -0.6496 -0.3684  1.2532  1.2199  1.1634 -1.1604
   0.5337 -0.5276  0.1486 -1.9767  2.5101 -2.5289  0.5651 -1.8487  0.4152
   3.8067  1.7839 -1.9458 -0.4948 -1.1107 -1.1188 -0.4179  4.8243 -2.2412
  -1.6235  1.3415 -1.8619 -3.8958]]
Representation of B: [[ 0.3364  4.5002 -0.8029  1.3502  1.7989  0.9568 -0.6449  1.2382  2.9128
   1.252  -1.4092 -4.1453  5.2748  1.8497  0.529   1.979  -0.7897  0.7117
   1.9742 -0.3601 -1.0188  1.6629 -2.446  -0.2847  1.7126 -4.3798  1.4753
  -6.2442 -1.7403  2.4    -2.1982 -1.3289  2.9443 -2.7094 -0.9545  0.4636
   2.2679  2.3682  1.4851  3.533 ]]
Similarity: [0.2982]
Prediction: [[0.1919]]


In [126]:
def text_from_ids(ids,dict=tk.word_index):
    inv_dict = {v: k for k,v in tk.word_index.items()}
    char_list = []
    for id in ids:
        if id not in inv_dict:
            char = "_"
        else:
            char = inv_dict[id]
        char_list.append(char)

    return("".join(char_list))

print(text_from_ids(governor_seq[0]))
print(text_from_ids(combination_seq[1]))

print(governor_seq[0])
print(combination_seq[1])


john rettie mckernan__________
william prentice cooper_______
[15.  8. 11.  5.  2.  6.  3. 13. 13.  9.  3.  2. 12. 16. 22.  3.  6.  5.
  4.  5.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[17.  9.  7.  7.  9.  4. 12.  2. 23.  6.  3.  5. 13.  9. 16.  3.  2. 16.
  8.  8. 23.  3.  6.  0.  0.  0.  0.  0.  0.  0.]


In [7]:
my_test = ["Bill Gates","William Gates"]
my_test_seq = preprocess_list(my_test, tk)

print(f"Comparing '{my_test[0]}' and '{my_test[1]}'")
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model,
    True
)

Comparing 'Bill Gates' and 'William Gates'
Representation of A: [[ 0.4596  2.4613 -0.5936  0.6842 -2.9203 -0.0601 -2.0932  1.1284  2.2899
   0.987  -0.4173 -0.3512 -0.1351 -2.347  -0.2694  0.6359  1.8277  0.217
   1.1724  0.0969  0.6374 -1.8048 -2.1177  1.9259 -1.3221 -0.7982 -1.9583
  -0.7548 -0.3978 -0.4425  0.5748  1.8815  0.0237 -0.2836 -1.8876  2.2975
   0.2424 -2.1712  0.4969  0.9343]]
Representation of B: [[ 1.0078  1.8658 -1.2906 -0.1399 -3.0878 -1.5324 -1.8868 -0.5068  0.5574
   1.0118 -0.0263 -0.2501 -1.2824 -1.0783  0.2446  0.1768  0.1873  0.8583
   1.1905  0.0886 -0.1217  0.1591 -1.1312  1.7443  0.5922  0.9161 -1.8421
   1.4797 -0.7518 -0.3403  2.0032  1.6788 -0.8301 -0.4548 -1.3197  1.8719
  -0.5245 -1.7326  0.7616  0.7881]]
Similarity: [0.8624]
Prediction: [[0.5203]]


In [13]:
def check_similarity(string_a, string_b, tokeniser, model, match=None, debug=False):
    input_seq = preprocess_list([string_a,string_b],tokeniser,30)
    input_seq = [x.reshape(1, len(x)) for x in input_seq]
    prediction = model((input_seq[0],input_seq[1]))[0][0]

    if debug:
        print("********************************")
        print(f"Comparing: '{string_a}' and '{string_a}'")
        print(f"Cosine similarity = {model.cosine_similarity[0]:.4f}, prediction={prediction:.4f} true similarity = {match}")
    return np.round(model.cosine_similarity[0],4)

for i in range(50):
    check_similarity(governors_list[i],combination_list[i],tk, model,match[i],True)

# predictions = []
# for name_a, name_b, _match in zip(governors_list, combination_list, match):
#     predictions.append(check_similarity(name_a,name_b,tk,model,_match,False))

# print(predictions)


********************************
Comparing: 'john rettie mckernan' and 'john rettie mckernan'
Cosine similarity = 0.4449, prediction=0.2607 true similarity = 0
********************************
Comparing: 'william j. janklow' and 'william j. janklow'
Cosine similarity = 0.4264, prediction=0.2511 true similarity = 0
********************************
Comparing: 'william henry seward' and 'william henry seward'
Cosine similarity = 0.4757, prediction=0.2769 true similarity = 0
********************************
Comparing: 'stevens thomson mason' and 'stevens thomson mason'
Cosine similarity = 0.9945, prediction=0.6076 true similarity = 1
********************************
Comparing: 'william pinkney whyte' and 'william pinkney whyte'
Cosine similarity = 0.4730, prediction=0.2755 true similarity = 0
********************************
Comparing: 'john larue helm' and 'john larue helm'
Cosine similarity = 0.3576, prediction=0.2180 true similarity = 0
********************************
Comparing: 'willi

In [80]:
set((list(val_data.as_numpy_iterator())[0][-1]))

{0}

In [119]:
combined_data[combined_data.match==0].groupby(["combinations","governor"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,match
combinations,governor,Unnamed: 2_level_1
aaron ogden,aaron thomas bliss,1
aaron ogden,aaron venable brown,1
aaron ogden,david ogden watkins,1
aaron ogden,samuel aaron baker,1
aaron thomas bliss,aaron ogden,1
...,...,...
zenas perry moody,edward alysworth perry,1
zenas perry moody,madison starke perry,1
zenas perry moody,moody currier,1
zenas perry moody,oliver perry morton,1


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10,6))
plt.scatter(predictions,match)
plt.xlabel("Predicted similarity")
plt.ylabel("Actual similarity")
plt.show()
