In [1]:
import datetime as datetime
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from inner_model_settings import InnerModelSettings
from char_level_rnn_with_attention import OuterModel

# Load the TensorBoard notebook extension
%load_ext tensorboard

np.set_printoptions(precision=4)

path = "data\\combinations\\"
true_data = pd.read_csv(path+"governors_true_match.csv",sep=";")
false_data = pd.read_csv(path+"governors_false_match.csv",sep=";")
combined_data = pd.concat([true_data,false_data])
combined_data = combined_data.sample(frac=1,random_state=20210826)
names = sorted(set(list(combined_data.governor) + list(combined_data.combinations)))
words = sorted(set(word for name in list(map(str.split,names)) for word in name))
vocab = sorted(set(character for word in words for character in word))

governors_list = list(combined_data.governor)
combination_list = list(combined_data.combinations)
match = list(combined_data.match)

tk = Tokenizer(num_words=None, char_level=True, oov_token="UNK")
tk.fit_on_texts(governors_list+combination_list)

def preprocess_list(lst,tokenizer,max_len=None):
    return_seq = tokenizer.texts_to_sequences(lst)
    return np.array(pad_sequences(return_seq, maxlen=max_len,padding="post"),dtype="float32")

governor_seq = preprocess_list(governors_list,tk,30)
combination_seq = preprocess_list(combination_list,tk,30)
#features = zip(governor_seq,combination_seq)
match_seq = np.array(match)

#let's crate the training dataset and do the splits
data = tf.data.Dataset.from_tensor_slices(((governor_seq,combination_seq),match_seq)).shuffle(10).batch(1000)
train_ratio = .6
val_ratio = .2
test_ratio = .2

train_batches = int(len(data) * train_ratio)
val_batches = int(len(data) * val_ratio)
test_batches = int(len(data) * test_ratio)

train_data = data.take(train_batches)
test_data = data.skip(train_batches)
val_data = test_data.take(val_batches)
test_data = test_data.skip(test_batches)

settings = InnerModelSettings(
    input_embedding=129,
    n_embedding_dims = 256,
    n_gru = 20,
    n_dense = 40,
    n_units_attention=20
)

model = OuterModel(settings)

model.compile(
    loss= tf.losses.BinaryCrossentropy(), #  contrastive_loss#tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.fit(
    train_data,
    batch_size = 200,
    epochs = 15,
    validation_data = val_data,
    verbose=1,
    callbacks=[tensorboard_callback]
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x17e0533e250>

In [6]:
model.summary()

Model: "outer_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inner_model (InnerModel)     multiple                  239445    
_________________________________________________________________
distance_layer (DistanceLaye multiple                  0         
_________________________________________________________________
output_layer (Dense)         multiple                  2         
Total params: 239,447
Trainable params: 239,367
Non-trainable params: 80
_________________________________________________________________


In [9]:

model.save_weights("logs\\fit\\20210827-092654\weights\\")

In [2]:
def compare_representations(input_a, input_b, model, debug=False):
    outer_model = model
    prediction = outer_model((input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b))))

    if debug:
        print(f"Representation of A: {outer_model.repr_a}")
        print(f"Representation of B: {outer_model.repr_b}")
        print(f"Similarity: {outer_model.cosine_similarity}")
        print(f"Prediction: {prediction}")


    return outer_model.cosine_similarity, (outer_model.repr_a,outer_model.repr_a)

print(f"Comparing '{governors_list[0]}' and '{combination_list[0]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[0],
    model,
    True
)


Comparing 'john rettie mckernan' and 'john buchanan floyd jr.'
Representation of A: [[ 1.3113 -3.1576 -2.6412  4.9379 -0.8792 -3.501   4.5565  5.1792  0.2441
  -3.719  -5.2579  4.2294  3.8527  4.9473 -1.5105  1.423  -6.1416 -0.9078
  -0.0907 -1.2577  3.4145 -5.8586 -4.3187  4.8942 -0.961   3.506  -4.3536
   0.0477  1.8543  4.9001 -4.2877 -1.0443 -2.4858  2.3508  5.356  -0.1727
  -1.1499 -0.8238 -2.5744 -6.1082]]
Representation of B: [[ 1.2868 -3.1029 -2.6098  4.8804 -0.8795 -3.4701  4.511   5.1047  0.257
  -3.6564 -5.1857  4.1685  3.796   4.8722 -1.5052  1.4165 -6.0709 -0.8931
  -0.1294 -1.2394  3.386  -5.776  -4.2863  4.8321 -0.9382  3.4597 -4.2843
   0.0644  1.8558  4.8357 -4.2032 -1.0389 -2.4469  2.3051  5.2927 -0.1739
  -1.1233 -0.7923 -2.5257 -6.0225]]
Similarity: [1.]
Prediction: [[0.0821]]


In [3]:

print(f"Comparing '{governors_list[0]}' and '{combination_list[1]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[1],
    model,
    True
)


Comparing 'john rettie mckernan' and 'william prentice cooper'
Representation of A: [[ 1.3113 -3.1576 -2.6412  4.9379 -0.8792 -3.501   4.5565  5.1792  0.2441
  -3.719  -5.2579  4.2294  3.8527  4.9473 -1.5105  1.423  -6.1416 -0.9078
  -0.0907 -1.2577  3.4145 -5.8586 -4.3187  4.8942 -0.961   3.506  -4.3536
   0.0477  1.8543  4.9001 -4.2877 -1.0443 -2.4858  2.3508  5.356  -0.1727
  -1.1499 -0.8238 -2.5744 -6.1082]]
Representation of B: [[ 1.2323 -2.9138 -2.4411  4.6012 -0.812  -3.2063  4.164   4.7444  0.1703
  -3.377  -4.7898  3.9085  3.4637  4.5014 -1.434   1.2176 -5.6309 -0.846
  -0.0888 -1.1189  3.1254 -5.361  -4.0354  4.4768 -0.8554  3.2395 -3.9689
   0.0305  1.6913  4.5265 -3.877  -1.0376 -2.3559  2.1603  4.9619 -0.2654
  -1.062  -0.7276 -2.3057 -5.6336]]
Similarity: [1.]
Prediction: [[0.0821]]


In [126]:
def text_from_ids(ids,dict=tk.word_index):
    inv_dict = {v: k for k,v in tk.word_index.items()}
    char_list = []
    for id in ids:
        if id not in inv_dict:
            char = "_"
        else:
            char = inv_dict[id]
        char_list.append(char)

    return("".join(char_list))

print(text_from_ids(governor_seq[0]))
print(text_from_ids(combination_seq[1]))

print(governor_seq[0])
print(combination_seq[1])


john rettie mckernan__________
william prentice cooper_______
[15.  8. 11.  5.  2.  6.  3. 13. 13.  9.  3.  2. 12. 16. 22.  3.  6.  5.
  4.  5.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[17.  9.  7.  7.  9.  4. 12.  2. 23.  6.  3.  5. 13.  9. 16.  3.  2. 16.
  8.  8. 23.  3.  6.  0.  0.  0.  0.  0.  0.  0.]


In [4]:
my_test = ["Grigory Sharkov","Sharkov Gregory"]
my_test_seq = preprocess_list(my_test, tk)

print(f"Comparing '{my_test[0]}' and '{my_test[1]}'")
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model,
    True
)

Comparing 'Grigory Sharkov' and 'Sharkov Gregory'
Representation of A: [[-0.3085  0.9531  0.7189 -1.2293  0.418   1.0615 -1.1996 -1.0072  0.0978
   0.791   0.9324 -1.0137 -0.6765 -1.1955  0.4227 -0.1171  1.3401  0.2393
   0.0404  0.5425 -0.8156  1.2556  1.1466 -1.0558  0.146  -0.8501  1.0749
   0.0824 -0.5171 -1.1701  0.8718  0.2755  0.5948 -0.4925 -1.2097  0.251
   0.1249  0.0709  0.5486  1.4649]]
Representation of B: [[-0.4535  1.1885  0.8246 -1.4055  0.0817  1.0347 -1.3716 -1.3508  0.0309
   0.998   1.3581 -1.3328 -1.03   -1.5905  0.3519 -0.395   1.6694  0.2542
   0.1125  0.5539 -0.988   1.7214  1.1846 -1.4005  0.1991 -0.9717  1.3806
   0.3149 -0.5788 -1.5331  1.3518  0.3213  0.6312 -0.6265 -1.5539 -0.0983
   0.3006  0.1985  0.8628  1.7588]]
Similarity: [0.994]
Prediction: [[0.0839]]


In [5]:
def check_similarity(string_a, string_b, tokeniser, model, match=None, debug=False):
    input_seq = preprocess_list([string_a,string_b],tokeniser,30)
    input_seq = [x.reshape(1, len(x)) for x in input_seq]
    prediction = model((input_seq[0],input_seq[1]))[0][0]

    if debug:
        print("********************************")
        print(f"Comparing: '{string_a}' and '{string_b}'")
        print(f"Cosine similarity = {model.cosine_similarity[0]:.4f}, prediction={prediction:.4f} true similarity = {match}")
    return np.round(model.cosine_similarity[0],4)

for i in range(50):
    check_similarity(governors_list[i],combination_list[i],tk, model,match[i],True)

# predictions = []
# for name_a, name_b, _match in zip(governors_list, combination_list, match):
#     predictions.append(check_similarity(name_a,name_b,tk,model,_match,False))

# print(predictions)


********************************
Comparing: 'john rettie mckernan' and 'john buchanan floyd jr.'
Cosine similarity = 1.0000, prediction=0.0821 true similarity = 0
********************************
Comparing: 'william j. janklow' and 'william prentice cooper'
Cosine similarity = 1.0000, prediction=0.0821 true similarity = 0
********************************
Comparing: 'william henry seward' and 'william grant stratton'
Cosine similarity = 1.0000, prediction=0.0821 true similarity = 0
********************************
Comparing: 'stevens thomson mason' and 'tompson stevens mason'
Cosine similarity = 0.9983, prediction=0.0826 true similarity = 1
********************************
Comparing: 'william pinkney whyte' and 'william john bulow'
Cosine similarity = 1.0000, prediction=0.0821 true similarity = 0
********************************
Comparing: 'john larue helm' and 'john price buchanan'
Cosine similarity = 1.0000, prediction=0.0821 true similarity = 0
********************************
Compar

In [80]:
set((list(val_data.as_numpy_iterator())[0][-1]))

{0}

In [119]:
combined_data[combined_data.match==0].groupby(["combinations","governor"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,match
combinations,governor,Unnamed: 2_level_1
aaron ogden,aaron thomas bliss,1
aaron ogden,aaron venable brown,1
aaron ogden,david ogden watkins,1
aaron ogden,samuel aaron baker,1
aaron thomas bliss,aaron ogden,1
...,...,...
zenas perry moody,edward alysworth perry,1
zenas perry moody,madison starke perry,1
zenas perry moody,moody currier,1
zenas perry moody,oliver perry morton,1


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10,6))
plt.scatter(predictions,match)
plt.xlabel("Predicted similarity")
plt.ylabel("Actual similarity")
plt.show()
