In [1]:
import os
import datetime as datetime
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

np.set_printoptions(precision=4)

path = "data\\combinations\\"
true_data = pd.read_csv(path+"governors_true_match.csv",sep=";")
false_data = pd.read_csv(path+"governors_false_match.csv",sep=";")
combined_data = pd.concat([true_data,false_data])
combined_data = combined_data.sample(frac=1,random_state=20210826)
names = sorted(set(list(combined_data.governor) + list(combined_data.combinations)))
words = sorted(set(word for name in list(map(str.split,names)) for word in name))
vocab = sorted(set(character for word in words for character in word))

governors_list = list(combined_data.governor)
combination_list = list(combined_data.combinations)
match = list(combined_data.match)

tk = Tokenizer(num_words=None, char_level=True, oov_token="UNK")
tk.fit_on_texts(governors_list+combination_list)

def preprocess_list(lst,tokenizer,max_len=None):
    return_seq = tokenizer.texts_to_sequences(lst)
    return np.array(pad_sequences(return_seq, maxlen=max_len,padding="post"),dtype="float32")

governor_seq = preprocess_list(governors_list,tk,30)
combination_seq = preprocess_list(combination_list,tk,30)
match_seq = np.array(match)

#let's crate the training dataset and do the splits
data = tf.data.Dataset.from_tensor_slices(((governor_seq,combination_seq),match_seq)).shuffle(10).batch(1000)
train_ratio = .6
val_ratio = .2
test_ratio = .2

train_batches = int(len(data) * train_ratio)
val_batches = int(len(data) * val_ratio)
test_batches = int(len(data) * test_ratio)

train_data = data.take(train_batches)
test_data = data.skip(train_batches)
val_data = test_data.take(val_batches)
test_data = test_data.skip(test_batches)


In [2]:
from model_settings import InnerModelSettings, OuterModelSettings, FitSettings
from char_level_rnn_with_attention import OuterModel

def create_model(inner_settings:InnerModelSettings,outer_settings:OuterModelSettings):
    model = OuterModel(inner_settings)
    
    model.compile(
        loss= outer_settings.loss, 
        optimizer=outer_settings.optimizer,
        metrics=outer_settings.metrics,
    )

    return model

def fit_model(model: tf.keras.Model,
              train_data: tf.data.Dataset,
              val_data: tf.data.Dataset,
              fit_settings: FitSettings,
              print_summary:bool=False):

    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    print(f"logs will be saved to: {log_dir}")
    checkpoint_path = log_dir + "/weights/cp-{epoch:02d}.ckpt"

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir, 
        histogram_freq=1
    )

    checkpoints_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath = checkpoint_path,
        verboise=1,
        save_weights_only=True,
        save_best_only=True
    )

    fit_settings.callbacks = [tensorboard_callback,checkpoints_callback]

    model.fit(
        train_data,
        batch_size = fit_settings.batch_size,
        epochs = fit_settings.epochs,
        validation_data = val_data,
        verbose=fit_settings.verbose,
        callbacks=fit_settings.callbacks
    )
    
    if print_summary:
        print(model.summary())
        
    return model

def compare_representations(input_a, input_b, model, debug=False,give_representations=False):
    outer_model = model
    prediction = outer_model((input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b))))

    if debug:
        if give_representations:
            print(f"Representation of A: {outer_model.repr_a}")
            print(f"Representation of B: {outer_model.repr_b}")
        print(f"Similarity: {outer_model.cosine_similarity[0]:.4f}")
        print(f"Prediction: {prediction[0][0]:.4f} => {np.round(prediction[0][0],0)}")


    return outer_model.cosine_similarity, (outer_model.repr_a,outer_model.repr_a)

# Now, let's train the base model


In [4]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

inner_settings_1 = InnerModelSettings(
    input_embedding = 129,
    n_embedding_dims = 512,
    n_gru = 40,
    n_dense = 80,
    n_units_attention=40
)

outer_settings_1 = OuterModelSettings(
    loss = tf.keras.losses.BinaryCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(1e-4),
    metrics = [tf.keras.metrics.BinaryAccuracy(name="accuracy"),
               tf.keras.metrics.Precision(name="precision")]
)

fit_settings_1 = FitSettings(
    batch_size = 1000,
    epochs = 10,
    verbose=2,
    callbacks=[]
)

model_1 = create_model(inner_settings_1,outer_settings_1)
model_1 = fit_model(model_1,train_data,val_data,fit_settings_1,True)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
logs will be saved to: logs/fit/20210906-132920
Epoch 1/10
184/184 - 55s - loss: 0.6854 - accuracy: 0.3820 - precision: 0.3623 - val_loss: 0.6876 - val_accuracy: 0.3694 - val_precision: 0.3553
Epoch 2/10
184/184 - 26s - loss: 0.6697 - accuracy: 0.4416 - precision: 0.3862 - val_loss: 0.6678 - val_accuracy: 0.4337 - val_precision: 0.3802
Epoch 3/10
184/184 - 26s - loss: 0.6605 - accuracy: 0.4999 - precision: 0.4127 - val_loss: 0.6550 - val_accuracy: 0.5035 - val_precision: 0.4116
Epoch 4/10
184/184 - 26s - loss: 0.6531 - accuracy: 0.5530 - precision: 0.4402 - val_loss: 0.6476 - val_accuracy: 0.5612 - val_precision: 0.4418
Epoch 5/10
184/184 - 26s - loss: 0.6463 - accuracy: 0.6088 - precision: 0.4733 - val_loss: 0.6405 - val_accuracy: 0.6204 - val_precision: 0.4779
Epoch 6/10
184/184 - 26s - loss: 0.6400 - accuracy: 0.6580 - precision: 0.5071 - val_loss: 0.6343 - val_accuracy: 0.6741 - val_precision:

In [5]:
model_1.inner_model.summary()

Model: "inner_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inner_embedding (Embedding)  multiple                  66560     
_________________________________________________________________
inner_dense_after_embedding  multiple                  0 (unused)
_________________________________________________________________
dropout_1 (Dropout)          multiple                  0         
_________________________________________________________________
inner_gru_1 (GRU)            multiple                  0 (unused)
_________________________________________________________________
inner_gru_2 (GRU)            multiple                  0 (unused)
_________________________________________________________________
attention_1 (Attention)      multiple                  4921      
_________________________________________________________________
inner_bidirectional_1 (Bidir multiple                

In [6]:
print(f"Comparing '{governors_list[0]}' and '{combination_list[0]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[0],
    model_1,
    True
)


Comparing 'john rettie mckernan' and 'john buchanan floyd jr.'
Similarity: 0.1200
Prediction: 0.4770 => 0.0


In [7]:

print(f"Comparing '{governors_list[0]}' and '{combination_list[1]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[1],
    model_1,
    True
)


Comparing 'john rettie mckernan' and 'william prentice cooper'
Similarity: 0.1764
Prediction: 0.4876 => 0.0


In [8]:
my_test = ["Bill Gates","Gates William"]
my_test_seq = preprocess_list(my_test, tk)

print(f"Comparing '{my_test[0]}' and '{my_test[1]}'")
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model_1,
    True
)

Comparing 'Bill Gates' and 'Gates William'
Similarity: 0.6044
Prediction: 0.5675 => 1.0


In [10]:
def check_similarity(string_a, string_b, tokeniser, model, match=None, debug=False):
    input_seq = preprocess_list([string_a,string_b],tokeniser,30)
    input_seq = [x.reshape(1, len(x)) for x in input_seq]
    prediction = model((input_seq[0],input_seq[1]))[0][0]

    if debug:
        print("********************************")
        print(f"Comparing: '{string_a}' and '{string_b}'")
        print(f"Cosine similarity = {model.cosine_similarity[0]:.4f}, prediction={prediction:.4f} true similarity = {match}")
    return np.round(model.cosine_similarity[0],4)

for i in range(50):
    check_similarity(governors_list[i],combination_list[i],tk, model_1,match[i],True)

# predictions = []
# for name_a, name_b, _match in zip(governors_list, combination_list, match):
#     predictions.append(check_similarity(name_a,name_b,tk,model,_match,False))

# print(predictions)


********************************
Comparing: 'john rettie mckernan' and 'john buchanan floyd jr.'
Cosine similarity = 0.1200, prediction=0.4770 true similarity = 0
********************************
Comparing: 'william j. janklow' and 'william prentice cooper'
Cosine similarity = 0.1534, prediction=0.4833 true similarity = 0
********************************
Comparing: 'william henry seward' and 'william grant stratton'
Cosine similarity = 0.2526, prediction=0.5019 true similarity = 0
********************************
Comparing: 'stevens thomson mason' and 'tompson stevens mason'
Cosine similarity = 0.6984, prediction=0.5847 true similarity = 1
********************************
Comparing: 'william pinkney whyte' and 'william john bulow'
Cosine similarity = 0.0390, prediction=0.4619 true similarity = 0
********************************
Comparing: 'john larue helm' and 'john price buchanan'
Cosine similarity = 0.0119, prediction=0.4569 true similarity = 0
********************************
Compar

In [None]:
def text_from_ids(ids,dict=tk.word_index):
    inv_dict = {v: k for k,v in tk.word_index.items()}
    char_list = []
    for id in ids:
        if id not in inv_dict:
            char = "_"
        else:
            char = inv_dict[id]
        char_list.append(char)

    return("".join(char_list))

print(text_from_ids(governor_seq[0]))
print(text_from_ids(combination_seq[1]))

print(governor_seq[0])
print(combination_seq[1])

# Now let's work with restoring the model

In [14]:
latest_chkpt = tf.train.latest_checkpoint("logs\\fit\\20210906-132920\\weights\\")

model_2 = create_model(inner_settings_1,outer_settings_1)
model_2.load_weights(latest_chkpt)

similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model_2,
    True
)

Similarity: 0.6044
Prediction: 0.5675 => 1.0


In [15]:
model_2 = fit_model(model_2,train_data,val_data,fit_settings_1,True)

logs will be saved to: logs/fit/20210906-134539
Epoch 1/10
184/184 - 46s - loss: 0.6121 - accuracy: 0.8233 - precision: 0.6671 - val_loss: 0.6063 - val_accuracy: 0.8330 - val_precision: 0.6782
Epoch 2/10
184/184 - 26s - loss: 0.6068 - accuracy: 0.8453 - precision: 0.6975 - val_loss: 0.6011 - val_accuracy: 0.8485 - val_precision: 0.6998
Epoch 3/10
184/184 - 26s - loss: 0.6017 - accuracy: 0.8599 - precision: 0.7194 - val_loss: 0.5956 - val_accuracy: 0.8610 - val_precision: 0.7187
Epoch 4/10
184/184 - 26s - loss: 0.5964 - accuracy: 0.8732 - precision: 0.7405 - val_loss: 0.5904 - val_accuracy: 0.8721 - val_precision: 0.7364
Epoch 5/10
184/184 - 26s - loss: 0.5914 - accuracy: 0.8823 - precision: 0.7559 - val_loss: 0.5851 - val_accuracy: 0.8829 - val_precision: 0.7539
Epoch 6/10
184/184 - 26s - loss: 0.5861 - accuracy: 0.8922 - precision: 0.7734 - val_loss: 0.5796 - val_accuracy: 0.8918 - val_precision: 0.7701
Epoch 7/10
184/184 - 26s - loss: 0.5809 - accuracy: 0.8997 - precision: 0.7873 - v

In [16]:
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model_2,
    True
)

Similarity: 0.6409
Prediction: 0.5642 => 1.0


In [17]:
for i in range(50):
    check_similarity(governors_list[i],combination_list[i],tk, model_2,match[i],True)

********************************
Comparing: 'john rettie mckernan' and 'john buchanan floyd jr.'
Cosine similarity = 0.0821, prediction=0.4312 true similarity = 0
********************************
Comparing: 'william j. janklow' and 'william prentice cooper'
Cosine similarity = 0.0818, prediction=0.4311 true similarity = 0
********************************
Comparing: 'william henry seward' and 'william grant stratton'
Cosine similarity = 0.4644, prediction=0.5223 true similarity = 0
********************************
Comparing: 'stevens thomson mason' and 'tompson stevens mason'
Cosine similarity = 0.8761, prediction=0.6187 true similarity = 1
********************************
Comparing: 'william pinkney whyte' and 'william john bulow'
Cosine similarity = 0.0069, prediction=0.4136 true similarity = 0
********************************
Comparing: 'john larue helm' and 'john price buchanan'
Cosine similarity = 0.0197, prediction=0.4166 true similarity = 0
********************************
Compar

In [20]:
my_test = ["Grigory Sharkov","Sharkov Gregory"]
my_test_seq = preprocess_list(my_test, tk)

print(f"Comparing '{my_test[0]}' and '{my_test[1]}'")
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model_2,
    True
)

Comparing 'Grigory Sharkov' and 'Sharkov Gregory'
Similarity: 0.9335
Prediction: 0.6315 => 1.0
