In [2]:
import tensorflow as tf
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [3]:
# Print NumPy floats with 4 digits of precision so array dumps in later cells stay compact.
np.set_printoptions(precision=4)

In [4]:
# Build the data directory with os.path.join so the separator matches the host OS
# (the previous hard-coded "data\\combinations\\" only resolves on Windows).
# The trailing "" keeps a trailing separator, so `path + filename` still works.
path = os.path.join("data", "combinations", "")
true_data = pd.read_csv(path + "governors_true_match.csv", sep=";")
false_data = pd.read_csv(path + "governors_false_match.csv", sep=";")

# Stack both files into one frame: rows of the "true" file first, then the "false" file.
combined_data = pd.concat([true_data, false_data])

# Every distinct string that occurs in either compared column.
names = sorted(set(list(combined_data.governor) + list(combined_data.combinations)))
# Distinct whitespace-separated tokens across all of those strings.
words = sorted(set(word for name in list(map(str.split, names)) for word in name))
# Distinct characters occurring in those tokens.
vocab = sorted(set(character for word in words for character in word))

In [5]:
# Pull the two text columns and the label column out of the frame as plain lists.
governors_list, combination_list, match = (
    list(combined_data[column]) for column in ("governor", "combinations", "match")
)

# Character-level tokenizer fitted on every string from both text columns.
tk = Tokenizer(char_level=True, oov_token="UNK", num_words=None)
tk.fit_on_texts(governors_list + combination_list)

def preprocess_list(lst, tokenizer, max_len=None):
    """
    Convert a list of strings into a padded float32 array of token ids.

    Args:
        lst: texts to encode.
        tokenizer: object providing ``texts_to_sequences`` (a fitted Keras ``Tokenizer`` here).
        max_len: value forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        ``np.ndarray`` of dtype float32 holding the post-padded id sequences.
    """
    encoded = tokenizer.texts_to_sequences(lst)
    padded = pad_sequences(encoded, maxlen=max_len, padding="post")
    return np.array(padded, dtype="float32")

# Encode both text columns with the shared tokenizer, padded/truncated to 30 ids each.
governor_seq, combination_seq = (
    preprocess_list(texts, tk, 30) for texts in (governors_list, combination_list)
)
# Lazily pairs row i of one array with row i of the other (a zip object is single-use).
features = zip(governor_seq, combination_seq)

In [137]:
from dataclasses import dataclass

@dataclass
class InnerModelSettings:
    """
    Dataclass for storing inner model settings.

    The three values are read by ``InnerModel.__init__`` to size its layers.
    """
    # Output dimension passed to the Embedding layer.
    n_embedding_dims: int
    # Number of units passed to the GRU layer.
    n_gru: int
    # Number of units passed to the Dense layer.
    n_dense: int

class InnerModel(tf.keras.Model):
    """
    Encoder used inside the outer model.

    Turns a batch of token-id sequences into one vector per sequence by chaining
    Embedding -> Bidirectional(GRU) -> Dense. Those vectors are what the outer
    model compares.
    """
    def __init__(self, settings: InnerModelSettings):
        super().__init__()
        self.n_embedding_dims = settings.n_embedding_dims
        self.n_gru = settings.n_gru
        self.n_dense = settings.n_dense

        # Vocabulary size comes from the notebook-level tokenizer `tk`;
        # one extra row is added on top of the tokenizer's indices.
        vocabulary_size = len(tk.word_index) + 1
        self.embedding = tf.keras.layers.Embedding(
            vocabulary_size, self.n_embedding_dims, name="inner_embedding"
        )
        self.gru = tf.keras.layers.GRU(self.n_gru, name="gru")
        self.bi_gru = tf.keras.layers.Bidirectional(self.gru, name="inner_bidirectional")
        self.dense = tf.keras.layers.Dense(self.n_dense, name="inner_dense")

    def call(self, x, training=False):
        """Encode `x` and return the output of the final Dense layer."""
        embedded = self.embedding(x, training=training)
        encoded = self.bi_gru(embedded, training=training)
        return self.dense(encoded, training=training)

class DistanceLayer(tf.keras.layers.Layer):
    """
    Layer that scores two batches of vectors with a rescaled cosine measure.

    ``tf.keras.losses.cosine_similarity`` is a loss and returns the *negative*
    cosine similarity, so ``(1 - loss) / 2`` equals ``(1 + cos) / 2``: identical
    directions score 1 and opposite directions score 0.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, input_a, input_b):
        """Return the rescaled score for each pair of rows in `input_a` / `input_b`."""
        negative_cosine = tf.keras.losses.cosine_similarity(input_a, input_b)
        return (1 - negative_cosine) / 2

class OuterModel(tf.keras.Model):
    """
    Outer model. It takes two inputs (one sequence batch for each string compared),
    transforms both into vector representations with the same inner model instance,
    and returns the output of the distance layer for the two representations.
    """
    def __init__(self, settings: InnerModelSettings):
        super().__init__()
        # Sub-layers.
        self.inner_model = InnerModel(settings)
        self.distance_layer = DistanceLayer()
        # Created here but not applied in call().
        self.output_layer = tf.keras.layers.Dense(1, activation="sigmoid", name="output_layer")

        # Results of the most recent forward pass, kept for inspection.
        self.repr_a = []
        self.repr_b = []
        self.cosine_similarity = 0

    def call(self, inputs, training=False):
        """Encode inputs[0] and inputs[1] and return their distance-layer score."""
        first_input = inputs[0]
        second_input = inputs[1]

        self.repr_a = self.inner_model(first_input, training=training)
        self.repr_b = self.inner_model(second_input, training=training)
        self.cosine_similarity = self.distance_layer(
            self.repr_a, self.repr_b, training=training
        )

        return self.cosine_similarity

    def train_step(self, data):
        """Run one optimisation step on a (features, targets) batch and return the metric values."""
        features, targets = data

        with tf.GradientTape() as tape:
            # Forward pass and loss are recorded on the tape.
            predictions = self(features, training=True)
            loss = self.compiled_loss(
                targets, predictions, regularization_losses=self.losses
            )

        # Backward pass and weight update.
        weights = self.trainable_variables
        grads = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        # Refresh compiled metrics and report the current values by name.
        self.compiled_metrics.update_state(targets, predictions)
        return {metric.name: metric.result() for metric in self.metrics}

In [68]:
# Shape of the one-hot labels, before and after a reshape to (len(match), -1).
print(
    to_categorical(match).shape,
    to_categorical(match).reshape(len(match), -1).shape,
    sep="\n",
)


(307377, 2)
(307377, 2)


In [110]:
match_seq = np.array(match)

# The rows are ordered "all true matches, then all false matches" (see pd.concat above),
# so a 50-element shuffle buffer only mixes neighbours and leaves the splits class-skewed.
# Shuffle over the whole dataset instead, and freeze the order
# (reshuffle_each_iteration=False) so take()/skip() select the same rows on every
# pass. Otherwise rows migrate between train/val/test from epoch to epoch.
data = (
    tf.data.Dataset.from_tensor_slices(((governor_seq, combination_seq), match_seq))
    .shuffle(buffer_size=len(match_seq), seed=20211808, reshuffle_each_iteration=False)
    .batch(200)
)

train_ratio = .6
val_ratio = .2
test_ratio = .2

train_batches = int(len(data) * train_ratio)
val_batches = int(len(data) * val_ratio)
test_batches = int(len(data) * test_ratio)

# Consecutive, non-overlapping batch ranges: [train | val | test = remainder].
train_data = data.take(train_batches)
test_data = data.skip(train_batches)
val_data = test_data.take(val_batches)
# Skip exactly the batches handed to validation (previously skipped `test_batches`,
# which only worked because both ratios happen to be equal).
test_data = test_data.skip(val_batches)

# Feature-only / label-only views of each split. Because the shuffle order is fixed,
# the x_* and y_* views of a split iterate in the same row order.
x_train = train_data.map(lambda feature, outcome: feature)
y_train = train_data.map(lambda feature, outcome: outcome)

x_val = val_data.map(lambda feature, outcome: feature)
y_val = val_data.map(lambda feature, outcome: outcome)

x_test = test_data.map(lambda feature, outcome: feature)
y_test = test_data.map(lambda feature, outcome: outcome)

In [140]:
inner_model_settings = InnerModelSettings(
    n_embedding_dims = 64,
    n_gru = 10,
    n_dense = 10
)

model = OuterModel(inner_model_settings)

model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    # The model emits one score per pair and the labels are 0/1, so request binary
    # accuracy (threshold 0.5) explicitly instead of letting Keras infer the variant
    # from the string 'accuracy'. The metric keeps the name "accuracy" in the logs.
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")],
)

# `train_data` is a tf.data.Dataset that is already batched (200 rows per batch), so
# no batch_size is passed: Keras documents that it must not be given for datasets.
model.fit(
    train_data,
    epochs = 50,
    validation_data = val_data,
    verbose=1
)

Epoch 1/50
Epoch 2/50
Epoch 3/50

KeyboardInterrupt: 

In [75]:
class ShapeChecker():
  """
  Callable that checks tensor shapes against named axes.

  The first time an axis name is seen its length is cached; later calls raise
  ValueError if the same name shows up with a different length. An int given in
  place of a name is compared directly against that axis length.
  """
  def __init__(self):
    # Maps each axis name seen so far to the length it had when first seen.
    self.shapes = {}

  def __call__(self, tensor, names, broadcast=False):
    # Checks run only in eager mode; otherwise this is a no-op.
    if not tf.executing_eagerly():
      return

    # A single axis name may be passed as a bare string.
    axis_names = (names,) if isinstance(names, str) else names

    shape = tf.shape(tensor)
    rank = tf.rank(tensor)

    if rank != len(axis_names):
      raise ValueError(f'Rank mismatch:\n'
                       f'    found {rank}: {shape.numpy()}\n'
                       f'    expected {len(axis_names)}: {axis_names}\n')

    for axis, name in enumerate(axis_names):
      expected = name if isinstance(name, int) else self.shapes.get(name, None)
      actual = shape[axis]

      # With broadcast=True, a length-1 axis is accepted without comparison or caching.
      if (broadcast and actual == 1):
        continue

      if expected is None:
        # First sighting of this axis name: remember its length.
        self.shapes[name] = actual
      elif actual != expected:
        raise ValueError(f"Shape mismatch for dimension: '{name}'\n"
                         f"    found: {actual}\n"
                         f"    expected: {expected}\n")

In [188]:
def compare_representations(input_a, input_b, debug=False):
    """
    Run a freshly constructed (untrained) OuterModel on a single pair of sequences.

    Args:
        input_a: array for the first string; reshaped to (-1, len(input_a)) before the call.
        input_b: array for the second string; reshaped to (-1, len(input_b)) before the call.
        debug: when True, print both representations and the similarity.

    Returns:
        ``(similarity, (repr_a, repr_b))`` as stored on the model by its forward pass.
    """
    inner_model_settings = InnerModelSettings(
        n_embedding_dims = len(tk.word_index),
        n_gru = 10,
        n_dense = 10
    )
    outer_model = OuterModel(inner_model_settings)
    # Give each sequence a leading axis so it is passed as a 2-D array.
    outer_model((input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b))))

    if debug:
        print(f"Representation of A: {outer_model.repr_a}")
        print(f"Representation of B: {outer_model.repr_b}")
        print(f"Similarity: {outer_model.cosine_similarity}")

    # Return both representations (the second element used to be repr_a again).
    return outer_model.cosine_similarity, (outer_model.repr_a,outer_model.repr_b)

# Copies of the full sequence arrays with an extra leading axis of length 1.
governor_input = np.expand_dims(governor_seq, axis=0)
combination_input = np.expand_dims(combination_seq, axis=0)

print(f"Comparing '{governors_list[0]}' and '{combination_list[0]}'")

# Compare the first encoded pair, with debug printing enabled.
similarity, representations = compare_representations(
    governor_seq[0], combination_seq[0], True
)


Comparing 'a.p. lutali' and 'a.p. lutali'
Representation of A: [[-0.0379  0.0017  0.0177 -0.0266  0.0065  0.0315 -0.0477 -0.0332 -0.0195
   0.0385]]
Representation of B: [[-0.0379  0.0017  0.0177 -0.0266  0.0065  0.0315 -0.0477 -0.0332 -0.0195
   0.0385]]
Similarity: [1.]


In [215]:
import random
from itertools import filterfalse
# Seed the stdlib RNG so the random.sample-based train/test split below is repeatable.
random.seed(20211808)

def form_data(input_a,input_b,match, ind):
    """
    Select the rows listed in `ind` from both inputs and from the labels.

    Args:
        input_a, input_b: NumPy arrays indexed along their first axis.
        match: labels; may be a list or an array (converted with np.asarray so that
            indexing with a list of positions works for plain lists too).
        ind: list of row positions to keep.

    Returns:
        ``(pairs, labels)`` where `pairs` is a zip over the selected rows of both
        inputs (single-use iterator) and `labels` is a NumPy array.
    """
    labels = np.asarray(match)
    return zip(input_a[ind],input_b[ind]), labels[ind]

def prepare_train_test_data(governor_seq,combination_seq,match,ratio):
    """
    Randomly split the paired sequences and their labels into train and test parts.

    Args:
        governor_seq, combination_seq: arrays of equal length (one row per pair).
        match: label for each pair.
        ratio: fraction of rows assigned to the training part.

    Returns:
        ``(x_train, y_train, x_test, y_test)``, or None when the two sequence
        arrays differ in length.
    """
    if len(governor_seq) != len(combination_seq):
        return None
    n_records = len(governor_seq)
    train_indices = random.sample(range(0,n_records),int(float(n_records) * ratio))
    print(f"selected {len(train_indices)} for training")
    # Set membership keeps the complement computation linear; testing against the
    # list was quadratic in the number of records.
    train_index_set = set(train_indices)
    test_indices = [index for index in range(n_records) if index not in train_index_set]
    print(f"selected {len(test_indices)} for testing")

    x_train, y_train = form_data(governor_seq,combination_seq,match,train_indices)
    x_test, y_test = form_data(governor_seq,combination_seq,match,test_indices)

    return x_train, y_train, x_test, y_test

# Split with 60% of the rows for training and the rest for testing.
# NOTE(review): this rebinds x_train / y_train / x_test / y_test, which the tf.data
# split cell earlier in the notebook also assigns. Confirm which one is intended.
x_train, y_train, x_test, y_test = prepare_train_test_data(governor_seq,combination_seq,match,.6)

In [None]:
# Second model instance; the embedding width equals the tokenizer's vocabulary size.
inner_model_settings = InnerModelSettings(
    n_embedding_dims=len(tk.word_index), n_gru=10, n_dense=10
)

model_1 = OuterModel(inner_model_settings)
# Compiled with no arguments (no loss, optimizer or metrics are passed).
model_1.compile()

In [169]:
# Shape of the encoded governors, then the shape with an extra leading axis.
print(
    governor_seq.shape,
    governor_seq.reshape((1,) + governor_seq.shape).shape,
    sep="\n",
)

(307377, 11)
(1, 307377, 11)


In [122]:
def text_from_ids(ids,dict=None):
    """
    Decode a sequence of token ids back into a string.

    Args:
        ids: iterable of token ids (ints, or floats holding integral values).
        dict: mapping from character to id; defaults to the notebook tokenizer's
            ``tk.word_index``, looked up at call time. The parameter keeps its
            original name (which shadows the builtin) for caller compatibility.

    Returns:
        The decoded string; any id missing from the mapping is rendered as "_".
    """
    # Resolve the default lazily so the function can be defined before `tk` exists
    # and so an explicitly passed mapping is honoured (it used to be ignored).
    if dict is None:
        dict = tk.word_index
    index_to_char = {index: char for char, index in dict.items()}
    return "".join(index_to_char.get(token_id, "_") for token_id in ids)

# Decoded text of the first governor and the second combination ...
print(text_from_ids(governor_seq[0]), text_from_ids(combination_seq[1]), sep="\n")

# ... followed by the raw id arrays they were decoded from.
print(governor_seq[0], combination_seq[1], sep="\n")


a.p. lutali
lutali a.p.
[ 4. 25. 23. 25.  2.  7. 20. 13.  4.  7.  9.]
[ 7. 20. 13.  4.  7.  9.  2.  4. 25. 23. 25.]


In [126]:
def count_elements(lst):
    """
    Count how often each distinct value occurs in a 1-D NumPy array.

    Returns a dict keyed by ``int(value)``, in ascending value order, whose
    entries are the number of elements equal to that value.
    """
    return {int(value): len(lst[lst == value]) for value in sorted(set(lst))}

# Id histograms for the first governor and the first two combinations.
print(
    count_elements(governor_seq[0]),
    count_elements(combination_seq[0]),
    count_elements(combination_seq[1]),
    sep="\n",
)


{2: 1, 4: 2, 7: 2, 9: 1, 13: 1, 20: 1, 23: 1, 25: 2}
{2: 1, 4: 2, 7: 2, 9: 1, 13: 1, 20: 1, 23: 1, 25: 2}
{2: 1, 4: 2, 7: 2, 9: 1, 13: 1, 20: 1, 23: 1, 25: 2}


In [84]:
# Scratch check: flatten a 2x3 nested list, then reshape it into a single row.
t1 = [[1, 2, 3], [4, 5, 6]]
t2 = tf.reshape(t1, [6])
print(tf.reshape(t2, (-1, len(t2))))

tf.Tensor([[1 2 3 4 5 6]], shape=(1, 6), dtype=int32)
