In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from inner_model_settings import InnerModelSettings
from char_level_rnn_with_attention import OuterModel

np.set_printoptions(precision=4)

path = "data\\combinations\\"
true_data = pd.read_csv(path+"governors_true_match.csv",sep=";")
false_data = pd.read_csv(path+"governors_false_match.csv",sep=";")
combined_data = pd.concat([true_data,false_data])
names = sorted(set(list(combined_data.governor) + list(combined_data.combinations)))
words = sorted(set(word for name in list(map(str.split,names)) for word in name))
vocab = sorted(set(character for word in words for character in word))

governors_list = list(combined_data.governor)
combination_list = list(combined_data.combinations)
match = list(combined_data.match)

tk = Tokenizer(num_words=None, char_level=True, oov_token="UNK")
tk.fit_on_texts(governors_list+combination_list)

def preprocess_list(lst,tokenizer,max_len=None):
    return_seq = tokenizer.texts_to_sequences(lst)
    return np.array(pad_sequences(return_seq, maxlen=max_len,padding="post"),dtype="float32")

governor_seq = preprocess_list(governors_list,tk,30)
combination_seq = preprocess_list(combination_list,tk,30)
features = zip(governor_seq,combination_seq)

match_seq = np.array(match) #to_categorical(np.array(match))
data = tf.data.Dataset.from_tensor_slices(((governor_seq,combination_seq),match_seq)).shuffle(buffer_size=20).batch(1000)

train_ratio = .6
val_ratio = .2
test_ratio = .2

train_batches = int(len(data) * train_ratio)
val_batches = int(len(data) * val_ratio)
test_batches = int(len(data) * test_ratio)

train_data = data.take(train_batches)
test_data = data.skip(train_batches)
val_data = test_data.take(val_batches)
test_data = test_data.skip(test_batches)

settings = InnerModelSettings(
    input_embedding=129,
    n_embedding_dims = 64,
    n_gru = 20,
    n_dense = 40,
    n_units_attention=10
)

model = OuterModel(settings)

model.compile(
    loss= tf.losses.BinaryCrossentropy(), #  contrastive_loss#tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=['accuracy'],
)

model.fit(
    train_data,
    batch_size = 1000,
    epochs = 20,
    validation_data = val_data,
    verbose=1
)


129
Epoch 1/20


ValueError: in user code:

    C:\Users\grego\miniconda3\envs\nlp_tensor\lib\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    c:\Users\grego\Documents\kaggle\pep_names\char_level_rnn_with_attention.py:115 call  *
        return self.output_layer((self.cosine_similarity))
    C:\Users\grego\miniconda3\envs\nlp_tensor\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:1013 __call__  **
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\grego\miniconda3\envs\nlp_tensor\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:230 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer output_layer is incompatible with the layer: : expected min_ndim=2, found ndim=1. Full shape received: (None,)


In [None]:
model.summary()

In [5]:
def compare_representations(input_a, input_b, model, debug=False):
    outer_model = model
    outer_model((input_a.reshape(-1,len(input_a)),input_b.reshape(-1,len(input_b))))

    if debug:
        print(f"Representation of A: {outer_model.repr_a}")
        print(f"Representation of B: {outer_model.repr_b}")
        print(f"Similarity: {outer_model.cosine_similarity}")

    return outer_model.cosine_similarity, (outer_model.repr_a,outer_model.repr_a)

print(f"Comparing '{governors_list[0]}' and '{combination_list[0]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[0],
    model,
    True
)


Comparing 'a.p. lutali' and 'a.p. lutali'
Representation of A: [[ 1.2841e-02 -2.4019e-03 -2.4513e-02 -1.1199e-02  3.8881e-02 -2.6877e-02
  -8.0941e-03  3.2985e-02  2.0193e-03  1.6986e-02  9.4008e-03  4.7632e-02
  -1.6909e-02 -3.1705e-03 -4.7794e-02 -1.8177e-02  5.0319e-02 -3.9953e-02
  -2.3737e-05 -1.8465e-02 -2.8401e-02  2.7159e-02 -3.5942e-04 -8.5991e-03
   2.6769e-02 -2.5960e-02  2.1016e-02  3.5296e-02  9.6852e-03 -1.2874e-02
  -1.9340e-02  6.1357e-03 -1.3559e-02  1.8496e-02 -2.7873e-02  1.3812e-02
  -1.1559e-02  9.0414e-03 -1.6086e-02  9.6087e-03]]
Representation of B: [[ 1.2841e-02 -2.4019e-03 -2.4513e-02 -1.1199e-02  3.8881e-02 -2.6877e-02
  -8.0941e-03  3.2985e-02  2.0193e-03  1.6986e-02  9.4008e-03  4.7632e-02
  -1.6909e-02 -3.1705e-03 -4.7794e-02 -1.8177e-02  5.0319e-02 -3.9953e-02
  -2.3737e-05 -1.8465e-02 -2.8401e-02  2.7159e-02 -3.5942e-04 -8.5991e-03
   2.6769e-02 -2.5960e-02  2.1016e-02  3.5296e-02  9.6852e-03 -1.2874e-02
  -1.9340e-02  6.1357e-03 -1.3559e-02  1.8496e-02 

In [6]:

print(f"Comparing '{governors_list[0]}' and '{combination_list[1]}'")
similarity, representations = compare_representations(
    governor_seq[0],
    combination_seq[1],
    model,
    True
)


Comparing 'a.p. lutali' and 'lutali a.p.'
Representation of A: [[ 1.2841e-02 -2.4019e-03 -2.4513e-02 -1.1199e-02  3.8881e-02 -2.6877e-02
  -8.0941e-03  3.2985e-02  2.0193e-03  1.6986e-02  9.4008e-03  4.7632e-02
  -1.6909e-02 -3.1705e-03 -4.7794e-02 -1.8177e-02  5.0319e-02 -3.9953e-02
  -2.3737e-05 -1.8465e-02 -2.8401e-02  2.7159e-02 -3.5942e-04 -8.5991e-03
   2.6769e-02 -2.5960e-02  2.1016e-02  3.5296e-02  9.6852e-03 -1.2874e-02
  -1.9340e-02  6.1357e-03 -1.3559e-02  1.8496e-02 -2.7873e-02  1.3812e-02
  -1.1559e-02  9.0414e-03 -1.6086e-02  9.6087e-03]]
Representation of B: [[ 0.0082 -0.0043 -0.0215 -0.0178  0.0327 -0.026  -0.014   0.0338  0.0093
   0.0226  0.013   0.0527 -0.0151 -0.0016 -0.058  -0.0141  0.0525 -0.0451
  -0.0046 -0.0229 -0.0379  0.0278 -0.0075 -0.0081  0.0247 -0.0255  0.0191
   0.0381  0.0112 -0.016  -0.0185  0.0145 -0.0131  0.0126 -0.0236  0.019
  -0.0101  0.0144 -0.007   0.0106]]
Similarity: [0.9912]


In [None]:
def text_from_ids(ids,dict=tk.word_index):
    inv_dict = {v: k for k,v in tk.word_index.items()}
    char_list = []
    for id in ids:
        if id not in inv_dict:
            char = "_"
        else:
            char = inv_dict[id]
        char_list.append(char)

    return("".join(char_list))

print(text_from_ids(governor_seq[0]))
print(text_from_ids(combination_seq[1]))

print(governor_seq[0])
print(combination_seq[1])


In [7]:
my_test = ["Gregory Sharkov","Cheikh Hadrami"]
my_test_seq = preprocess_list(my_test, tk)

print(f"Comparing '{my_test[0]}' and '{my_test[1]}'")
similarity, representations = compare_representations(
    my_test_seq[0],
    my_test_seq[1],
    model,
    True
)


Comparing 'Gregory Sharkov' and 'Cheikh Hadrami'
Representation of A: [[ 0.0319 -0.0179  0.0348 -0.0226 -0.0409  0.018   0.0245  0.0276 -0.008
  -0.006  -0.0243 -0.0534 -0.0291 -0.0357  0.0613  0.0364 -0.0151  0.0339
  -0.025   0.0356 -0.007   0.001   0.0159  0.0181 -0.0327  0.0021  0.0175
  -0.0343  0.0308  0.0594  0.0176  0.0212 -0.0179  0.0305  0.0324 -0.0587
   0.0622 -0.0045 -0.0195 -0.0069]]
Representation of B: [[ 0.0396  0.0061  0.0363 -0.0475 -0.0662  0.0194  0.0015  0.0258  0.0328
  -0.0207 -0.0546 -0.0041 -0.0308 -0.0291  0.0073  0.0449  0.0222  0.0066
   0.0103 -0.0049 -0.0494  0.0081 -0.0286 -0.013  -0.0193 -0.0268  0.0269
  -0.0046  0.0199  0.0494  0.0304  0.0132 -0.0292 -0.0105  0.0225 -0.0246
   0.0263  0.0454  0.0591 -0.0156]]
Similarity: [0.7641]
