In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from preprocessor import Preprocessor
import random
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from itertools import combinations

In [38]:
train_preprocessor = Preprocessor()

raw_data = pd.read_csv("../../data/wikipedia.csv")
# clean out where text is NaN
raw_data = raw_data[raw_data.text.notna()]

In [39]:
x = []
y = []
for row in raw_data.text:
    # get words TODO only relevant words
    words = train_preprocessor.normalize_words(row)

    for (a, b) in combinations(words,2):
        x.append(a)
        y.append(b)

In [40]:
len(x)

173887

In [41]:
features = tf.constant(x, shape=(1,len(x),300))

labels = tf.constant(y, shape=(1,len(y),300))

#train_dataset = tf.data.Dataset.from_tensor_slices((features,labels))
features_dataset = tf.data.Dataset.from_tensor_slices(features)
labels_dataset = tf.data.Dataset.from_tensor_slices(labels)

train_dataset = tf.data.Dataset.zip((features_dataset, labels_dataset))

In [42]:
train_dataset

<ZipDataset shapes: ((173887, 300), (173887, 300)), types: (tf.float32, tf.float32)>

In [43]:
model = tf.keras.Sequential([
  tf.keras.layers.Dense(100, activation=tf.nn.relu, input_shape=(300,)),
  tf.keras.layers.Dense(100, activation=tf.nn.relu),
  tf.keras.layers.Dense(300)
])

In [44]:
loss_object = tf.keras.losses.CosineSimilarity(axis=1)

# def get_random_result(label, labels, predictions, predictions_cache):
#   label = label[0]
#   # TODO: Check if the index of the x value we are testing is not exactly the same as the one we are returning as y
#   if(label not in predictions_cache):
#     predictions_cache[label] = []
#     for idx2, value in enumerate(labels.numpy()):
#         if(value == label):
#             predictions_cache[label].append(predictions[idx2])
#   return predictions_cache[label][random.randint(0,len(predictions_cache[label])-1)]

# def get_new_y(predictions, labels):
#   predictions_cache = {}
#   new_y = []
#   for idx, val in enumerate(predictions.numpy()):
#       label = labels[idx].numpy()
#       new_y.append(get_random_result(label, labels, predictions, predictions_cache))
#   return new_y

def loss(model, x, y, training):
  # training=training is needed only if there are layers with different
  # behavior during training versus inference (e.g. Dropout).
  y_ = model(x, training=training)
  #new_y = get_new_y(y_, y)

  return loss_object(y_true=y, y_pred=y_)

def grad(model, inputs, targets):
  with tf.GradientTape() as tape:
    loss_value = loss(model, inputs, targets, training=True)
  return loss_value, tape.gradient(loss_value, model.trainable_variables)

In [45]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

In [46]:
## Note: Rerunning this cell uses the same model variables

# Keep results for plotting
train_loss_results = []
train_accuracy_results = []
num_epochs = 10
import time
for epoch in range(num_epochs):
  start = time.time()
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_accuracy = tf.keras.metrics.MeanSquaredError()

  # Training loop - using batches of 32
  for x, y in train_dataset:
    # Optimize the model
    loss_value, grads = grad(model, x, y)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Track progress
    epoch_loss_avg.update_state(loss_value)  # Add current batch loss
    # Compare predicted label to actual label
    # training=True is needed only if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    predictions = model(x, training=True)
    #new_y = get_new_y(predictions, y)
    epoch_accuracy.update_state(y, predictions)

  # End epoch
  train_loss_results.append(epoch_loss_avg.result())
  train_accuracy_results.append(epoch_accuracy.result())
  end = time.time()
  print("Epoch {:03d}: Loss: {:.3f}, MAE: {:.3}, time: {:.8f}".format(epoch,epoch_loss_avg.result(),epoch_accuracy.result(), end - start))

Epoch 000: Loss: 0.026, MAE: 16.1, time: 1.08056688
Epoch 001: Loss: -0.402, MAE: 16.1, time: 1.16111970
Epoch 002: Loss: -0.402, MAE: 16.1, time: 1.10844302
Epoch 003: Loss: -0.402, MAE: 16.1, time: 1.10486031
Epoch 004: Loss: -0.402, MAE: 16.1, time: 1.15902019
Epoch 005: Loss: -0.402, MAE: 16.1, time: 1.11040854
Epoch 006: Loss: -0.402, MAE: 16.1, time: 1.11214757
Epoch 007: Loss: -0.402, MAE: 16.1, time: 1.12787151
Epoch 008: Loss: -0.402, MAE: 16.1, time: 1.11399674
Epoch 009: Loss: -0.402, MAE: 16.1, time: 1.11921167
Epoch 010: Loss: -0.402, MAE: 16.1, time: 1.16570091
Epoch 011: Loss: -0.402, MAE: 16.1, time: 1.12133241
Epoch 012: Loss: -0.402, MAE: 16.1, time: 1.16167116
Epoch 013: Loss: -0.402, MAE: 16.1, time: 1.11629415
Epoch 014: Loss: -0.402, MAE: 16.1, time: 1.15332174
Epoch 015: Loss: -0.402, MAE: 16.1, time: 1.11204314
Epoch 016: Loss: -0.402, MAE: 16.1, time: 1.16344309
Epoch 017: Loss: -0.402, MAE: 16.1, time: 1.10709715
Epoch 018: Loss: -0.402, MAE: 16.1, time: 1.150

KeyboardInterrupt: 

In [11]:
a = train_preprocessor.vectorize("Brot")
b = train_preprocessor.vectorize("häkeln")
c = train_preprocessor.vectorize("Mehl")
d = train_preprocessor.vectorize("Luftmasche")

trained = model(tf.constant([a,b,c,d]), training=False)
diff1 = np.linalg.norm(trained[0]-trained[2])
diff2 = np.linalg.norm(trained[1]-trained[3])
diff3 = np.linalg.norm(trained[0]-trained[3])
diff4 = np.linalg.norm(trained[1]-trained[2])
print("Should be low")
print(diff1)
print(diff2)
print("Should be higher")
print(diff3)
print(diff4)

diff1 = np.linalg.norm(a-c)
diff2 = np.linalg.norm(b-d)
diff3 = np.linalg.norm(a-d)
diff4 = np.linalg.norm(b-c)
print("Should be low")
print(diff1)
print(diff2)
print("Should be higher")
print(diff3)
print(diff4)

Should be low
7.693727
8.411027
Should be higher
10.316896
7.7869754
Should be low
31.130438
32.286346
Should be higher
43.48296
46.15848


In [28]:
import importlib
import preprocessor
importlib.reload(preprocessor)
train_preprocessor = preprocessor.Preprocessor()
def test(text):
    word_vectors = train_preprocessor.normalize_words(text)
    predicted_vectors = model(tf.constant(word_vectors), training=False)
    return np.average(predicted_vectors, axis=0)

# a = "Brot (ahd. prôt, von urgerm. *brauda-) ist ein traditionelles Nahrungsmittel, das aus einem Teig aus gemahlenem Getreide (Mehl), Wasser, einem Triebmittel und meist weiteren Zutaten gebacken wird. Brot zählt zu den Grundnahrungsmitteln."
# b = "Eine enge Gangabstufung ist auch für Transportarbeiten günstig, da das Verhältnis von Leistung und Gesamtgewicht des Zuges bei Traktoren häufig geringer ist als bei Lastkraftwagen."
a = "Getreide"
b = "Luftmasche"
query = "Brot"
a = test(a)
b = test(b)
query = test(query)


diff1 = np.linalg.norm(query - a)
diff2 = np.linalg.norm(query - b)
diff3 = np.linalg.norm(a - b)

print(diff1)
print(diff2)
print(diff3)

4.796025
4.8885045
3.2877345


In [35]:
from sklearn.metrics.pairwise import cosine_similarity
df1 = [[1,0,1]]
df2 = [[1,0,0]]
cosine_loss = tf.keras.losses.CosineSimilarity(axis=1)
print(cosine_loss(df1,df2))

[[0.70710678]]
[0 0 1]
1.0
