In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from preprocessor import Preprocessor
import random
import os
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
# Preprocessor instance used below to turn each document into a vector.
train_preprocessor = Preprocessor()

# Read the CSV and discard every row whose `text` column is NaN.
data = pd.read_csv("../../data/wikipedia.csv").dropna(subset=["text"])

In [3]:
# Vectorize every document and collect the results into a single array,
# one entry per row of `data`.
doc_vectors = np.array(list(map(train_preprocessor.vectorize, data.text)))
print(doc_vectors.shape)

(359, 300)


In [48]:
# Wrap the whole corpus as ONE dataset element: from_tensor_slices slices along
# the first axis, so a leading axis of size 1 yields a single element holding
# every sample (i.e. full-batch training).
# The shapes are derived from the data rather than hard-coded, so the cell keeps
# working when the number of documents or the vector size changes.
features = tf.constant(doc_vectors[np.newaxis, ...])           # (1, n_docs, n_dims)
labels = tf.constant(data.label.to_numpy().reshape(1, -1, 1))  # (1, n_docs, 1)

#train_dataset = tf.data.Dataset.from_tensor_slices((features,labels))
features_dataset = tf.data.Dataset.from_tensor_slices(features)
labels_dataset = tf.data.Dataset.from_tensor_slices(labels)

# Pair each features element with the labels element at the same position.
train_dataset = tf.data.Dataset.zip((features_dataset, labels_dataset))

In [4]:
# X_train, X_test, y_train, y_test = train_test_split(doc_vectors, data.label, test_size=0.1, random_state=1)

In [41]:
# Small fully-connected network: 300 inputs -> 10 -> 10 -> 3 outputs.
# The last layer has no activation, so the outputs are unbounded.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(10, activation=tf.nn.relu, input_shape=(300,)))  # input shape required
model.add(tf.keras.layers.Dense(10, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(3))

In [6]:
# Mean squared error; loss() below applies it between the model's predictions
# and the targets produced by get_new_y().
loss_object = tf.keras.losses.MeanSquaredError()

def get_new_y(predictions, labels):
  """Pick, for every sample, the prediction of a random same-label partner.

  For sample ``i`` the candidates are all other samples ``j != i`` whose label
  equals the label of ``i``. One candidate is drawn with ``random.randint`` and
  its prediction becomes the target for sample ``i``.

  Args:
    predictions: model outputs, one row per sample. Must support ``.numpy()``
      and integer indexing.
    labels: labels aligned with ``predictions``. Must support ``.numpy()``.

  Returns:
    A list with one entry per sample: ``predictions[j]`` for the drawn partner
    ``j``. A sample whose label occurs only once has no partner and is paired
    with its own prediction instead (previously this case raised ``ValueError``
    from ``random.randint(0, -1)``).
  """
  label_values = labels.numpy()  # converted once instead of once per sample
  num_samples = len(predictions.numpy())

  new_y = []
  for idx in range(num_samples):
    label = label_values[idx]
    candidates = [
        other_idx
        for other_idx, other_label in enumerate(label_values)
        if other_idx != idx and other_label == label
    ]
    if candidates:
      partner = candidates[random.randint(0, len(candidates) - 1)]
    else:
      # No other sample shares this label: fall back to the sample itself so
      # the batch can still be processed.
      partner = idx
    new_y.append(predictions[partner])
  return new_y

def loss(model, x, y, training):
  """Compute the loss of `model` on inputs `x` with labels `y`.

  The model is run on `x`, get_new_y() turns the resulting predictions and the
  labels into targets, and `loss_object` is evaluated between the two.

  `training` is forwarded to the model call; it only matters for layers that
  behave differently in training and inference (e.g. Dropout).
  """
  predictions = model(x, training=training)
  targets = get_new_y(predictions, y)
  return loss_object(y_true=targets, y_pred=predictions)

def grad(model, inputs, targets):
  """Return `(loss_value, gradients)` for one training step.

  The loss is evaluated with `training=True` while a GradientTape records the
  computation; the gradients are taken with respect to
  `model.trainable_variables`, in that order.
  """
  with tf.GradientTape() as tape:
    loss_value = loss(model, inputs, targets, training=True)
  gradients = tape.gradient(loss_value, model.trainable_variables)
  return loss_value, gradients

In [7]:
# Stochastic gradient descent with a learning rate of 0.01; the training loop
# uses it to apply the gradients returned by grad().
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

In [65]:
## Note: Rerunning this cell uses the same model variables
## (the model is not re-created here, so training continues from its current weights).

# Keep per-epoch results for plotting
train_loss_results = []
train_accuracy_results = []  # despite the name, this holds mean absolute error values
num_epochs = 10

for epoch in range(num_epochs):
  # Fresh metric accumulators for every epoch.
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_accuracy = tf.keras.metrics.MeanAbsoluteError()

  # Training loop. NOTE: train_dataset as built earlier in this notebook has a
  # leading axis of size 1, so it yields a single element holding all samples;
  # this body therefore runs once per epoch (full batch), not in batches of 32.
  for x, y in train_dataset:
    # Optimize the model
    loss_value, grads = grad(model, x, y)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Track progress
    epoch_loss_avg.update_state(loss_value)  # Add current batch loss
    # Compare the predictions with targets from get_new_y (predictions of
    # randomly drawn same-label samples). The targets are drawn again here, so
    # they may differ from the ones used for loss_value above, and the
    # predictions come from the model after the gradient step.
    # training=True is needed only if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    predictions = model(x, training=True)
    new_y = get_new_y(predictions, y)
    epoch_accuracy.update_state(new_y, predictions)

  # End epoch
  train_loss_results.append(epoch_loss_avg.result())
  train_accuracy_results.append(epoch_accuracy.result())

  print("Epoch {:03d}: Loss: {:.3f}, MAE: {:.3}".format(epoch,epoch_loss_avg.result(),epoch_accuracy.result()))

(359, 300)
Epoch 000: Loss: 0.110, MAE: 0.22
(359, 300)
Epoch 001: Loss: 0.102, MAE: 0.217
(359, 300)
Epoch 002: Loss: 0.090, MAE: 0.215
(359, 300)
Epoch 003: Loss: 0.092, MAE: 0.196
(359, 300)
Epoch 004: Loss: 0.094, MAE: 0.196
(359, 300)
Epoch 005: Loss: 0.085, MAE: 0.189
(359, 300)
Epoch 006: Loss: 0.074, MAE: 0.175
(359, 300)
Epoch 007: Loss: 0.070, MAE: 0.194
(359, 300)
Epoch 008: Loss: 0.067, MAE: 0.179
(359, 300)
Epoch 009: Loss: 0.072, MAE: 0.171
