In [1]:
!pip install tensorflow-addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import nltk
import collections
import re
import math
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential, layers, losses
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
from scipy import spatial
import matplotlib.pyplot as plt

## Create pairs definition

In [4]:
def create_pairs(x, y):

  pairs_list = []
  labels_list = []

  for i in range(len(x)):
    for j in range(len(x)):
      if i == j:
        continue;
      if j < i:
        continue;
      pairs_list.append((x[i], x[j]))
      labels_list.append((y[i][0], y[j][0]))

  return np.array(pairs_list), np.array(labels_list)

## BERT Import


In [5]:
!pip install spacy-sentence-bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import spacy_sentence_bert
import spacy

In [7]:
nlp = spacy_sentence_bert.load_model('en_stsb_roberta_large')

## Contrastive loss definition - contrastive + softmax


In [8]:
# https://keras.io/examples/vision/siamese_contrastive/
# Provided two tensors t1 and t2
# Euclidean distance = sqrt(sum(square(t1-t2)))
def euclidean_distance(vects):
    """Find the Euclidean distance between two vectors.

    Arguments:
        vects: List containing two tensors of same length.

    Returns:
        Tensor containing euclidean distance
        (as floating point value) between vectors.
    """

    x, y = vects
    x = tf.math.l2_normalize(x)
    y = tf.math.l2_normalize(y)
    sum_square = tf.math.reduce_sum(tf.math.square(x - y), axis=1, keepdims=True)
    output = tf.math.sqrt(tf.math.maximum(sum_square, tf.keras.backend.epsilon()))
    return output

def cosine_sim(vects):
    x, y = vects
    x = tf.math.l2_normalize(x)
    y = tf.math.l2_normalize(y)

    return tf.multiply(x,y)

def loss(theta = 0.05, margin=0.5):
    """Provides 'constrastive_loss' an enclosing scope with variable 'margin'.

    Arguments:
        margin: Integer, defines the baseline for distance for which pairs
                should be classified as dissimilar. - (default is 1).

    Returns:
        'constrastive_loss' function with data ('margin') attached.
    """

    # Contrastive loss = mean( (1-true_value) * square(prediction) +
    #                         true_value * square( max(margin-prediction, 0) ))
    def contrastive_loss(y_true, y_pred):

        """Calculates the constrastive loss.

        Arguments:
            y_true: List of labels, each label is of type float32.
            y_pred: List of predictions of same length as of y_true,
                    each label is of type float32.

        Returns:
            A tensor containing constrastive loss as floating point value.
        """

        if (np.sum(y_true.numpy()) == len(y_true.numpy())) | (np.sum(y_true.numpy()) == 0):
          print("batch with only positives or negatives!!")
          square_pred = tf.math.square(y_pred)
          margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
          return tf.math.reduce_mean(
              (1 - y_true) * square_pred + (y_true) * margin_square
          )
        else:
          exp_vec = tf.math.exp(y_pred/theta)
          loss = tf.constant(-tf.math.log(tf.math.reduce_sum(exp_vec*y_true)/tf.math.reduce_sum(exp_vec*(1-y_true))))
          return loss

        # Previous contrastive loss
        # square_pred = tf.math.square(y_pred)
        # margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
        # loss = tf.math.reduce_mean(
        #       (1 - y_true) * square_pred + (y_true) * margin_square
        # )
        # return loss


    return contrastive_loss


## Perform constrastive learning


In [12]:
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.metrics import f1_score

df = pd.read_csv("/content/drive/MyDrive/NLP_Projekt/IMDB Dataset.csv").iloc[:200,:]
y = np.asarray(df.iloc[:,1])
y[y=="positive"] = 1
y[y=="negative"] = 0
y = y.reshape(-1,1).astype('float32')
X = np.array([nlp(df.iloc[i,0]).vector for i in range(df.shape[0])])

skf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=123)
skf.get_n_splits(X, y)

accuracy_list = []
f1_score_list = []

for train_index, test_index in skf.split(X, y):

  # Select random subsamples
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  X_train = np.array(X_train)
  X_test = np.array(X_test)
  X_train.shape

  df2 = pd.read_csv("/content/drive/MyDrive/NLP_Projekt/IMDB Dataset.csv").iloc[200:9200,:]
  y_test = np.asarray(df2.iloc[:,1])
  y_test[y_test=="positive"] = 1
  y_test[y_test=="negative"] = 0
  y_test = y_test.reshape(-1,1).astype('float32')
  X_test = np.array([nlp(df2.iloc[i,0]).vector for i in range(df2.shape[0])])
  X_test = np.array(X_test)



  # create pairs

  x_train_pairs, y_train_labels = create_pairs(X_train, y_train)
  # x_test_pairs, y_test_labels = create_pairs(X_test, y_test)
  # if labels are the same -> label 0
  # if labels are not the same -> label 1
  y_train_labels = np.where(y_train_labels[:,0] == y_train_labels[:,1], 0, 1)

  # https://keras.io/examples/vision/siamese_contrastive/
  # Provided two tensors t1 and t2
  # Euclidean distance = sqrt(sum(square(t1-t2)))

  len_bag = x_train_pairs.shape[2]

  input = layers.Input((len_bag,))
  x = tf.keras.layers.BatchNormalization()(input)
  x = layers.Dense(2048)(x)
  x = layers.Dropout(0.3)(x)
  x = layers.Dense(1024)(x)
  x = tf.keras.layers.BatchNormalization()(x)
  x = layers.Dropout(0.3)(x)
  x = layers.Dense(512)(x)
  x = tf.keras.layers.BatchNormalization()(x)
  x = layers.Dropout(0.3)(x)
  x = layers.Dense(256)(x)
  x = tf.keras.layers.BatchNormalization()(x)
  x = layers.Dropout(0.3)(x)
  x = layers.Dense(128)(x)
  x = tf.keras.layers.BatchNormalization()(x)
  x = layers.Dropout(0.3)(x)
  x = layers.Dense(32)(x)

  embedding_network = tf.keras.Model(input, x)


  input_1 = layers.Input((len_bag,))
  input_2 = layers.Input((len_bag,))

  # As mentioned above, Siamese Network share weights between
  # tower networks (sister networks). To allow this, we will use
  # same embedding network for both tower networks.
  tower_1 = embedding_network(input_1)
  tower_2 = embedding_network(input_2)

  merge_layer = layers.Lambda(cosine_sim)([tower_1, tower_2])
  normal_layer = tf.keras.layers.BatchNormalization()(merge_layer)
  output_layer = layers.Dense(1, activation="sigmoid")(normal_layer)
  siamese = tf.keras.Model(inputs=[input_1, input_2], outputs=output_layer)

  siamese.compile(loss=loss(), optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), metrics=["accuracy"], run_eagerly=True)
  siamese.summary()

  x_train_1 = x_train_pairs[:,0,:]
  x_train_2 = x_train_pairs[:,1,:]

  # x_test_1 = x_test_pairs[:,0,:]
  # x_test_2 = x_test_pairs[:,1,:]

  x_train_1 = tf.convert_to_tensor(x_train_1)
  x_train_2 = tf.convert_to_tensor(x_train_2)

  # x_test_1 = tf.convert_to_tensor(x_test_1)
  # x_test_2 = tf.convert_to_tensor(x_test_2)

  y_train_labels = tf.convert_to_tensor(y_train_labels)
  # y_test_labels = tf.convert_to_tensor(y_test_labels)


  y_train_labels = tf.cast(y_train_labels, tf.float32)
  # y_test_labels = tf.cast(y_test_labels, tf.float32)

  # x_test_1 = tf.cast(x_test_1, tf.float32)
  # x_test_2 = tf.cast(x_test_2, tf.float32)

  x_train_1 = tf.cast(x_train_1, tf.float32)
  x_train_2 = tf.cast(x_train_2, tf.float32)


  history = siamese.fit(
      [x_train_1, x_train_2],
      y_train_labels,
      batch_size=32,
      epochs=5
  )

  # a = siamese.predict([x_test_1, x_test_2])
  # results = pd.DataFrame(a)
  # results["true"] = y_test_labels[:,1]
  # results.iloc[:50,:]

  # plt.figure(figsize=(8,10))
  # plt.hist(results[results["true"] == 0][0], color="orange", alpha = 0.5)
  # plt.hist(results[results["true"] == 1][0], color="skyblue", alpha = 0.5)
  # plt.legend(["class 0", "class 1"])
  # plt.title("Probability vs actual")
  # plt.show()

  preds = []

  for i in range(len(X_test)):
      pairs_list = []

      for j in range(len(X_train)):
          pairs_list.append((X_test[i], X_train[j]))

      pairs_list = np.array(pairs_list)

      b = siamese.predict([pairs_list[:,0,:], pairs_list[:,1,:]])
      preds.append(int(b[np.where(y_train.flatten() == 0)].mean() > b[np.where(y_train.flatten() == 1)].mean()))

  accuracy_list.append(np.mean(y_test.flatten()==preds))
  print("Accuracy:")
  print(np.mean(y_test.flatten()==preds))
  f1_score_list.append(f1_score(y_test.flatten(), preds))


[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
Accuracy:
0.7752222222222223


In [13]:
accuracy_list

[0.7792222222222223,
 0.7863333333333333,
 0.7692222222222223,
 0.7752222222222223]

In [15]:
f1_score_list

[0.7858605453173834, 0.7881925322172045, 0.7665505226480837, 0.788012155506654]