<a href="https://colab.research.google.com/github/emilstahl97/Scalable-Machine-Learning-and-Deep-Learning-ID2223/blob/notebooks/lab2_id2223.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install sentence_transformers
!pip install transformers
!pip install tokenizers
!pip install torch
!pip install wget
!pip install pyspark



In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import os
import re
import csv
import wget
import json
import math
import scipy
import torch
import string
import sklearn

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

from sentence_transformers import SentenceTransformer
from sentence_transformers import LoggingHandler
from sentence_transformers import models, losses, util
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

from transformers import BertTokenizer, TFBertModel, BertConfig
from tokenizers import BertWordPieceTokenizer

from torch.utils.data import DataLoader

from datetime import datetime

**Mount Google Drive to load saved models**

In [24]:
# README - Execute this cell to mount the notebook in your google drive. 
# Execute the cell and follow the link to sign and, paste the given key in the little text box. The credentials are only available for you. 

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

if not os.path.exists("/content/drive/MyDrive/stsbenchmark"): 
  os.mkdir("/content/drive/MyDrive/stsbenchmark")
  os.chdir("/content/drive/MyDrive/stsbenchmark")
  !git clone https://github.com/emilstahl97/stsbenchmark.git
  !git pull
else:
  print("Dataset exists")
  os.chdir("/content/drive/MyDrive/stsbenchmark")

train_path = "/content/drive/MyDrive/stsbenchmark/stsbenchmark/sts-train.csv"
test_path = "/content/drive/MyDrive/stsbenchmark/stsbenchmark/sts-test.csv"
dev_path = "/content/drive/MyDrive/stsbenchmark/stsbenchmark/sts-dev.csv"


Mounted at /content/drive
Dataset exists


## **REGRESSION**

In [25]:
model_name = 'bert-base-uncased'
word_embedding_model = models.Transformer(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
columns = ['title', 'type', 'year', 'id', 'score', 'sentence_1', 'sentence_2']

In [27]:
print('Loading train dataset ...')

os.path.isfile(train_path)

train_samples = []

with open(train_path, newline='') as train:
    df_train = csv.DictReader(train, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_train:
        score = float(row['score']) / 2.5 - 1 # range -1 ... 1
        input_example = InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score)
        train_samples.append(input_example)

Loading train dataset ...


In [28]:
print('Loading test dataset ...')

os.path.isfile(test_path)

test_samples = []

with open(test_path, newline='') as test:
    df_test = csv.DictReader(test, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_test:
        score = float(row['score']) / 2.5 - 1 # range -1 ... 1
        input_example = InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score)
        test_samples.append(input_example)

Loading test dataset ...


In [29]:
print('Loading evaluation dataset ...')

os.path.isfile(dev_path)

dev_samples = []

with open(dev_path, newline='') as dev:
    df_dev = csv.DictReader(dev, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in df_dev:
        score = float(row['score']) / 2.5 - 1 # range -1 ... 1
        input_example = InputExample(texts=[row['sentence_1'], row['sentence_2']], label=score)
        dev_samples.append(input_example)

Loading evaluation dataset ...


Considering the given paper "*Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks*"

In [30]:
train_batch_size = 16
# train_batch_size = 32 # try to speed up the training

learn_rate = 2e-5
num_epochs = 1

Mean-pooling strategy

In [31]:
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

Define the model

In [32]:
# custom model using mean pooling of the word embeddings given as input
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Load the training set and define the loss function as the cosine similarity

In [33]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

Define the evaluator for the sentence embeddings

In [34]:
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev')

10% of train dataset for warm-up

In [35]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

**Training**

In [36]:
save_path = './training_sts_reg_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [37]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
            optimizer_class=torch.optim.Adam,
            optimizer_params={'lr': learn_rate},
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=1000,
            warmup_steps=warmup_steps,
            output_path=save_path)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

**Evaluation on STS benchmark dataset**

Mathematical relationship: *cosine_similarity = 1 - cosine_distance*

In [38]:
print('Loading the stored model ...')
model = SentenceTransformer(save_path)

Loading the stored model ...


In [39]:
test_eval = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test')
c_s = test_eval(model, output_path=save_path)
print('Cosine similarity with the sentence_transformers library = ', c_s)

# sometimes the result is between 0.7 and 0.8

Cosine similarity with the sentence_transformers library =  0.5534566335811169


Embedding sentences

In [40]:
df_test = pd.read_csv(test_path, sep='\t', header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE)
df_test.columns = columns

b'Skipping line 626: expected 7 fields, saw 9\nSkipping line 627: expected 7 fields, saw 9\nSkipping line 628: expected 7 fields, saw 9\nSkipping line 629: expected 7 fields, saw 9\nSkipping line 630: expected 7 fields, saw 9\nSkipping line 631: expected 7 fields, saw 9\nSkipping line 632: expected 7 fields, saw 9\nSkipping line 633: expected 7 fields, saw 9\nSkipping line 634: expected 7 fields, saw 9\nSkipping line 635: expected 7 fields, saw 9\nSkipping line 636: expected 7 fields, saw 9\nSkipping line 637: expected 7 fields, saw 9\nSkipping line 638: expected 7 fields, saw 9\nSkipping line 639: expected 7 fields, saw 9\nSkipping line 640: expected 7 fields, saw 9\nSkipping line 641: expected 7 fields, saw 9\nSkipping line 642: expected 7 fields, saw 9\nSkipping line 643: expected 7 fields, saw 9\nSkipping line 644: expected 7 fields, saw 9\nSkipping line 645: expected 7 fields, saw 9\nSkipping line 646: expected 7 fields, saw 9\nSkipping line 647: expected 7 fields, saw 9\nSkipping

In [41]:
embed_1 = model.encode(df_test['sentence_1'], convert_to_numpy=True, batch_size=train_batch_size)
embed_2 = model.encode(df_test['sentence_2'], convert_to_numpy=True, batch_size=train_batch_size)

Compute the cosine similarity

In [42]:
cos_sim = 1 - sklearn.metrics.pairwise.paired_cosine_distances(embed_1, embed_2)
print('Cosine similarity = ', cos_sim)

Cosine similarity =  [0.37169957 0.9172679  0.6110909  ... 0.7942807  0.87538934 0.93065506]


Spearmean correlation coefficient

In [43]:
spr_corr = scipy.stats.spearmanr(cos_sim, df_test['score'])
print('Spearmean correlation coefficient = ', spr_corr[0])

Spearmean correlation coefficient =  0.5707844359183518


**Comment:** the two results match each other