This notebook is meant to be run on Google Colab.

In this notebook, we performed:

* Fine-tuning a BERT-like model (from the HuggingFace library) on the WDC dataset - in the case of Project \#2, the `xlm-roberta-base` model and the `Computers medium` dataset.
* Computing the sentence embeddings for both the pretrained and fine-tuned models
* Computing their similarity for both the pretrained and fine-tuned models
* Saving embeddings and their similarity in files

***

Environment set-up

In [None]:
# Uncomment on a runtime where the sentence-transformers package is not installed yet.
# !pip install -U sentence-transformers

In [None]:
# Uncomment to mount Google Drive on Colab; the model, embedding and similarity
# output paths used later in this notebook all live under /content/drive/MyDrive.
#from google.colab import drive
#drive.mount('/content/drive/', force_remount=True)

Mounted at /content/gdrive/


Imports

In [None]:
import logging
import datetime
import math

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Model fine-tuning
Fine-tuning a selected bi-encoder model on the WDC dataset to obtain better-quality embeddings


In [None]:
# Imports of self-defined modules.
# The project's `source` package is resolved through the parent of the current
# working directory, so that directory is placed on sys.path before importing.
import os
import sys

current_dir = os.path.abspath('')  # abspath('') resolves to the current working directory
parent_dir = os.path.dirname(current_dir)
# Guarded append: re-running this cell must not pile up duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)


from source.load_data.wdc.load_wdc_dataset import get_wdc_dataset

In [None]:
# Run configuration: which WDC dataset variant to load and how long to train.
dataset_type, dataset_size = "computers", "medium"
train_batch_size = 16
num_epochs = 80  # since xlm-roberta has more parameters

In [None]:
# Base checkpoint to fine-tune, plus a timestamped Drive folder for the training output.
model_name = 'xlm-roberta-base'
safe_model_name = model_name.replace("/", "-")  # a '/' in the name would add a path level
run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = (
    f'/content/drive/MyDrive/NLP/output/training_wdc_{dataset_type}_{dataset_size}_'
    f'{safe_model_name}-{run_timestamp}'
)

In [None]:
# Print debug information to stdout: INFO-level records, timestamped,
# emitted through sentence-transformers' LoggingHandler.
log_settings = {
    'format': '%(asctime)s - %(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
    'level': logging.INFO,
    'handlers': [LoggingHandler()],
}
logging.basicConfig(**log_settings)

In [None]:
# Token-level encoder: a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R)
# that maps tokens to embeddings.
word_embedding_model = models.Transformer(model_name)

# Mean pooling over the token embeddings yields one fixed-size sentence vector;
# the CLS-token and max-pooling modes are switched off explicitly.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Chain encoder and pooling into a single sentence-embedding model.
sentence_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=sentence_modules)

In [None]:
# Training split (is_train=True), requesting only the 'title' feature.
train_samples = get_wdc_dataset(
    dataset_type,
    dataset_size,
    is_train=True,
    features_to_concat=['title'],
)
train_dataloader = DataLoader(train_samples, batch_size=train_batch_size, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model=model)

# Non-training split (is_train=False), loaded the same way; it feeds the
# similarity evaluator that is handed to model.fit().
test_samples = get_wdc_dataset(
    dataset_type,
    dataset_size,
    is_train=False,
    features_to_concat=['title'],
)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='test_evaluation')

In [None]:
# Configure the training. The evaluator is passed to fit() together with
# evaluation_steps=1000, so evaluation is not skipped in this run.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)  # warm up over 10% of all training steps
logging.info(f"Warmup-steps: {warmup_steps}")

# Train the model; output_path points at the Google Drive folder chosen above.
fit_settings = dict(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)
model.fit(**fit_settings)

***

# Preparing and saving embedding results

In [None]:
# Imports of self-defined modules
from source.emb_prep_res.compute_save_emb import get_embedding_records, get_embeddings_pairs, create_csv_file
from source.emb_prep_res.compute_save_similiarity import compute_and_save_similarity_scores

In [None]:
# Timestamped output folder for this run's embedding CSV files.
# (Named `embeddings_dir` rather than `dir` so the builtin dir() is not shadowed.)
embeddings_dir = f'/content/drive/MyDrive/NLP/embeddings/{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'
embeddings_file_path_train = f'{embeddings_dir}/train_embeddings.csv'
embeddings_file_path_test = f'{embeddings_dir}/test_embeddings.csv'

# Column names of the embedding CSV files.
field_names = ['id', 'embedding']

# `model_save_path` is deliberately NOT rebuilt here. Recomputing it would stamp
# a fresh datetime.now() into the folder name, which can never match the folder
# that model.fit() wrote to, so the subsequent model load would point at a
# non-existent path. The value assigned before training is reused as is.
# To load a model from a previous session, assign its exact folder path instead.

Reading the fine-tuned model back from the path it was saved to:

In [None]:
# Rebind `model` to the model stored under model_save_path.
# NOTE(review): fit() was given both an evaluator and output_path, so the saved
# model is presumably the best-scoring checkpoint rather than the final-epoch
# weights — confirm against the sentence-transformers fit() documentation.
model = SentenceTransformer(model_save_path)

Getting embeddings

In [None]:
# get_embeddings_pairs returns two embedding collections per split (kept here
# as *_embeddings_1 and *_embeddings_2); both splits use a batch size of 16.
train_pair_embeddings = get_embeddings_pairs(train_samples, model, batch_size=16)
train_embeddings_1, train_embeddings_2 = train_pair_embeddings

test_pair_embeddings = get_embeddings_pairs(test_samples, model, batch_size=16)
test_embeddings_1, test_embeddings_2 = test_pair_embeddings

Saving embeddings for each offer

In [None]:
# Build the records that are written to the embedding CSV files, one call per split.
train_records = get_embedding_records(
    train_samples,
    train_embeddings_1,
    train_embeddings_2,
)
test_records = get_embedding_records(
    test_samples,
    test_embeddings_1,
    test_embeddings_2,
)

In [None]:
# Write one embeddings CSV per split, train first, then test.
csv_outputs = (
    (embeddings_file_path_train, train_records),
    (embeddings_file_path_test, test_records),
)
for csv_path, split_records in csv_outputs:
    create_csv_file(csv_path, field_names, split_records)

Computing and saving csv files with similarity measures for each pair

In [None]:
# Timestamped output folder for this run's similarity CSV files.
similarity_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
dir_sim = '/content/drive/MyDrive/NLP/similarity/' + similarity_timestamp
test_similarity_path = dir_sim + '/test_similarity.csv'
train_similarity_path = dir_sim + '/train_similarity.csv'

In [None]:
# Compute and save the similarity scores per split, train first, then test.
similarity_jobs = (
    (train_similarity_path, train_samples, train_embeddings_1, train_embeddings_2),
    (test_similarity_path, test_samples, test_embeddings_1, test_embeddings_2),
)
for out_path, split_samples, first_embeddings, second_embeddings in similarity_jobs:
    compute_and_save_similarity_scores(out_path, split_samples, first_embeddings, second_embeddings)