In [None]:
# imports
# import tf_keras as keras # in case you have errors of matching libraries versions of transformers and tensorflow you can try to uninstall keras and add this import to take the keras from tensorflow instead of the standalone keras
from transformers import pipeline, AutoTokenizer, AutoModel, DataCollatorWithPadding, BartTokenizer, BartForConditionalGeneration,T5ForConditionalGeneration, T5Tokenizer
from transformers import TrainingArguments, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, TrainerCallback,Seq2SeqTrainer
import torch
from datasets import load_dataset, load_from_disk
import evaluate
import numpy as np
from torch.nn.functional import conv1d
from scipy.optimize import linear_sum_assignment
from Utils.computeQASetValidationMetrics import compute_metrics

In [2]:
# Reproducibility: fix the random seed once, before any model is loaded or trained.
from transformers import set_seed
set_seed(42)

# Load the model, tokenizer and datasets

In [None]:
# Base checkpoint to fine-tune. bart-base was used for most runs because of memory constraints.
# NOTE(review): 't5-small' is mentioned as an alternative, but the two lines below use the
# Bart-specific classes - presumably they must be swapped for the T5 classes in that case; confirm.
model_ckpt = "facebook/bart-base" # 't5-small'
tokenizer = BartTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt)

In [None]:
# Default values for some of the parameters used by the custom loss function and by compute_metrics.
# They are stored as globals so that they are computed only once instead of on every call (performance).
# NOTE(review): the star import hides which names are actually brought into the notebook namespace;
# the statement order below (import -> set -> re-import) must be kept.
from Utils.defaultValues import *
set_global_default_values(tokenizer) # computes the default values once and stores them as global variables
from Utils.defaultValues import * # re-import so the notebook sees the global values that were just set

In [None]:
## This cell was used to download all the dataset files and save them locally in the ./datasets folder.
## Uncomment it to download the datasets again (the ./datasets folder must already exist).
# import requests
# import json

# base_url = "https://nlp.biu.ac.il/~ron.eliav/qasrl/V-passive_red/"

# # save the datasets to local files
# response = requests.get(base_url+"train.json")
# # print(response.json())
# save_train_file = open("./datasets/train.json", "w")
# json.dump(response.json(), save_train_file, indent = 0)
# save_train_file.close()

# response = requests.get(base_url+"dev.json")
# # print(response.json())
# save_dev_file = open("./datasets/dev.json", "w")
# json.dump(response.json(), save_dev_file, indent = 0)
# save_dev_file.close()

# response = requests.get(base_url+"test.json")
# # print(response.json())
# save_test_file = open("./datasets/test.json", "w")
# json.dump(response.json(), save_test_file, indent = 0)
# save_test_file.close()

In [None]:
# Check the custom loading script (datasets/QASrl.py) and create its info file (--save_infos)
# for all of its configurations (--all_configs).
# The loading script adjusts the dataset by combining each sentence and predicate into one input,
# so every example is the set of QAs for one sentence/predicate pair.
# NOTE(review): --trust_remote_code looks like a boolean switch, so the trailing `True` may be
# unnecessary - confirm with `datasets-cli test --help`.
!datasets-cli test datasets/QASrl.py --save_infos --all_configs --trust_remote_code True

In [5]:
# Load the datasets through the custom loading script.
# The script must live in a folder named `datasets`, next to the data files.
# trust_remote_code=True is passed because the loading script is local Python code that gets executed.
datasets =  load_dataset('./datasets/QASrl.py', trust_remote_code=True)

In [None]:
# show the repr of the loaded dataset object as the cell output
datasets

In [6]:
# Turn the raw examples into the text format the model is trained on.
prefix = '' # if needed for other models input
def format_input(examples):
    """Tokenize one batch of examples into model inputs and labels.

    Every (sentence, predicate) pair is rendered into a single prompt string and
    tokenized; the matching 'qa' text is tokenized as the target. The prompt is
    wrapped in typographic double quotes (U+201D) which are part of the text the
    model sees.

    Args:
        examples: batch with the keys 'sentence', 'predicate' and 'qa', each
            holding one entry per example.

    Returns:
        The tokenizer output for the prompts, extended with:
        'labels' (target token ids), 'format' (the prompt strings),
        'input_length' and 'label_length' (token counts per example).
    """
    prompts = []
    for sent, pred in zip(examples['sentence'], examples['predicate']):
        prompts.append(f'{prefix}”sentence: {sent}\npredicate: {pred}”')

    model_input = tokenizer(prompts)
    target_ids = tokenizer(text_target=examples['qa'])['input_ids']

    model_input['labels'] = target_ids
    model_input['format'] = prompts
    model_input['input_length'] = list(map(len, model_input['input_ids']))
    model_input['label_length'] = list(map(len, model_input['labels']))
    return model_input



In [None]:
# Apply format_input to the loaded datasets in batches (batched=True, so format_input receives
# lists of sentences / predicates / qa texts rather than single examples).
tokenized_datasets  = datasets.map(format_input, batched=True)

In [8]:
# Drop every column that is not needed for training.
# NOTE: this cell is not idempotent - running it a second time fails because the columns are already gone.
columns_to_drop = [
    # columns that were already present before format_input ran
    "id", "sentence_id", "sentence", "predicate", "qa",
    # helper columns added by format_input
    "format", "input_length", "label_length",
]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

# train the base model (with CrossEntropyLoss)

In [None]:
# Allow the CUDA allocator to work with expandable segments (useful when you hit memory
# allocation errors).
# The variable is set through os.environ instead of `%env VAR=value  # comment`: the %env magic
# takes everything after '=' as the value, so a trailing comment on the same line would become
# part of the environment variable.
# NOTE(review): presumably this has to run before the first CUDA allocation to take effect - confirm.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# Training size knobs, read by the Seq2SeqTrainingArguments cell below.
BATCH_SIZE = 4 # per-device batch size; change it according to the memory available
EPOCHS_NUM = 5 # number of epochs for every call to train(); change it according to how long you want to train

In [25]:
# Training configuration shared by every trainer created in this notebook.
training_args = Seq2SeqTrainingArguments(
    # checkpoint location and retention
    output_dir="trainer_logs",
    overwrite_output_dir=True,
    save_total_limit=2,
    # evaluate, save and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # amount of work
    num_train_epochs=EPOCHS_NUM,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # evaluation and device behaviour
    predict_with_generate=True,
    use_cpu=False,
    # notice: at the end of training the best model according to the evaluation set is the one kept
    load_best_model_at_end=True,
)

In [21]:
# Batch collator for seq2seq training: pads inputs and labels to the longest item of each batch.
# It keeps a reference to the `model` object that exists when this cell runs.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest') # also handles padding for inputs and labels

In [None]:
# Baseline trainer: the stock Seq2SeqTrainer on the tokenized train / validation splits.
trainer = Seq2SeqTrainer(
    # what is trained and how batches are assembled
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # configuration and evaluation metric
    args=training_args,
    compute_metrics=compute_metrics,
    # data
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# first training round: produces the base model
trainer.train()

In [None]:
# Make sure the target folder for the base model exists (no error if it is already there).
from pathlib import Path
Path('models_save/basic_model_base').mkdir(parents=True, exist_ok=True) # create the folder to save the model

In [28]:
# Save the model after the first training round; this checkpoint is reloaded later as the
# starting point for the custom-loss training.
trainer.save_model("models_save/basic_model_base")

# Continue training the base model with CrossEntropyLoss
This second round serves as the baseline for comparison with the model that continues training with the custom loss.

In [None]:
# second training round with the same trainer, kept for comparison
trainer.train()

In [None]:
# Create the target folder if needed and save the model after the second training round.
Path('models_save/basic_model_trained').mkdir(parents=True, exist_ok=True)
trainer.save_model("models_save/basic_model_trained")

# continue to train the model with the custom loss function

In [None]:
# Load the model from the saved base state, so the custom-loss training starts from the same
# weights the CrossEntropyLoss continuation started from.
model_ckpt = "models_save/basic_model_base"
tokenizer = BartTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt)
# Rebuild the collator for the objects that were just loaded: the collator created earlier in
# the notebook still references the previous model and tokenizer, not the ones trained from here on.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest')

In [None]:
# Values handed to QASetLossTrainer as lambda1 / lambda2.
LAMBDA1 = 0.8
LAMBDA2 = 0.2

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _separator_ids(marker):
    """Tokenize `marker`, drop its first and last id, print the rest and return it."""
    inner_ids = tokenizer(marker)['input_ids'][1:-1]  # without the start and end of line flags
    print(inner_ids)
    return inner_ids


# separator between consecutive QA pairs
QA_sep_tokens = _separator_ids(' <QA>')
qa_sep_tokens_tensor = torch.tensor(QA_sep_tokens).to(DEVICE)

# answer separator
A_sep_tokens = _separator_ids(' <A>')
a_sep_tokens_tensor = torch.tensor(A_sep_tokens).to(DEVICE)

# the question mark
q_sep_tokens = _separator_ids('?')
q_sep_tokens_tensor = torch.tensor(q_sep_tokens).to(DEVICE)

DEFAULT_PADDING_IDX = tokenizer.pad_token_id

In [None]:
from Utils.QASetLossTrainer import QASetLossTrainer

# Trainer with the custom loss: same model / data / configuration arguments as a regular
# trainer, plus the extra parameters QASetLossTrainer takes.
custom_trainer = QASetLossTrainer(
    # what is trained and how batches are assembled
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # configuration and evaluation metric
    args=training_args,
    compute_metrics=compute_metrics,
    # data
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # custom-loss parameters
    DEVICE=DEVICE,
    PADDING_IDX=DEFAULT_PADDING_IDX,
    lambda1=LAMBDA1,
    lambda2=LAMBDA2,
    QA_sep_tokens_tensor=qa_sep_tokens_tensor,
    A_sep_tokens_tensor=a_sep_tokens_tensor,
    Q_sep_tokens_tensor=q_sep_tokens_tensor,
)

In [None]:
# continue training from the base state with the custom loss, for comparison
custom_trainer.train()

In [None]:
# Create the target folder if needed and save the model that continued training with the custom loss.
Path('models_save/custom_model_trained').mkdir(parents=True, exist_ok=True)
custom_trainer.save_model("models_save/custom_model_trained")

# Train a fully custom model
That is, train the model with the custom loss function from the start, beginning from the pretrained `facebook/bart-base` checkpoint.

In [None]:
# Start again from the pretrained checkpoint so that all of the training uses the custom loss.
model_ckpt = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt)
# Rebuild the collator for the objects that were just loaded: the collator created earlier in
# the notebook still references a previous model and tokenizer, not the ones trained from here on.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest')

In [None]:
# Custom-loss trainer for the freshly loaded pretrained model.
custom_trainer = QASetLossTrainer(
    # what is trained and how batches are assembled
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # configuration and evaluation metric
    args=training_args,
    compute_metrics=compute_metrics,
    # data
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # custom-loss parameters
    DEVICE=DEVICE,
    PADDING_IDX=DEFAULT_PADDING_IDX,
    lambda1=LAMBDA1,
    lambda2=LAMBDA2,
    QA_sep_tokens_tensor=qa_sep_tokens_tensor,
    A_sep_tokens_tensor=a_sep_tokens_tensor,
    Q_sep_tokens_tensor=q_sep_tokens_tensor,
)

In [None]:
# first custom-loss training round: the result is the counterpart of basic_model_base
custom_trainer.train()

In [None]:
# second custom-loss training round: the result is the counterpart of basic_model_trained
custom_trainer.train()

In [None]:
# Create the target folder if needed and save the model trained with the custom loss in both rounds.
Path('models_save/custom_model').mkdir(parents=True, exist_ok=True)
custom_trainer.save_model("models_save/custom_model")