## Install Dependencies

In [None]:
!pip install datasets transformers rouge-score nltk git-lfs

In [None]:
import transformers

# Print the installed transformers version so the run can be reproduced later.
print(transformers.__version__)

In [None]:
# Name of the fine-tuned model. It is passed as the first positional argument
# of Seq2SeqTrainingArguments further down (the output directory).
# NOTE(review): with push_to_hub=True this is presumably also used as the Hub
# repository name - confirm before running.
fine_tuned_model = 'new-model-name'

## Training Data
#### Select the training data file that matches the fine-tuning objective in the cell below

In [None]:
# Training files keyed by fine-tuning objective.
#
# Previously every objective assigned `train_data` in turn, so the last
# assignment silently won regardless of which objective was intended. The
# mapping keeps all options visible while a single explicit selector decides
# which file is actually loaded.
TRAIN_DATA_FILES = {
    # --- Standard objectives ---
    'summarisation': '../datasets/train/abstracts_new_init_clean.csv',
    'paraphrasing': '../datasets/train/pubmed_paraphrased_batch_all.csv',
    'mask-filling': '../datasets/train/med_masked_pubmed_articles.csv',

    # --- Hybrid objectives ---
    'paraphrasing-to-masking': '../datasets/train/med_masked_pubmed_articles.csv',
    'masking-to-paraphrasing': '../datasets/train/pubmed_paraphrased_batch_all.csv',

    # --- Domain-specific objectives ---
    'med-mask-filling': '../datasets/train/med_masked_pubmed_articles.csv',
    'med-mask-filling-cm-25': '../datasets/train/conditionally_masked_med_terms_pubmed.csv',
    'med-mask-filling-cm-25*': '../datasets/train/conditionally_masked_med_terms_pubmed_without0.csv',
}

# Change this key to switch objective. The default reproduces the value that
# `train_data` ended up with when all assignments were executed in order.
training_objective = 'med-mask-filling-cm-25*'

# A KeyError here means `training_objective` is not one of the keys above.
train_data = TRAIN_DATA_FILES[training_objective]


In [None]:
from datasets import load_dataset

# Load the selected CSV file as a single split.
dataset = load_dataset('csv', data_files=train_data, split='train')

# Hold out 10% of the rows for validation. The seed is fixed so that the same
# rows end up in the validation set on every run; without it the split is
# reshuffled each time, which makes evaluation scores, early stopping and
# best-checkpoint selection incomparable between runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']

In [None]:
# `load_dataset` was already imported in an earlier cell; only `load_metric`
# is new here.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package - confirm the installed version
# still provides it.
from datasets import load_dataset, load_metric

# ROUGE scorer used by compute_metrics during evaluation.
metric = load_metric('rouge')

## Language Model (BART) Fine-tuning

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback

# Name (or local path) of the pre-trained checkpoint to fine-tune.
# 'model-name' is a placeholder and must be replaced before running.
model_checkpoint = 'model-name'

# Load the model weights and the matching tokenizer from the same checkpoint.
model = BartForConditionalGeneration.from_pretrained(model_checkpoint) 
tokenizer = BartTokenizer.from_pretrained(model_checkpoint) 

In [None]:
# Maximum number of tokens kept for the source and target texts; longer
# sequences are truncated by preprocess_function.
max_input_length = 512
max_target_length = 512

# String prepended to every source text before tokenization (empty = none).
prefix = ''

def preprocess_function(examples):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        examples: Mapping with a 'masked_highlights' column (source texts)
            and a 'highlights' column (target texts).

    Returns:
        The tokenizer output for the sources, extended with a 'labels' entry
        holding the token ids of the targets.
    """
    source_texts = examples['masked_highlights']
    target_texts = examples['highlights']

    # Sources: prepend the task prefix, then truncate to the input limit.
    prefixed_sources = [prefix + text for text in source_texts]
    encoded = tokenizer(
        prefixed_sources,
        truncation=True,
        max_length=max_input_length,
    )

    # Targets are tokenized in target mode and truncated to the target limit.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            target_texts,
            truncation=True,
            max_length=max_target_length,
        )

    encoded['labels'] = encoded_targets['input_ids']
    return encoded

In [None]:
# Sanity check: run the preprocessing on the first two training rows.
preprocess_function(train_dataset[:2])

In [None]:
# Tokenize both splits. batched=True hands preprocess_function a batch of
# rows per call instead of one row at a time.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

In [None]:
# Per-device batch size, used for both training and evaluation below.
batch_size = 16
# Last path segment of the checkpoint name.
# NOTE(review): not referenced anywhere in the visible cells - confirm it is
# still needed.
model_name = model_checkpoint.split('/')[-1]
args = Seq2SeqTrainingArguments(
    # Output directory for checkpoints (first positional argument).
    fine_tuned_model,
    # Evaluation runs every `eval_steps` steps (see below) rather than once
    # per epoch.
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most three checkpoints on disk.
    save_total_limit=3,
    num_train_epochs=10,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # NOTE(review): mixed precision presumably requires a CUDA GPU - confirm
    # the runtime before enabling.
    fp16=True,
    push_to_hub=True,
    evaluation_strategy = 'steps',
    eval_steps = 500,
    # NOTE(review): the save strategy/interval is left at its default here;
    # it presumably has to line up with the evaluation interval for
    # best-model loading to work - confirm against the library version in use.
    load_best_model_at_end=True,
)

In [None]:
# Collator that batches the tokenized examples for the trainer; it is given
# both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
import numpy as np

def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for an evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token
            ids) and ``label_ids`` (reference token ids).

    Returns:
        dict with the keys 'rouge2_precision', 'rouge2_recall' and
        'rouge2_fmeasure', each rounded to four decimal places (taken from
        the ``mid`` aggregate of the ROUGE-2 score).
    """
    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    # Some model outputs come back as a tuple whose first element holds the
    # token ids; unwrap it so the array operations below apply.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 marks positions to ignore and is not a valid token id, so it must
    # be replaced by the pad token before decoding. Predictions can carry the
    # same marker when batches of different length are gathered, so both
    # arrays are treated alike. np.where builds new arrays, which leaves the
    # arrays owned by `pred` untouched (the previous in-place assignment
    # modified `pred.label_ids`).
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    # `truncation` / `padding` are encoding options and have no meaning when
    # decoding, so they are no longer passed here.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = metric.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=['rouge2'],
    )['rouge2'].mid

    return {
        'rouge2_precision': round(rouge_output.precision, 4),
        'rouge2_recall': round(rouge_output.recall, 4),
        'rouge2_fmeasure': round(rouge_output.fmeasure, 4),
    }

## Git Configuration

In [None]:
!git config --global credential.helper store

## Log in to Hugging Face

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub; needed
# because the training arguments set push_to_hub=True.
notebook_login()

## Fine-tune the Model with Trainer

In [None]:
# Assemble the trainer from the model, training arguments, tokenized splits,
# collator and metric function defined in the cells above.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop training early with a patience of 3 evaluations.
    # NOTE(review): the monitored metric is not set explicitly in the
    # training arguments - presumably it falls back to the evaluation loss;
    # confirm this is the intended criterion rather than ROUGE-2.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
import nltk
# Download the 'punkt' tokenizer data.
# NOTE(review): no nltk function is called in the visible cells - confirm
# this download is still required.
nltk.download('punkt')

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

## Upload the Trained Model to the Hugging Face Hub

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()