# THIS IS THE NOTEBOOK FOR FINE-TUNING THE PRE-TRAINED BART MODEL

In [None]:
# Attach Google Drive to the Colab filesystem.
# NOTE(review): none of the cells visible in this notebook read from or write
# to the mount point (training output goes to /content) -- confirm whether the
# fine-tuned model is meant to be copied to Drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
! pip install datasets transformers rouge-score nltk py7zr
from IPython.display import clear_output 
clear_output()

In [None]:
#IMPORTING LIBRARIES AND THE DATASET

import nltk
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer

# Sentence-splitting data for nltk.sent_tokenize, which compute_metrics calls
# before scoring. Older NLTK releases read the "punkt" package; newer releases
# look up "punkt_tab" instead and raise LookupError when it is missing, so both
# are requested. A name that the installed NLTK index does not know is expected
# to be reported as a failed download (not an exception) -- verify if pinning
# to a very old NLTK.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Checkpoint whose tokenizer (here) and weights (later cell) are fine-tuned.
model_checkpoint = "facebook/bart-large-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Dialogue/summary dataset and the ROUGE scorer used during evaluation.
raw_dataset = load_dataset("samsum")
metric = load_metric("rouge")

# Hide the download logs produced above.
clear_output()

In [None]:
#PREPROCESSING THE DATA

# Truncation limits (in tokens) for the dialogue and for the summary.
max_input_length = 512
max_target_length = 128


def preprocess_data(examples):
    """Tokenize a batch of dialogues together with their reference summaries.

    Args:
        examples: Batch mapping that provides a "dialogue" and a "summary"
            entry, each a sequence of strings.

    Returns:
        The tokenizer output for the dialogues, with the token ids of the
        summaries stored under the "labels" key.
    """
    dialogues = list(examples["dialogue"])
    batch = tokenizer(dialogues, truncation=True, max_length=max_input_length)

    # Summaries are encoded while the tokenizer is in its target-side context.
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(
            examples["summary"],
            truncation=True,
            max_length=max_target_length,
        )["input_ids"]

    batch["labels"] = target_ids
    return batch

In [None]:
# Display the raw dataset object (its splits, columns and row counts).
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
# Apply preprocess_data to every split, passing examples in batches.
tokenized_dataset = raw_dataset.map(function=preprocess_data, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Display the tokenized dataset to confirm the columns added by preprocessing.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'dialogue', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['attention_mask', 'dialogue', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['attention_mask', 'dialogue', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 818
    })
})

In [None]:
#FINE-TUNING THE MODEL

from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Load the sequence-to-sequence weights of the same checkpoint the tokenizer
# was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
# Per-device batch size, shared by training and evaluation.
batch_size = 4

# With two gradient-accumulation steps, each optimizer update covers
# 2 * batch_size examples per device.
training_options = dict(
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    fp16=True,
    predict_with_generate=True,
    save_total_limit=2,
)

# NOTE(review): the output directory is under /content rather than the mounted
# Drive -- confirm that losing checkpoints when the runtime is recycled is
# acceptable.
args = Seq2SeqTrainingArguments(
    output_dir="/content/bart_large_xsum_samsum",
    **training_options,
)

In [None]:
# Batch collator built from the tokenizer and the model being fine-tuned.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
#TRAINING AND EVALUATION

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Either
            array may contain -100 as a filler value; ``predictions`` may also
            arrive as a tuple, in which case its first element is used.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure scaled by 100, plus
        ``"gen_len"`` (mean count of non-padding tokens per prediction), with
        every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some setups hand back a tuple of outputs; the token ids are assumed to be
    # its first element (mirrors the upstream summarization example) -- confirm
    # for non-default model configurations.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a vocabulary id and cannot be decoded. Replace it in BOTH
    # arrays: the original only cleaned the labels, so predictions padded with
    # -100 would break batch_decode and inflate gen_len.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, arguments, data splits, collator and metric function into a
# single trainer; the validation split is the one evaluated during training.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [None]:
# Run fine-tuning. The call is left as the cell's last expression so that its
# return value is shown as the cell output.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, dialogue, id.
***** Running training *****
  Num examples = 14732
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 5523
  args.max_grad_norm,


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,1.3848,1.447523,53.3731,28.8977,44.133,48.8064,25.7922


Saving model checkpoint to /content/bart_large_xsum_samsum/checkpoint-500
Configuration saved in /content/bart_large_xsum_samsum/checkpoint-500/config.json
Model weights saved in /content/bart_large_xsum_samsum/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/bart_large_xsum_samsum/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/bart_large_xsum_samsum/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/bart_large_xsum_samsum/checkpoint-1000
Configuration saved in /content/bart_large_xsum_samsum/checkpoint-1000/config.json
Model weights saved in /content/bart_large_xsum_samsum/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/bart_large_xsum_samsum/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/bart_large_xsum_samsum/checkpoint-1000/special_tokens_map.json
  args.max_grad_norm,
Saving model checkpoint to /content/bart_large_xsum_samsum/checkpoint-1500
Configurat

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,1.3848,1.447523,53.3731,28.8977,44.133,48.8064,25.7922
1,1.0475,1.478893,54.2868,29.1936,44.4965,49.7239,29.2983
2,0.8565,1.488593,54.532,29.6923,45.1498,50.2025,30.198


Saving model checkpoint to /content/bart_large_xsum_samsum/checkpoint-3500
Configuration saved in /content/bart_large_xsum_samsum/checkpoint-3500/config.json
Model weights saved in /content/bart_large_xsum_samsum/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in /content/bart_large_xsum_samsum/checkpoint-3500/tokenizer_config.json
Special tokens file saved in /content/bart_large_xsum_samsum/checkpoint-3500/special_tokens_map.json
Deleting older checkpoint [/content/bart_large_xsum_samsum/checkpoint-2500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, dialogue, id.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 4
Saving model checkpoint to /content/bart_large_xsum_samsum/checkpoint-4000
Configuration saved in /content/bart_large_xsum_samsum/checkpoint-4000/config.json
Model weights saved in /content/bart_large_xsum_sa

TrainOutput(global_step=5523, training_loss=1.116441732900807, metrics={'train_runtime': 4990.0158, 'train_samples_per_second': 8.857, 'train_steps_per_second': 1.107, 'total_flos': 3.003043192720589e+16, 'train_loss': 1.116441732900807, 'epoch': 3.0})

In [None]:
################################################################################################
########## SAVE THE MODEL AND USE IT IN THE FORM OF A PIPELINE AS DONE IN THIS CELL ###########
################################################################################################

from transformers import pipeline

# NOTE(review): the training log reports 5523 optimization steps in total, so
# checkpoint-5000 is an intermediate checkpoint rather than the final weights,
# and no visible cell saves the model explicitly -- confirm this is the
# checkpoint that should be served.
checkpoint_dir = "/content/bart_large_xsum_samsum/checkpoint-5000"
summarizer = pipeline(task="summarization", model=checkpoint_dir)


# Sample multi-speaker dialogue used as the input for the summarization demo.
# The text is passed to the model verbatim, so it is left exactly as written.
conversation = '''Kartik: We want the model to be able to capture the significant details from a conversation thread.
Nidhir: What should we do in order to achieve that? We can't expect it to do topical segmentaion. It's a pretty difficult task.
Aakash: Yes, moreover, we might encounter long-stretched topics in the transcript.
Kartik: I'm planning to segment the transcript on the basis of token length. We should find a sweet spot at which the model doesn't have to choose between multiple topics, at the same time, extract the relevant info from it.
Aakash: Sounds promising.
Nidhir: Hey, what happened to the model that we trained?
Aakash: I think, it would've finished evaluating on the data.
Kartik: If it's done evaluating, send me the performances ASAP.
Nidhir: Okay, sure.
Kartik: And yes, don't forget to upload those on this link - https://github.com/cruxieu17/automin-2021-submission 🙂.
Aakash: Alright, consider it done.
Kartik: K then, Bye!
Nidhir: Bye bye.                                      
'''
# Summarize the dialogue and keep the 'summary_text' field of the first entry
# in the pipeline's result.
summary = summarizer(conversation)[0]['summary_text']

# Drop any logs emitted while generating, then show only the summary.
clear_output()
print(summary)

Kartik and Aakash are developing a machine learning model. Kartik wants the model to be able to capture the significant details from a conversation thread. He is planning to segment the transcript on the basis of token length.
