In [1]:
# Mount Google Drive; later cells read helpers and write checkpoints/outputs
# under /content/drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install datasets transformers rouge-score nltk py7zr
import torch
torch.cuda.manual_seed(0)
device = torch.device('cuda')

import re
import json
import numpy as np
from IPython.display import clear_output 
clear_output()

model_name = 'bart_large_xsum'
fn_dataset = 'mediasum'

In [None]:
# Put the project's helper directory (stored on Drive) first on the module
# search path, then pull in the helper functions and the dataset loader.
import sys

sys.path.insert(0, '/content/drive/MyDrive/Journal/utils')

from utils import (
    stripp,
    replace_apos,
    replace_phrases,
    clean_punc,
    rem_punc,
    rem_multispace,
    rem_repeating,
    rem_fillers,
    rem_stopwords,
    clean,
    check_context,
    check_req,
    insert_pronouns,
    format_summary_,
    gen_tscs,
    gen_summaries_,
)
from load_dataset_FT import load

clear_output()

In [None]:
# Tokenizer, dataset splits and evaluation metric

from datasets import load_metric
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint used for tokenizer and model.
model_checkpoint = "facebook/bart-large-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# `load` is the project helper imported from load_dataset_FT; it yields the
# training and validation splits for the dataset named by fn_dataset.
train_data, val_data = load(fn_dataset)

# ROUGE scorer used by compute_metrics during evaluation.
metric = load_metric("rouge")

clear_output()

In [None]:
#PREPROCESSING THE DATA

# Token limits passed as max_length (with truncation) when tokenizing the
# dialogues and the reference summaries, respectively.
max_input_length = 1024
max_target_length = 128

def preprocess_data(examples):
    """Tokenize one batch of examples for sequence-to-sequence training.

    Args:
        examples: Batch with a "dialogue" column (model inputs) and a
            "summary" column (targets).

    Returns:
        The tokenizer output for the dialogues, truncated to
        ``max_input_length``, with the summary token ids (truncated to
        ``max_target_length``) stored under the "labels" key.
    """
    dialogues = list(examples["dialogue"])
    encoded = tokenizer(dialogues, truncation=True, max_length=max_input_length)

    # Summaries are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_summaries = tokenizer(
            examples["summary"],
            truncation=True,
            max_length=max_target_length,
        )

    encoded["labels"] = encoded_summaries["input_ids"]
    return encoded

# Tokenize each split, feeding preprocess_data whole batches at a time.
tokenized_train = train_data.map(function=preprocess_data, batched=True)
tokenized_val = val_data.map(function=preprocess_data, batched=True)

# The raw splits are not referenced again in this notebook; drop them.
del train_data
del val_data

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Display the tokenized training split (its feature names and row count).
tokenized_train

Dataset({
    features: ['attention_mask', 'dialogue', 'id', 'input_ids', 'labels', 'summary'],
    num_rows: 12460
})

In [None]:
### FINE-TUNING THE MODEL ###

from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Load the pretrained weights and place the model on the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

In [None]:
# Per-device batch size, shared by training and evaluation.
batch_size = 8

# Checkpoints are written under this Drive directory.
save_path = "/content/drive/MyDrive/Journal/ckpt/{}_{}".format(model_name, fn_dataset)

args = Seq2SeqTrainingArguments(
    output_dir=save_path,
    # Schedule
    num_train_epochs=4,
    evaluation_strategy="epoch",
    save_total_limit=2,
    # Batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # Evaluate on generated sequences so ROUGE can be computed.
    predict_with_generate=True,
)

In [None]:
# Collator that assembles tokenized examples into batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Label
            positions equal to -100 are replaced by the pad token id before
            decoding.

    Returns:
        dict mapping every key reported by ``metric`` to its mid F-measure
        scaled to 0-100, plus ``"gen_len"`` (mean number of non-pad tokens
        per prediction); all values are rounded to 4 decimals.
    """
    # nltk is not imported anywhere else in this notebook, so it is imported
    # here to keep nltk.sent_tokenize from raising NameError at evaluation.
    import nltk

    try:
        nltk.sent_tokenize("Probe sentence.")
    except LookupError:
        # The sentence-tokenizer data is not installed yet.
        # NOTE(review): the resource is presumably named "punkt" on older NLTK
        # releases and "punkt_tab" on newer ones; both are requested - confirm
        # against the installed version.
        for resource in ("punkt", "punkt_tab"):
            nltk.download(resource, quiet=True)

    predictions, labels = eval_pred
    # NOTE(review): some models presumably hand back a tuple whose first item
    # holds the generated ids; keep only that item - confirm for this model.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep the mid F-measure of each score, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Bundle the model, training arguments, data and metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [None]:
# Run fine-tuning with the trainer configured in the previous cells.
# NOTE(review): the saved output of this cell ends in a RuntimeError - confirm
# that training completes before relying on the checkpoints used below.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, dialogue, id.
***** Running training *****
  Num examples = 12460
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 3116


Epoch,Training Loss,Validation Loss


RuntimeError: ignored

In [None]:
### INFERENCE ###

import os

from transformers import pipeline
from transformers.trainer_utils import get_last_checkpoint

# Preferred checkpoint. The training log recorded in this notebook reports
# 3116 total optimization steps, so a step-3500 checkpoint may not exist;
# in that case fall back to the most recent checkpoint under save_path.
checkpoint_dir = "{}/checkpoint-3500".format(save_path)
if not os.path.isdir(checkpoint_dir):
    checkpoint_dir = get_last_checkpoint(save_path) if os.path.isdir(save_path) else None
    if checkpoint_dir is None:
        raise FileNotFoundError(
            "No checkpoint directory found under {}".format(save_path)
        )

summarizer = pipeline("summarization", model=checkpoint_dir)
# Sample chat transcript that is passed to the summarizer below.
conversation = '''Kartik: Hey, do you have Tirthankar's number?
Nidhir: Lemme check
Aakash: Sorry, can't find it.
Kartik: Ask Someone
Aakash: Found it! It was saved in some other contact list
Nidhir: Hey, what happened to the model that we trained?
Aakash: I think, it would've finished evaluating on the data.
Kartik: If it's done evaluating, send me the performances ASAP
Nidhir: Okay, sure.
Kartik: And yes, don't forget to upload those on this link - https://github.com/cruxieu17/automin-2021-submission 🙂
Aakash: Alright, consider it done
Kartik: K then, Bye!
Nidhir: Bye bye                                       
'''

# Summarize the sample conversation; the pipeline returns a list of results,
# and the text of the first one is kept.
summarizer_outputs = summarizer(conversation)
summary = summarizer_outputs[0]['summary_text']

# Clear any pipeline output so that only the summary is shown.
clear_output()
print(summary)

In [None]:
# Name of the evaluation corpus; also used when building the output path.
eval_data = 'automin'

# Pass eval_data rather than repeating the literal, so the transcripts that
# are loaded always match the corpus name used for the output path.
# NOTE(review): 1024 presumably is the maximum token length per transcript
# chunk - confirm against gen_tscs in the project utils.
tscs_preprocessed = gen_tscs(eval_data, tokenizer, 1024)

In [None]:
# Inspect the first preprocessed item returned by gen_tscs.
tscs_preprocessed[0]

In [None]:
# Output location, keyed by evaluation corpus, model name and training dataset.
out_path = "/content/drive/MyDrive/Journal/outputs/{}_{}_{}".format(eval_data, model_name, fn_dataset)

# Use the `summarizer` pipeline built in the inference cell; no object named
# `summarizer1` is defined anywhere in this notebook.
s2, filename = gen_summaries_(tscs_preprocessed, summarizer, out_path)