In [1]:
import pandas as pd
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = pd.read_csv('datasets/News Summary/news_summary.csv',encoding='latin-1')

dataset.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [4]:
dataset.rename({'text':'summary'},axis=1,inplace=True)
dataset['text'] = dataset['headlines'] +'. '+ dataset['ctext']
dataset.drop(['author','date','headlines','read_more','ctext'],axis=1,inplace=True)
dataset.dropna(inplace=True)

In [6]:
#Changing Pandas DataFrame to Huggingface Dataset
train_df,eval_df = train_test_split(dataset,test_size=0.2,shuffle=True)

train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)

train_df = train_df.remove_columns(['__index_level_0__'])
eval_df = eval_df.remove_columns(['__index_level_0__'])
    

In [7]:
new_dataset = DatasetDict({'train_data':train_df,'eval_data':eval_df})
new_dataset

DatasetDict({
    train_data: Dataset({
        features: ['summary', 'text'],
        num_rows: 3516
    })
    eval_data: Dataset({
        features: ['summary', 'text'],
        num_rows: 880
    })
})

In [8]:
import torch
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [9]:
#Importing pre-trained BART model
model_name = 'facebook/bart-large-cnn'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained( model_name)

In [10]:
def tokenize_data(data):
    tokenized_data = tokenizer(data['text'],max_length=512, truncation=True, padding='max_length', return_tensors='pt')
    tokenized_data['labels'] = tokenizer(data['summary'],max_length=128, truncation=True, padding='max_length', return_tensors='pt')['input_ids']
    return tokenized_data

In [11]:
tokenized_data = new_dataset.map(tokenize_data,batched=True)

Map: 100%|██████████| 3516/3516 [00:02<00:00, 1281.89 examples/s]
Map: 100%|██████████| 880/880 [00:00<00:00, 1669.26 examples/s]


In [12]:
tokenized_data = tokenized_data.remove_columns(['summary','text'])

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,padding=True)

In [15]:
#Importing ROUGE metric to evaluate model
import evaluate

rouge = evaluate.load('rouge')

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir="bart-summarizer",
    eval_strategy= "steps",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    fp16=True,
    weight_decay=0.01,
    predict_with_generate=True
)

In [18]:
trainer = Seq2SeqTrainer(
    model = model,
    args=training_args,
    train_dataset= tokenized_data['train_data'],
    eval_dataset= tokenized_data['eval_data'],
    data_collator= data_collator,
    tokenizer= tokenizer,
    compute_metrics=compute_metrics,
)

In [19]:
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()


In [20]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
100%|██████████| 440/440 [2:20:33<00:00, 19.17s/it]

{'train_runtime': 8433.1203, 'train_samples_per_second': 0.417, 'train_steps_per_second': 0.052, 'train_loss': 0.9737814469770951, 'epoch': 1.0}





TrainOutput(global_step=440, training_loss=0.9737814469770951, metrics={'train_runtime': 8433.1203, 'train_samples_per_second': 0.417, 'train_steps_per_second': 0.052, 'total_flos': 3809769890512896.0, 'train_loss': 0.9737814469770951, 'epoch': 1.0})