In [58]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import os
import time 
import glob
import pathlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from transformers import BartTokenizer, BartForConditionalGeneration, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader, Dataset
from datasets import Dataset as Dataset_hf
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from datasets import load_metric, DatasetDict
import pandas as pd
from tqdm import tqdm

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Read Data

In [4]:
os.listdir('/kaggle/input/bbc-news-summary/BBC News Summary/Summaries')
articles_path = '../input/bbc-news-summary/BBC News Summary/News Articles'
summaries_path = '../input/bbc-news-summary/BBC News Summary/Summaries'
categories_list = ['politics', 'sport', 'tech', 'entertainment', 'business']

In [5]:
def read_files_from_folders(articles_path, summaries_path, categories_list=['tech', 'sport'], encoding = "ISO-8859-1"):
    articles = []
    summaries = []
    categories = []
    for category in categories_list:
        article_paths = glob.glob(os.path.join(articles_path, category, '*.txt'), recursive=True)
        summary_paths = glob.glob(os.path.join(summaries_path, category, '*.txt'), recursive=True)
        
        if len(article_paths) != len(summary_paths):
            print('number of files is not equal')
            return
        for i in range(len(article_paths)):
            categories.append(category)
            with open(article_paths[i], mode='r', encoding = encoding) as file:
                articles.append(file.read())
            
            with open(summary_paths[i], mode='r', encoding = encoding) as file:
                summaries.append(file.read())
    return articles, summaries, categories

In [6]:
articles, summaries, categories = read_files_from_folders(articles_path, summaries_path, categories_list)
df = pd.DataFrame({'articles':articles, 'summaries':summaries, 'categories':categories})

In [7]:
df

Unnamed: 0,articles,summaries,categories
0,Budget to set scene for election\n\nGordon Bro...,- Increase in the stamp duty threshold from Â£...,politics
1,Army chiefs in regiments decision\n\nMilitary ...,"""They are very much not for the good and will ...",politics
2,Howard denies split over ID cards\n\nMichael H...,Michael Howard has denied his shadow cabinet w...,politics
3,Observers to monitor UK election\n\nMinisters ...,The report said individual registration should...,politics
4,Kilroy names election seat target\n\nEx-chat s...,"UKIP's leader, Roger Knapman, has said he is g...",politics
...,...,...,...
2220,India opens skies to competition\n\nIndia will...,India will allow domestic commercial airlines ...,business
2221,Yukos bankruptcy 'not US matter'\n\nRussian au...,Yukos says a US court was entitled to declare ...,business
2222,Survey confirms property slowdown\n\nGovernmen...,House prices were 11.8% higher on the year in ...,business
2223,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business


In [8]:
df = df[['articles', 'summaries']]
df = df.dropna()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
print(df.iloc[0]['articles'])

Budget to set scene for election

Gordon Brown will seek to put the economy at the centre of Labour's bid for a third term in power when he delivers his ninth Budget at 1230 GMT. He is expected to stress the importance of continued economic stability, with low unemployment and interest rates. The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from Â£60,000. But the Conservatives and Lib Dems insist voters face higher taxes and more means-testing under Labour.

Treasury officials have said there will not be a pre-election giveaway, but Mr Brown is thought to have about Â£2bn to spare.

- Increase in the stamp duty threshold from Â£60,000 
 - A freeze on petrol duty 
 - An extension of tax credit scheme for poorer families 
 - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties' general election manifestos. Ten years ago, buyers had a much greater chance of avoi

In [10]:
print(df.iloc[0]['summaries'])

- Increase in the stamp duty threshold from Â£60,000 - A freeze on petrol duty - An extension of tax credit scheme for poorer families - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties' general election manifestos.The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from Â£60,000.The Tories are also thought likely to propose increased thresholds, with shadow chancellor Oliver Letwin branding stamp duty a "classic Labour stealth tax".Tax credits As a result, the number of properties incurring stamp duty has rocketed as has the government's tax take.Since then, average UK property prices have more than doubled while the starting threshold for stamp duty has not increased.For the Lib Dems David Laws said: "The chancellor will no doubt tell us today how wonderfully the economy is doing," he said.The Liberal Democrats unveiled their own proposals to raise the sta

## Model

### Dataset

In [11]:
class TextDataset(Dataset):
    def __init__(self, articles, summaries, tokenizer, max_len):
        self.articles = articles
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.articles[idx]
        summary = self.summaries[idx]
        encoded_article = self.tokenizer.encode_plus(
            article,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        encoded_summary = self.tokenizer.encode_plus(
            summary,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoded_article['input_ids'].flatten()
        output_ids = encoded_summary['input_ids'].flatten()
        
        return input_ids, output_ids

In [12]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

In [13]:
test_df.head()

Unnamed: 0,articles,summaries
414,David Blunkett in quotes\n\nDavid Blunkett - w...,"""I don't think anyone can say I have said one ..."
420,Benitez issues warning to Gerrard\n\nLiverpool...,"Benitez responded: ""I spoke to Steven and said..."
1644,Brookside creator's Channel 4 bid\n\nThe creat...,Redmond also brought teen soap Hollyoaks to Ch...
416,Brown visits slum on Africa trip\n\nChancellor...,"Speaking outside the Olympic Primary School, M..."
1232,Gritty return for Prince of Persia\n\nStill ba...,Still basking in the relatively recent glory o...


In [14]:
max_len = 700

train_dataset = TextDataset(train_df['articles'].values, train_df['summaries'].values, tokenizer, max_len)
test_dataset = TextDataset(test_df['articles'].values, test_df['summaries'].values, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

### BART before FT

In [34]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)

In [35]:
model

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_l

In [36]:
summary_ids = model.generate(torch.unsqueeze(train_dataset[0][0], 0).to(device), max_length=300, num_beams=4, early_stopping=True)

In [37]:
reference_summary = tokenizer.decode(train_dataset[0][1], skip_special_tokens=True)

In [38]:
summary_bart = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [39]:
summary_bart

"Blair buys copies of new Band Aid 20 in Edinburgh. Predicted chart-topper Do They Know it's Christmas? expected to sell at least 300,000 copies by the time the new chart is announced on Sunday. New version of 1984 single is not going to be released in the US, despite being sold in many countries around the world."

In [40]:
reference_summary

'It is predicted that the Band Aid 20 song will sell 300,000 copies in the UK by the time the new chart is announced on Sunday.The original track was released in the US, and reached number 13 in the singles chart.US record shops are stocking an import version of Do They Know It\'s Christmas, which is said to be selling very well in Los Angeles and New York.Prime Minister Tony Blair purchased two copies of the charity single Band Aid 20 in Edinburgh on Friday.However, the new version of the 1984 single is not going to be released in the US, despite being sold in many countries around the world.But music fans in the US are still able to access the song and download it on Band Aid 20\'s official website."Our customer helper approached him... it was only then we realised he wanted to buy copies of the Band Aid single, rather than the latest Eminem album."It topped the US charts for three weeks and went on win Grammy awards for best record and song.'

In [23]:
!pip  install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=caf5c30813f56f6a6fdbfac9d441805007da8ab91234e4d172cefca1b566837d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [24]:
from rouge_score import rouge_scorer

def evaluate_summary(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    return scores

In [25]:
scores = evaluate_summary(reference_summary, summary_bart)

In [26]:
scores['rougeL'].fmeasure

0.326530612244898

In [45]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

In [44]:
rouge_metric = load_metric('rouge')

In [52]:
def eval_model(dataloader, model, tokenizer, metric):
    rouge1 = 0
    rouge2 = 0
    rougeL = 0
    total_size = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            
            input_ids = batch[0]
            output_ids = batch[1]
            summary_ids = model.generate(input_ids.to(device), max_length=300, num_beams=4, early_stopping=True)
            
            reference_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                clean_up_tokenization_spaces=True)
                for s in output_ids]
#             print(reference_summaries)
#             print(summaries)
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                clean_up_tokenization_spaces=True)
                for s in summary_ids]
            
            metric.add_batch(predictions=decoded_summaries, references=reference_summaries)
#                 summary_bart = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
#                 scores = evaluate_summary(reference_summary, suaaaaaammary_bart)

#                 rouge1 += scores['rouge1'].fmeasure
#                 rouge2 += scores['rouge2'].fmeasure 
#                 rougeL += scores['rougeL'].fmeasure
#                 total_size += 1
    score = metric.compute()
    return score
            

In [53]:
eval_model(test_loader, model, tokenizer, rouge_metric)

100%|██████████| 223/223 [06:07<00:00,  1.65s/it]


{'rouge1': AggregateScore(low=Score(precision=0.6784137730134573, recall=0.24276928062684744, fmeasure=0.3463545348371122), mid=Score(precision=0.6927590063768461, recall=0.2526000250504504, fmeasure=0.3574386293017163), high=Score(precision=0.7086401666980815, recall=0.2627820547506553, fmeasure=0.3689637106222796)),
 'rouge2': AggregateScore(low=Score(precision=0.4422495513872204, recall=0.15882829528674625, fmeasure=0.22606406052664796), mid=Score(precision=0.4625605016268195, recall=0.16890903768190518, fmeasure=0.23896399203520202), high=Score(precision=0.48523792609535826, recall=0.1802520973242954, fmeasure=0.25241758855962165)),
 'rougeL': AggregateScore(low=Score(precision=0.48820203396568673, recall=0.1746472423427189, fmeasure=0.24921796727097378), mid=Score(precision=0.502584743081224, recall=0.18329749998664785, fmeasure=0.2593543645757802), high=Score(precision=0.5161402558053944, recall=0.19174636425874556, fmeasure=0.2691043170121561)),
 'rougeLsum': AggregateScore(low=

### Fine-Tune

In [None]:
trainer_args=TrainingArguments(
    run_name='bbc-news',
    output_dir='bbc-news-model', 
    num_train_epochs=10, 
    warmup_steps=500,
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=1,
    weight_decay=0.01, 
    logging_steps=10,
    eval_strategy='steps', 
    eval_steps=500, 
    save_steps=1e6,
    gradient_accumulation_steps=16
)

In [None]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
dataset_d = {
    'test': Dataset_hf.from_pandas(test_df),
    'validation': Dataset_hf.from_pandas(test_df),
    'train': Dataset_hf.from_pandas(train_df)
}
dataset = DatasetDict(dataset_d)

In [None]:
test_df

In [None]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['articles'] , max_length = 700, truncation = True )

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summaries'], max_length = 300, truncation = True )

    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [64]:
dataset_pt = dataset.map(convert_examples_to_features, batched=True, batch_size=16)

Map:   0%|          | 0/445 [00:00<?, ? examples/s]



Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Map:   0%|          | 0/1780 [00:00<?, ? examples/s]

In [68]:
trainer=Trainer(model=model, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_pt["train"],
                  eval_dataset=dataset_pt["validation"])

In [None]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,0.0564,0.231054


In [72]:
eval_model(test_loader, model, tokenizer, rouge_metric)

100%|██████████| 223/223 [17:57<00:00,  4.83s/it]


{'rouge1': AggregateScore(low=Score(precision=0.8013393821521264, recall=0.6938685004719378, fmeasure=0.7375167354092009), mid=Score(precision=0.8131219103140572, recall=0.7081617982270438, fmeasure=0.7487988777537756), high=Score(precision=0.8240036833317839, recall=0.7213849398350173, fmeasure=0.7600368370918428)),
 'rouge2': AggregateScore(low=Score(precision=0.7146590943300953, recall=0.6204273480525928, fmeasure=0.6582604389862796), mid=Score(precision=0.7297648552262485, recall=0.6352996013545167, fmeasure=0.6719239292738918), high=Score(precision=0.7437386800286716, recall=0.6490921732315023, fmeasure=0.6850040807134659)),
 'rougeL': AggregateScore(low=Score(precision=0.5743170991783672, recall=0.5017457263898356, fmeasure=0.5305774902792112), mid=Score(precision=0.5881472915838406, recall=0.5150160149570713, fmeasure=0.5432880515937843), high=Score(precision=0.6003173851285254, recall=0.5299307385915963, fmeasure=0.5566123654768885)),
 'rougeLsum': AggregateScore(low=Score(prec

In [80]:
summary_ids = model.generate(torch.unsqueeze(test_dataset[0][0], 0).to(device), max_length=300, num_beams=4, early_stopping=True)

In [81]:
summary_bart = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [82]:
reference_summary = tokenizer.decode(test_dataset[0][1], skip_special_tokens=True)

In [83]:
reference_summary

'"I don\'t think anyone can say I have said one thing in public and done another in private.""Our work with the French government...has been hugely successful," said Mr Blunkett.David Blunkett - who has resigned as home secretary - built his reputation as a plain-speaking Yorkshire man."It would be dangerous territory if I wasn\'t practising what I preach which is to always accept responsibility, always accept the consequences of your actions."I foolishly thought as this was a celebrity edition it would be more relaxed than normal.""Trust, plain-speaking and straight talking is something which matters so much to me as a politician and as a man that I have decided, of my own volition, to request an independent review of the allegations that I misused my position."'

In [84]:
summary_bart

'David Blunkett - who has resigned as home secretary - built his reputation as a plain-speaking Yorkshire man."I don\'t think anyone can say I have said one thing in public and done another in private."You have just got to think for a minute: is it too early to open a bottle?""I accepted by necessity we have to have prevention under a new category which is to intervene before the act is committed, rather than do so by due process after the acts is committed when it\'s too late," he said in reference to new anti-terrorism measures.I fell in love with someone and they wouldn\'t go public and things started to go very badly wrong in the summer, and then the News of the World picked up the story."I foolishly thought as this was a celebrity edition it would be more relaxed than normal."'