# Test BART fineturn on 15 percents xsum

## Install dependences

In [1]:
!pip install transformers[sentencepiece] transformers[torch] datasets evaluate rouge_score nltk

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4010bfbbf8772cadfb64ebb1df6e1056b603c07200a8b0d3cb4c173e39e8a667
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.1 rouge_score-0.1.2


## Download test dataset

In [2]:
from datasets import load_dataset

test_dataset = load_dataset("xsum", split="test")

Downloading builder script:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/954 [00:00<?, ?B/s]

Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.


In [3]:
test_dataset

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 11334
})

In [4]:
small_test_dataset = test_dataset.select(range(int(0.15 * len(test_dataset))))
small_test_dataset

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 1700
})

## Download tokenizer and model

In [5]:
from transformers import BartForConditionalGeneration, AutoTokenizer

model_checkpoint = "thdangtr/bart-fineturned-on-15-percents-xsum"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)



Downloading (…)okenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

## Preprocess data

In [6]:
max_input_length = 512
max_target_length = 32

def preprocess_function(sample):
    model_inputs = tokenizer(sample["document"],
                             max_length=max_input_length,
                             truncation=True)
    labels = tokenizer(sample["summary"],
                      max_length=max_target_length,
                      truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

    

In [7]:
tokenized_datasets = small_test_dataset.map(preprocess_function, batched=True)
tokenized_datasets

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1700
})

## Prepare for testing

In [8]:
tokenized_datasets = tokenized_datasets.remove_columns(small_test_dataset.column_names)

In [9]:
tokenized_datasets.set_format("torch")
tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1700
})

In [10]:
# Create data collator
from transformers import DataCollatorForSeq2Seq

collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
batch_size = 8

# Create data loader
from torch.utils.data import DataLoader

test_dataloader = DataLoader(tokenized_datasets,
                            collate_fn=collator,
                            batch_size=batch_size)

In [12]:
from accelerate import Accelerator

accelerator = Accelerator()
model, tokenizer, test_dataloader = accelerator.prepare(model, tokenizer, test_dataloader)

In [13]:
import nltk
nltk.download('punkt')
from nltk import sent_tokenize

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]
    
    return preds, labels

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
import evaluate

rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [17]:
from tqdm.auto import tqdm
import numpy as np
import torch

progress_bar = tqdm(range(len(test_dataloader)))


test_loss = 0
for step, batch in enumerate(test_dataloader):
    model.eval()
    with torch.no_grad():
        output = model(**batch)
        test_loss += output.loss
        
        generated_tokens = accelerator.unwrap_model(model).generate(batch["input_ids"],
                                                                   attention_mask=batch["attention_mask"],
                                                                   max_new_tokens=32,
                                                                   )
        labels = batch["labels"]
    
        generated_tokens = accelerator.pad_across_processes(generated_tokens, dim=1, pad_index=tokenizer.pad_token_id)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=tokenizer.pad_token_id)
        
        generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
        labels = accelerator.gather(labels).cpu().numpy()
        
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_spectial_tokens=True)
        
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        
        rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)
        progress_bar.update(1)

result = rouge_score.compute()
result = {k: v * 100 for k, v in result.items()}
print(test_loss.cpu()/len(test_dataloader))
print(result)
    

  0%|          | 0/213 [00:00<?, ?it/s]

tensor(1.7229)
{'rouge1': 34.90420853017496, 'rouge2': 15.292277495418743, 'rougeL': 28.453392257846467, 'rougeLsum': 28.46237900455125}
