# Fine-tune BART with Accelerate

## Install dependencies

In [1]:
!pip install torch transformers[sentencepiece] datasets evaluate rouge_score nltk transformers[torch]

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=399326402a7e921226442feeda810d86f8d221fd1f5da6d7f93c33b80f7ac24c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.1 rouge_score-0.1.2


## Download Data

In [2]:
from datasets import load_dataset
# Hub identifier of the summarization corpus used throughout the notebook.
data_repo = "xsum"

# Download every split (or reuse the local cache on later runs).
xsum = load_dataset(path=data_repo)
xsum

Downloading builder script:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/954 [00:00<?, ?B/s]

Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [3]:
# Get small dataset
from datasets import DatasetDict

# Fraction of every split that is kept.
percents = 0.15

# Keep only the leading `percents` share of rows of each split (no shuffling).
small_xsum = DatasetDict()
for split_name in ("train", "validation", "test"):
  full_split = xsum[split_name]
  keep_count = int(percents * len(full_split))
  small_xsum[split_name] = full_split.select(range(keep_count))
small_xsum

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 30606
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 1699
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 1700
    })
})

In [4]:
def get_sum_len(sample):
  """Count the whitespace-separated words in a sample's summary.

  Args:
    sample: Mapping with a "summary" key holding the summary text.

  Returns:
    A dict with the single key "sum_len" mapped to the word count.
  """
  # Named `summary_text` rather than `sum` so the builtin `sum` is not shadowed.
  summary_text = sample["summary"]
  return {"sum_len": len(summary_text.split())}

small_xsum = small_xsum.map(get_sum_len)

  0%|          | 0/30606 [00:00<?, ?ex/s]

  0%|          | 0/1699 [00:00<?, ?ex/s]

  0%|          | 0/1700 [00:00<?, ?ex/s]

In [5]:
small_xsum

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'sum_len'],
        num_rows: 30606
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'sum_len'],
        num_rows: 1699
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'sum_len'],
        num_rows: 1700
    })
})

In [6]:
# Read only the "sum_len" column. Slicing the split with [:] first would
# materialize every column (including the full documents) in memory.
sum_len = list(small_xsum["train"]["sum_len"])

In [7]:
import pandas as pd

print(pd.Series(sum_len).value_counts())

19    2377
23    2362
22    2352
21    2320
20    2291
18    2244
24    2244
17    2009
25    1826
16    1639
26    1447
15    1186
27    1030
14     824
28     787
13     559
29     478
12     395
30     343
11     273
31     273
32     205
10     156
33     155
34     107
35      90
9       77
36      65
37      52
38      50
8       47
5       44
39      36
7       34
4       31
6       26
40      24
41      24
42      19
45      15
43      15
44      14
3       10
46       9
1        6
47       6
49       5
55       5
53       4
48       4
50       4
51       2
58       2
61       1
65       1
2        1
52       1
Name: count, dtype: int64


## Download Model and Tokenizer

In [8]:
from transformers import BartForConditionalGeneration, AutoTokenizer

# Single checkpoint identifier shared by the tokenizer and the model, so both
# are loaded from the same source.
model_checkpoint = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)



Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

## Preprocess data

In [9]:
# Token budgets: documents are truncated to 512 tokens, summaries to 32.
max_input_length = 512
max_target_length = 32

def preprocess_function(sample):
  """Tokenize documents and summaries into model-ready features.

  Args:
    sample: Mapping with "document" and "summary" entries.

  Returns:
    The tokenizer output for the documents, with the token ids of the
    tokenized summaries stored under "labels".
  """
  features = tokenizer(sample["document"],
                       truncation=True,
                       max_length=max_input_length)
  target_ids = tokenizer(sample["summary"],
                         truncation=True,
                         max_length=max_target_length)["input_ids"]
  features["labels"] = target_ids
  return features

In [10]:
tokenized_datasets = small_xsum.map(preprocess_function, batched=True)
tokenized_datasets

  0%|          | 0/31 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'sum_len', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 30606
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'sum_len', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1699
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'sum_len', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1700
    })
})

## Prepare for training

In [11]:
# Set output format of tokenized_datasets in pytorch tensors
tokenized_datasets.set_format('torch')

In [12]:
tokenized_datasets = tokenized_datasets.remove_columns(small_xsum["train"].column_names)

In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30606
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1699
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1700
    })
})

In [14]:
# Create data collator
from transformers import DataCollatorForSeq2Seq

collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [15]:
# Create data loader
from torch.utils.data import DataLoader

batch_size = 8

# Arguments shared by both loaders.
loader_kwargs = dict(collate_fn=collator, batch_size=batch_size)

# Training batches are drawn in shuffled order.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)

# NOTE(review): evaluation reads the "test" split, while the "validation"
# split is not fed to any loader here — confirm this is intentional.
eval_dataloader = DataLoader(tokenized_datasets["test"], **loader_kwargs)

In [16]:
# set optimizer
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [17]:
# Accelerate
from accelerate import Accelerator

accelerator = Accelerator()
# The optimizer goes through `prepare` together with the model and the
# dataloaders so that Accelerate manages it for the active device/precision
# setup. The tokenizer needs no preparation and is therefore left out.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [18]:
# Set up the learning-rate scheduler
from transformers import get_scheduler

num_train_epochs = 4

# The schedule spans one step per training batch for every epoch.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with no warmup steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [19]:
import nltk
nltk.download('punkt')
from nltk import sent_tokenize


def postprocess_text(preds, labels):
  """Reformat decoded predictions and references before scoring.

  Every text is stripped of surrounding whitespace and its sentences (as split
  by `sent_tokenize`) are joined with newline characters.

  Args:
    preds: Decoded model outputs, one string per example.
    labels: Decoded reference summaries, one string per example.

  Returns:
    A `(preds, labels)` tuple holding the two reformatted lists.
  """
  def _one_sentence_per_line(text):
    return "\n".join(sent_tokenize(text.strip()))

  cleaned_preds = [_one_sentence_per_line(pred) for pred in preds]
  cleaned_labels = [_one_sentence_per_line(label) for label in labels]
  return cleaned_preds, cleaned_labels

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from huggingface_hub import get_full_repo_name

model_name = "bart-fineturned-on-15-percents-xsum"
repo_name = get_full_repo_name(model_name)
repo_name

'thdangtr/bart-fineturned-on-15-percents-xsum'

In [23]:
from huggingface_hub import create_repo


# `exist_ok=True` turns an already-existing repository into a no-op, while
# genuine failures (authentication, network, invalid name) still raise instead
# of being reported as "exist" by a bare `except`.
create_repo(repo_name, exist_ok=True)
print('success')

success


In [24]:
from huggingface_hub import Repository

# Local working directory that holds the clone of the Hub repository.
output_dir = "results-bart-fineturned-15-percents-xsum"

# Clone the remote repository into `output_dir`.
repo = Repository(local_dir=output_dir, clone_from=repo_name)

Cloning https://huggingface.co/thdangtr/bart-fineturned-on-15-percents-xsum into local empty directory.


## Training loop

In [25]:
import evaluate
rouge_score = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [26]:
import torch
import numpy as np
from tqdm.auto import tqdm

# One progress-bar tick per training batch across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
  # ---- Training pass ----
  model.train()
  train_loss = 0.0
  for batch in train_dataloader:
    output = model(**batch)
    loss = output.loss
    accelerator.backward(loss)

    # Accumulate a plain Python float: summing the loss tensors themselves
    # would keep each step's autograd history referenced for the whole epoch.
    train_loss += loss.item()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  # Mean training loss per batch.
  train_loss /= len(train_dataloader)

  # ---- Evaluation pass ----
  model.eval()
  eval_loss = 0.0
  for batch in eval_dataloader:
    with torch.no_grad():
      eval_loss += model(**batch).loss.item()
      generated_tokens = accelerator.unwrap_model(model).generate(
          batch["input_ids"],
          attention_mask=batch["attention_mask"],
          max_new_tokens=max_target_length
      )

      # Pad along the sequence dimension so tensors from all processes can be
      # gathered.
      generated_tokens = accelerator.pad_across_processes(generated_tokens,
                                                          dim=1,
                                                          pad_index=tokenizer.pad_token_id,
                                                          )
      labels = accelerator.pad_across_processes(batch["labels"],
                                                dim=1,
                                                pad_index=tokenizer.pad_token_id,
                                                )
      generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
      labels = accelerator.gather(labels).cpu().numpy()

      # -100 is not a valid token id; replace it with the pad token id so the
      # labels can be decoded.
      labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
      decoded_preds = tokenizer.batch_decode(generated_tokens,
                                             skip_special_tokens=True,
                                             )
      decoded_labels = tokenizer.batch_decode(labels,
                                              skip_special_tokens=True)

      decoded_preds, decoded_labels = postprocess_text(decoded_preds,
                                                        decoded_labels)
      rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

  # Mean evaluation loss per batch and ROUGE over all batches added above.
  eval_loss /= len(eval_dataloader)
  result = rouge_score.compute()

  # Report the scores scaled by 100 and rounded to four decimals.
  result = {key: value * 100 for key, value in result.items()}
  result = {k: round(v, 4) for k, v in result.items()}
  print(f"Epoch: {epoch + 1} | train loss: {train_loss} | eval loss: {eval_loss} | ", result)

  # ---- Checkpoint: save after every epoch; the main process pushes ----
  accelerator.wait_for_everyone()
  unwrapped_model = accelerator.unwrap_model(model)
  unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
  if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)
    repo.push_to_hub(
        commit_message=f"Training in progress epoch {epoch + 1}", blocking=False
    )

  0%|          | 0/15304 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1 | train loss: 2.0483591556549072 | eval loss: 1.7505038976669312 |  {'rouge1': 38.4245, 'rouge2': 16.7217, 'rougeL': 31.5428, 'rougeLsum': 31.5302}
Epoch: 2 | train loss: 1.6635339260101318 | eval loss: 1.7379837036132812 |  {'rouge1': 39.4883, 'rouge2': 17.4178, 'rougeL': 32.125, 'rougeLsum': 32.1032}
Epoch: 3 | train loss: 1.4621787071228027 | eval loss: 1.7115070819854736 |  {'rouge1': 39.6202, 'rouge2': 17.6083, 'rougeL': 32.3555, 'rougeLsum': 32.3352}
Epoch: 4 | train loss: 1.3199303150177002 | eval loss: 1.7229293584823608 |  {'rouge1': 40.2058, 'rouge2': 18.0716, 'rougeL': 32.9105, 'rougeLsum': 32.9249}


In [None]:
# index_text = 3

In [None]:
# small_xsum["test"][index_text]["summary"]

In [None]:
# inference(small_xsum["test"][index_text]["document"])

In [None]:
# unwrapped_model = accelerator.unwrap_model(model)

In [None]:
# unwrapped_model.push_to_hub(repo_name)

In [None]:
# tokenizer.push_to_hub(repo_name)