In [1]:
from datasets import load_dataset

dataset = load_dataset("ccdv/arxiv-summarization")
dataset

No config specified, defaulting to: arxiv-summarization/section
Found cached dataset arxiv-summarization (C:/Users/JustinDu/.cache/huggingface/datasets/ccdv___arxiv-summarization/section/1.0.0/fa2c9abf4312afb8660ef8e041d576b8e3943ea96ae771bd3cd091b5798e7cc3)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [2]:
data_train = dataset['train'].shuffle(seed=42)
data_test = dataset['test'].shuffle(seed=42)
data_val = dataset['validation'].shuffle(seed=42)

In [3]:
data_test = data_test.shard(num_shards=6, index=0)
data_val = data_val.shard(num_shards=24, index=0)

In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
import torch

checkpoint = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [5]:
max_length = 1024

def tokenize_function(data):

    model_inputs = tokenizer(
        data["article"],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    labels = tokenizer(
        data['abstract'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [6]:
tok = data_test.map(tokenize_function, batched=True)
tok_val = data_val.map(tokenize_function, batched=True)
tok = tok.remove_columns(
    data_test.column_names
)
tok_val = tok_val.remove_columns(
    data_val.column_names
)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
import evaluate
import nltk

nltk.download("punkt")
rouge_score = evaluate.load('rouge')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JustinDu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data is done preparing

In [8]:
tok.set_format('torch')
tok_val.set_format('torch')


In [9]:
# from huggingface_hub import notebook_login

# notebook_login()

In [10]:
from transformers import Seq2SeqTrainingArguments, Trainer
training_args = Seq2SeqTrainingArguments(
    output_dir='arxiv-accelerate',
    num_train_epochs=20,
    evaluation_strategy='epoch',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    fp16=True,
)

In [11]:
from nltk.tokenize import sent_tokenize
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects a newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [12]:
import torch
import numpy as np
import os
import socket
import torch_optimizer as optim
from accelerate import Accelerator
from accelerate.utils import LoggerType
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
from huggingface_hub import Repository
from transformers import get_scheduler

repo = Repository('BARTxiv', clone_from='kworts/BARTxiv')
device = torch.device('cuda')
current_time = datetime.now().strftime("%b%d_%H-%M-%S")
url = current_time + "_" + socket.gethostname()
writer = SummaryWriter(log_dir=f'BARTxiv/runs/{url}')


dataloader = DataLoader(
    tok, 
    batch_size=training_args.per_device_train_batch_size,
    collate_fn=data_collator
    )
dataloader_val = DataLoader(
    tok_val, 
    batch_size=training_args.per_device_train_batch_size,
    collate_fn=data_collator
    )

num_update_steps_per_epoch = len(dataloader)
num_training_steps = int(num_update_steps_per_epoch * training_args.num_train_epochs / training_args.gradient_accumulation_steps)
progress_bar = tqdm(range(num_training_steps))


if training_args.gradient_checkpointing:
    model.gradient_checkpointing_enable()

optimizer = optim.Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=3e-6)

accelerator = Accelerator(fp16=training_args.fp16)
model, optimizer, dataloader, dataloader_val = accelerator.prepare(model, optimizer, dataloader, dataloader_val)

for epoch in range(training_args.num_train_epochs):
    model.train()
    for step, batch in enumerate(dataloader, start=0):
        loss = model(**batch).loss
        loss = loss / training_args.gradient_accumulation_steps
        accelerator.backward(loss)
        if step % training_args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    writer.add_scalar(f'train/loss', loss.item(), epoch)
    model.eval()

    eval_bar = tqdm(range(len(dataloader_val)))

    for step, batch in enumerate(dataloader_val, start=0):

        batch = tuple(b.to(device) for b in batch.values())
        
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch[0],
                attention_mask=batch[1],
                max_new_tokens=1024,
            )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = batch[2]

            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch[2], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

            eval_bar.update(1)

    # Compute metrics
    result = rouge_score.compute()
    # Extract the median ROUGE scores
    result = {key: value * 100 for key, value in result.items()}
    for key, value in result.items():
        writer.add_scalar(f'eval/{key}', value, epoch)
    print(f"Epoch {epoch}:", result)
    print(f"Epoch {epoch} loss: ", loss)
    # accelerator.save_state('checkpoint') ,

c:\Users\JustinDu\Documents\GitHub\sf2023\BARTxiv is already a clone of https://huggingface.co/kworts/BARTxiv. Make sure you pull the latest changes with `repo.git_pull()`.


  0%|          | 0/21480 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/269 [00:00<?, ?it/s]

Epoch 0: {'rouge1': 39.9374998379588, 'rouge2': 13.601410726666902, 'rougeL': 21.443067586705077, 'rougeLsum': 35.79983961095115}
Epoch 0 loss:  tensor(0.7191, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 0/269 [00:00<?, ?it/s]

Epoch 1: {'rouge1': 41.16876106851064, 'rouge2': 14.703784631968752, 'rougeL': 22.409495597189608, 'rougeLsum': 36.935915127985524}
Epoch 1 loss:  tensor(0.6636, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 0/269 [00:00<?, ?it/s]

Epoch 2: {'rouge1': 41.65543775840364, 'rouge2': 14.963702922887496, 'rougeL': 22.691063150059456, 'rougeLsum': 37.39672585628282}
Epoch 2 loss:  tensor(0.5980, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 0/269 [00:00<?, ?it/s]

Epoch 3: {'rouge1': 41.41129411564528, 'rouge2': 14.87800082561363, 'rougeL': 22.63183365329553, 'rougeLsum': 37.15833697989158}
Epoch 3 loss:  tensor(0.5825, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 0/269 [00:00<?, ?it/s]

Epoch 4: {'rouge1': 41.72649494936845, 'rouge2': 15.027706038488601, 'rougeL': 22.89953676791904, 'rougeLsum': 37.35662947169073}
Epoch 4 loss:  tensor(0.5630, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 0/269 [00:00<?, ?it/s]

Epoch 5: {'rouge1': 41.59445622557503, 'rouge2': 14.739428209651495, 'rougeL': 22.81667528958453, 'rougeLsum': 37.076457384638296}
Epoch 5 loss:  tensor(0.5400, device='cuda:0', grad_fn=<DivBackward0>)


  0%|          | 0/269 [00:00<?, ?it/s]

Epoch 6: {'rouge1': 41.69963870827386, 'rouge2': 14.913768587731429, 'rougeL': 22.862963375002188, 'rougeLsum': 37.367994925166094}
Epoch 6 loss:  tensor(0.5319, device='cuda:0', grad_fn=<DivBackward0>)


KeyboardInterrupt: 

In [13]:
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained('BARTxiv', save_function=accelerator.save)
tokenizer.save_pretrained('BARTxiv')
repo.git_pull()
repo.push_to_hub(commit_message=f'Training completed ({len(tok)} examples, {epoch + 1} epochs)')

Clean file pytorch_model.bin:   0%|          | 1.00k/1.51G [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 1.00/1.51G [00:00<?, ?B/s]

Upload file runs/Mar22_14-28-05_DESKTOP-4P0HGA6/events.out.tfevents.1679520485.DESKTOP-4P0HGA6.20104.0:   0%| …

To https://huggingface.co/kworts/BARTxiv
   31db272..2e3bbf0  main -> main

   31db272..2e3bbf0  main -> main



'https://huggingface.co/kworts/BARTxiv/commit/2e3bbf043fe0d9c6b1845a312ced0299712bcb35'

In [14]:
from huggingface_hub import ModelCard, ModelCardData, EvalResult

card_data = ModelCardData(
    language='en',
    license='mit',
    tags=['summarization', 'bart'],
    datasets='ccdv/arxiv-summarization',
    library_name='transformers',
    eval_results=[
        EvalResult(
            task_type='summarization',
            dataset_type='ccdv/arxiv-summarization',
            dataset_name='arxiv-summarization',
            metric_type='rouge1',
            metric_value=result['rouge1'].item(),
            dataset_split='validation'
        ),
        EvalResult(
            task_type='summarization',
            dataset_type='ccdv/arxiv-summarization',
            dataset_name='arxiv-summarization',
            metric_type='rouge2',
            metric_value=result['rouge2'].item(),
            dataset_split='validation'
        ),
    ],
    model_name='BARTxiv',
)
card = ModelCard.from_template(card_data)

In [15]:
card.push_to_hub(commit_message='update model card', repo_id='kworts/BARTxiv')

'https://huggingface.co/kworts/BARTxiv/blob/main/README.md'