In [29]:
from datasets import load_dataset

dataset = load_dataset("ccdv/arxiv-summarization")
dataset

100%|██████████| 3/3 [00:00<00:00,  4.07it/s]


DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [30]:
data_train = dataset['train']
data_test = dataset['test']
data_val = dataset['validation']
data_test

Dataset({
    features: ['article', 'abstract'],
    num_rows: 6440
})

In [31]:
data_test = data_test.shard(num_shards=644, index=0)
data_val = data_val.shard(num_shards=644, index=0)

In [32]:
data_test

Dataset({
    features: ['article', 'abstract'],
    num_rows: 10
})

In [33]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

checkpoint = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [34]:
max_length = 1024

def tokenize_function(data):

    model_inputs = tokenizer(
        data["article"],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    labels = tokenizer(
        data['abstract'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["labels_mask"] = labels["attention_mask"]

    return model_inputs

In [35]:
tok = data_test.map(tokenize_function, batched=True)
tok_val = data_val.map(tokenize_function, batched=True)
tok



Dataset({
    features: ['article', 'abstract', 'input_ids', 'attention_mask', 'labels', 'labels_mask'],
    num_rows: 10
})

In [36]:
tok = tok.remove_columns(
    data_test.column_names
)

tok_val = tok_val.remove_columns(
    data_val.column_names
)

In [None]:
# features = [tok[i] for i in range(1000)]
# features_val = [tok_val[i] for i in range(len(tok_val))]

In [None]:
# data_collator(features)

In [None]:
# import torch
# x = features[0]['input_ids']
# x = torch.as_tensor([list(x)])

In [None]:
# outputs = model.generate(x, max_length=150, length_penalty=2.0, num_beams=4, early_stopping=True)

In [None]:
# print(tokenizer.decode(outputs[0]))

In [None]:
# # x = torch.as_tensor(tok['decoder_input_ids'])
# new_tok = tok.with_format(type='torch', columns=['article', 'abstract', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'])

In [37]:
import evaluate
import nltk


nltk.download("punkt")
rouge_score = evaluate.load('rouge')

[nltk_data] Downloading package punkt to C:\Users\Justin
[nltk_data]     Du\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
from nltk.tokenize import sent_tokenize

def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

In [39]:
print(three_sentence_summary(data_train[2]["article"]))

the transport properties of nonlinear non - equilibrium dynamical systems are far from well - understood@xcite .
consider in particular so - called ratchet systems which are asymmetric periodic potentials where an ensemble of particles experience directed transport@xcite .
the origins of the interest in this lie in considerations about extracting useful work from unbiased noisy fluctuations as seems to happen in biological systems@xcite .


In [40]:
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["article"]]
    return metric.compute(predictions=summaries, references=dataset["abstract"])

In [41]:
import pandas as pd

score = evaluate_baseline(data_val, rouge_score)


In [42]:
score

{'rouge1': 0.25354994840029366,
 'rouge2': 0.06661585401242887,
 'rougeL': 0.1414135166070295,
 'rougeLsum': 0.22079415649171222}

In [43]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(score[rn]* 100, 2)) for rn in rouge_names)
rouge_dict

{'rouge1': 25.35, 'rouge2': 6.66, 'rougeL': 14.14, 'rougeLsum': 22.08}

In [62]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 4
logging_steps = len(data_test)
training_args = Seq2SeqTrainingArguments(
    output_dir='test-trainer',
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    label_names=['labels']
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [57]:
import numpy as np
from nltk.tokenize import sent_tokenize


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value* 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [65]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tok,
    eval_dataset=tok_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [59]:
print(tok)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'labels_mask'],
    num_rows: 10
})


In [66]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: labels_mask. If labels_mask are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 8
 25%|██▌       | 2/8 [04:17<11:23, 113.90s/it]The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: labels_mask. If labels_mask are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 8

[A
[A
                                              

 25%|██▌       | 2/8 [07:30<11:23, 113.90s/it]


{'eval_loss': 4.822267055511475, 'eval_rouge1': 36.8509, 'eval_rouge2': 10.7281, 'eval_rougeL': 19.3447, 'eval_rougeLsum': 33.3997, 'eval_runtime': 193.2569, 'eval_samples_per_second': 0.052, 'eval_steps_per_second': 0.01, 'epoch': 1.0}


 50%|█████     | 4/8 [11:33<10:53, 163.33s/it]The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: labels_mask. If labels_mask are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 8

[A
[A
                                              

 50%|█████     | 4/8 [15:00<10:53, 163.33s/it]
[A
[A

{'eval_loss': 3.1498770713806152, 'eval_rouge1': 32.9526, 'eval_rouge2': 9.3428, 'eval_rougeL': 18.597, 'eval_rougeLsum': 30.7218, 'eval_runtime': 206.5436, 'eval_samples_per_second': 0.048, 'eval_steps_per_second': 0.01, 'epoch': 2.0}


 75%|███████▌  | 6/8 [19:03<06:01, 180.68s/it]The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: labels_mask. If labels_mask are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 8

[A
[A
                                              

 75%|███████▌  | 6/8 [22:28<06:01, 180.68s/it]
[A
[A

{'eval_loss': 2.4779257774353027, 'eval_rouge1': 34.2367, 'eval_rouge2': 9.8346, 'eval_rougeL': 18.3263, 'eval_rougeLsum': 31.7046, 'eval_runtime': 205.1718, 'eval_samples_per_second': 0.049, 'eval_steps_per_second': 0.01, 'epoch': 3.0}


100%|██████████| 8/8 [26:46<00:00, 190.43s/it]The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: labels_mask. If labels_mask are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 8

[A
[A
                                              

100%|██████████| 8/8 [30:11<00:00, 190.43s/it]
[A
[A

Training completed. Do not forget to share your model on huggingface.co/models =)



100%|██████████| 8/8 [30:11<00:00, 226.49s/it]

{'eval_loss': 2.208855390548706, 'eval_rouge1': 35.526, 'eval_rouge2': 11.3758, 'eval_rougeL': 20.0079, 'eval_rougeLsum': 32.9457, 'eval_runtime': 204.8976, 'eval_samples_per_second': 0.049, 'eval_steps_per_second': 0.01, 'epoch': 4.0}
{'train_runtime': 1812.0976, 'train_samples_per_second': 0.022, 'train_steps_per_second': 0.004, 'train_loss': 3.8960304260253906, 'epoch': 4.0}





TrainOutput(global_step=8, training_loss=3.8960304260253906, metrics={'train_runtime': 1812.0976, 'train_samples_per_second': 0.022, 'train_steps_per_second': 0.004, 'train_loss': 3.8960304260253906, 'epoch': 4.0})

In [67]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: labels_mask. If labels_mask are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 8
100%|██████████| 2/2 [00:49<00:00, 24.66s/it]


{'eval_loss': 2.208855390548706,
 'eval_rouge1': 35.526,
 'eval_rouge2': 11.3758,
 'eval_rougeL': 20.0079,
 'eval_rougeLsum': 32.9457,
 'eval_runtime': 210.1974,
 'eval_samples_per_second': 0.048,
 'eval_steps_per_second': 0.01,
 'epoch': 4.0}

In [75]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

In [None]:
# samples = tok['test'][:2000]
# samples = {k: v for k, v in samples.items() if k not in ['abstract', 'article']}
# batch = data_collator(samples)

In [None]:
# chunk = 500
# tok_abs = []

# for i in (data_test[pos:pos + chunk] for pos in range(0, len(data_test), chunk)):
#     tok_abs.append(tokenizer(i['abstract'], truncation=True, padding=True, return_tensors='tf'))


In [None]:
# tok_art = []
# for i in (data_test[pos:pos + chunk] for pos in range(0, len(data_test), chunk)):
#     tok_art.append(tokenizer(i['article'], truncation=True, ))

In [None]:
# import torch
# tok = torch.cat(tok_abs, dim=1)


In [None]:
# test_tok_abs = tokenizer(data_train['article'], truncation=True, padding='max_length')