In [1]:
from datasets import load_dataset

dataset = load_dataset("ccdv/arxiv-summarization")
dataset

  from .autonotebook import tqdm as notebook_tqdm
No config specified, defaulting to: arxiv-summarization/section
Found cached dataset arxiv-summarization (C:/Users/Justin Du/.cache/huggingface/datasets/ccdv___arxiv-summarization/section/1.0.0/fa2c9abf4312afb8660ef8e041d576b8e3943ea96ae771bd3cd091b5798e7cc3)
100%|██████████| 3/3 [00:00<00:00,  5.33it/s]


DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [2]:
from datasets.dataset_dict import DatasetDict

data_train = dataset['train']
data_test = DatasetDict({'validation': dataset['validation'], 'test': dataset['test']})

In [3]:
data_test

DatasetDict({
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [4]:
from transformers import AutoTokenizer

checkpoint = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
max_length = 1024

def tokenize_function(data):
    model_inputs = tokenizer(
        data["article"],
        truncation=True,
        max_length=max_length
    )

    labels = tokenizer(
        data['abstract'],
        truncation=True,
        max_length=max_length
    )

    model_inputs["decoder_input_ids"] = labels["input_ids"]
    model_inputs["decoder_attention_mask"] = labels["attention_mask"]
    return model_inputs

In [6]:
tok = data_test.map(tokenize_function, batched=True)

 86%|████████▌ | 6/7 [00:24<00:04,  4.11s/ba]
 86%|████████▌ | 6/7 [00:24<00:04,  4.08s/ba]


In [7]:
tok

DatasetDict({
    validation: Dataset({
        features: ['article', 'abstract', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
        num_rows: 6440
    })
})

In [8]:
import evaluate
import nltk
rouge_score = evaluate.load('rouge')
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Justin
[nltk_data]     Du\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [10]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
logging_steps = len(data_test["validation"])
training_args = Seq2SeqTrainingArguments(
    output_dir='test-trainer',
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps
)

In [11]:
from transformers import AutoModel
model = AutoModel.from_pretrained(checkpoint)

In [12]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [13]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [14]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tok['test'],
    eval_dataset=tok['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartModel.forward` and have been ignored: abstract, article. If abstract, article are not expected by `BartModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6440
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6440
  0%|          | 0/6440 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`decoder_input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# samples = tok['test'][:2000]
# samples = {k: v for k, v in samples.items() if k not in ['abstract', 'article']}
# batch = data_collator(samples)

In [None]:
# chunk = 500
# tok_abs = []

# for i in (data_test[pos:pos + chunk] for pos in range(0, len(data_test), chunk)):
#     tok_abs.append(tokenizer(i['abstract'], truncation=True, padding=True, return_tensors='tf'))


In [None]:
# tok_art = []
# for i in (data_test[pos:pos + chunk] for pos in range(0, len(data_test), chunk)):
#     tok_art.append(tokenizer(i['article'], truncation=True, ))

In [None]:
# import torch
# tok = torch.cat(tok_abs, dim=1)


In [None]:
# test_tok_abs = tokenizer(data_train['article'], truncation=True, padding='max_length')