In [1]:
from datasets import load_dataset

dataset = load_dataset("ccdv/arxiv-summarization")
dataset

  from .autonotebook import tqdm as notebook_tqdm
No config specified, defaulting to: arxiv-summarization/section
Found cached dataset arxiv-summarization (C:/Users/Justin Du/.cache/huggingface/datasets/ccdv___arxiv-summarization/section/1.0.0/fa2c9abf4312afb8660ef8e041d576b8e3943ea96ae771bd3cd091b5798e7cc3)
100%|██████████| 3/3 [00:00<00:00,  5.37it/s]


DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [2]:
data_train = dataset['train']
data_test = dataset['test']
data_val = dataset['validation']
data_test

Dataset({
    features: ['article', 'abstract'],
    num_rows: 6440
})

In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

checkpoint = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [4]:
max_length = 1024

def tokenize_function(data):

    model_inputs = tokenizer(
        data["article"],
        truncation=True,
        padding=True,
        max_length=max_length
    )

    labels = tokenizer(
        data['abstract'],
        truncation=True,
        padding=True,
        max_length=max_length
    )
    model_inputs["decoder_input_ids"] = labels["input_ids"]
    model_inputs["decoder_attention_mask"] = labels["attention_mask"]

    return model_inputs

In [5]:
tok = data_test.map(tokenize_function, batched=True)
tok

Loading cached processed dataset at C:/Users/Justin Du/.cache/huggingface/datasets/ccdv___arxiv-summarization/section/1.0.0/fa2c9abf4312afb8660ef8e041d576b8e3943ea96ae771bd3cd091b5798e7cc3\cache-658232b161c5e4b8.arrow


Dataset({
    features: ['article', 'abstract', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
    num_rows: 6440
})

In [6]:
tok = tok.remove_columns(['abstract', 'article'])

In [7]:
features = [tok[i] for i in range(1000)]

In [18]:
data_collator(features)

{'input_ids': tensor([[    0,  1990,    59,  ...,    32,  4756,     2],
        [    0,   405,    16,  ...,   143,   787,     2],
        [    0,   281,    10,  ...,  1437,  1437,     2],
        ...,
        [    0,  8634,     9,  ...,  1437,  1437,     2],
        [    0,  5481,   111,  ..., 50118,     5,     2],
        [    0, 45355,  2372,  ...,     5,  1368,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'decoder_input_ids': tensor([[    0,   627,   765,  ...,     1,     1,     1],
        [    0,  1694,   892,  ...,     1,     1,     1],
        [    0, 34845,    31,  ...,     1,     1,     1],
        ...,
        [    0,   879, 10162,  ...,     1,     1,     1],
        [    0,  1694,   266,  ...,     1,     1,     1],
        [    0, 17143, 33463,  ...,     1,     1,     1]]), 'd

In [31]:
import torch
x = features[0]['input_ids']
x = torch.as_tensor([list(x)])

In [32]:
outputs = model.generate(x, max_length=150, length_penalty=2.0, num_beams=4, early_stopping=True)

In [33]:
print(tokenizer.decode(outputs[0]))

</s><s>The problem of properties of short - term changes of solar activity has been considered extensively. The periodicities about 155 days and from the interval of @xmath3 $ ] days are mentioned most often. The existence of this periodicity is not understood yet. The authors concluded that the length of the period is variable and this period is not the reason of this problem.</s>


In [9]:
# # x = torch.as_tensor(tok['decoder_input_ids'])
# new_tok = tok.with_format(type='torch', columns=['article', 'abstract', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'])

In [11]:
import evaluate
import nltk


nltk.download("punkt")
rouge_score = evaluate.load('rouge')

[nltk_data] Downloading package punkt to C:\Users\Justin
[nltk_data]     Du\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
logging_steps = len(data_test)
training_args = Seq2SeqTrainingArguments(
    output_dir='test-trainer',
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    label_names='decoder_input_ids'
)

In [13]:
import numpy as np
from nltk.tokenize import sent_tokenize


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [14]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=features,
    # eval_dataset=features_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [16]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1000
  0%|          | 0/1000 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values,encoder_last_hidden_state. For reference, the inputs it received are input_ids,attention_mask,decoder_input_ids,decoder_attention_mask.

In [None]:
# samples = tok['test'][:2000]
# samples = {k: v for k, v in samples.items() if k not in ['abstract', 'article']}
# batch = data_collator(samples)

In [None]:
# chunk = 500
# tok_abs = []

# for i in (data_test[pos:pos + chunk] for pos in range(0, len(data_test), chunk)):
#     tok_abs.append(tokenizer(i['abstract'], truncation=True, padding=True, return_tensors='tf'))


In [None]:
# tok_art = []
# for i in (data_test[pos:pos + chunk] for pos in range(0, len(data_test), chunk)):
#     tok_art.append(tokenizer(i['article'], truncation=True, ))

In [None]:
# import torch
# tok = torch.cat(tok_abs, dim=1)


In [None]:
# test_tok_abs = tokenizer(data_train['article'], truncation=True, padding='max_length')