In [1]:
from transformers import AutoTokenizer
from datasets import load_from_disk
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from transformers import AutoConfig
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
billsum = load_from_disk("billsum")['train']
billsum = billsum.remove_columns(['Unnamed: 0', 'title'])

In [3]:
billsum = billsum.train_test_split(test_size=0.2)
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 15159
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 3790
    })
})

In [4]:
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
prefix = "summarize: "


def preprocess_function(examples):
    # Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
# T5ForConditionalGeneration forward函数函数签名:
'''
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    decoder_input_ids: Optional[torch.LongTensor] = None,
    decoder_attention_mask: Optional[torch.BoolTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    decoder_head_mask: Optional[torch.FloatTensor] = None,
    cross_attn_head_mask: Optional[torch.Tensor] = None,
    encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
    past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
'''
tokenized_billsum = billsum.map(preprocess_function, batched=True)
tokenized_billsum

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15159
    })
    test: Dataset({
        features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3790
    })
})

In [7]:
# 相当于torch.utils.data.DataLoader中collate_fn的作用(可以重写,参考K_demo/way_of_training/pytorch_transformer.ipynb)
# Data collator that will dynamically pad the inputs received, as well as the labels.
'''
model ([`PreTrainedModel`]):
    The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
    prepare the *decoder_input_ids*

    This is useful when using *label_smoothing* to avoid calculating loss twice.
'''
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [8]:
rouge = evaluate.load("rouge")
rouge.description

'ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for\nevaluating automatic summarization and machine translation software in natural language processing.\nThe metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation.\n\nNote that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters.\n\nThis metrics is a wrapper around Google Research reimplementation of ROUGE:\nhttps://github.com/google-research/google-research/tree/master/rouge\n'

In [9]:
def compute_metrics(eval_pred):
    # predictions.shape=[batch_size, max(该批次生成句子长度)]
    # labels.shape=[batch_size, max(该批次句子长度)]
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [10]:
config = AutoConfig.from_pretrained(checkpoint)
config

'HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /t5-small/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5ed3f27520>, 'Connection to huggingface.co timed out. (connect timeout=10)'))' thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json


T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefi

In [11]:
# 从头训练
model = AutoModelForSeq2SeqLM.from_config(config)

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model_t5",
    save_total_limit=1,
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=10,

    # 区别于TrainingArguments特有参数:
    # predict_with_generate (bool, optional, defaults to False) — Whether to use generate to calculate generative metrics (ROUGE, BLEU).
    predict_with_generate=True,
    # generation_max_length (int, optional) — The max_length to use on each evaluation loop when predict_with_generate=True. Will default to the max_length value of the model configuration.
    generation_max_length=32,  # 生成的最大长度
    # generation_num_beams (int, optional) — The num_beams to use on each evaluation loop when predict_with_generate=True. Will default to the num_beams value of the model configuration.
    generation_num_beams=4  # 集束搜索
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15159
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9480
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,7.8561,7.113714,0.169,0.0498,0.1525,0.1526,19.0
1000,7.0732,6.748777,0.1606,0.0489,0.1382,0.1383,19.0
1500,6.7803,6.520914,0.1524,0.0505,0.1351,0.1351,19.0
2000,6.5956,6.331944,0.1411,0.045,0.1251,0.1251,19.0
2500,6.4164,6.184151,0.157,0.0484,0.1358,0.1358,19.0
3000,6.2709,6.062525,0.1743,0.0614,0.1485,0.1485,19.0
3500,6.1622,5.950559,0.152,0.0521,0.1319,0.1319,19.0
4000,6.0585,5.857097,0.1621,0.0572,0.1393,0.1393,18.9979
4500,5.965,5.774234,0.1226,0.0449,0.1077,0.1077,19.0
5000,5.8862,5.702372,0.1424,0.054,0.1251,0.125,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If summary, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3790
  Batch size = 16
Saving model checkpoint to my_awesome_billsum_model_t5/checkpoint-500
Configuration saved in my_awesome_billsum_model_t5/checkpoint-500/config.json
Model weights saved in my_awesome_billsum_model_t5/checkpoint-500/pytorch_model.bin
tokenizer config file saved in my_awesome_billsum_model_t5/checkpoint-500/tokenizer_config.json
Special tokens file saved in my_awesome_billsum_model_t5/checkpoint-500/special_tokens_map.json
Copy vocab file to my_awesome_billsum_model_t5/checkpoint-500/spiece.model
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, text. If

TrainOutput(global_step=9480, training_loss=6.101312281612605, metrics={'train_runtime': 5857.4807, 'train_samples_per_second': 25.88, 'train_steps_per_second': 1.618, 'total_flos': 4.103292737028096e+16, 'train_loss': 6.101312281612605, 'epoch': 10.0})

In [13]:
# T5ForConditionalGeneration
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [17]:
# inference
# generate具体参数参考:huggingface GenerationConfig类
inputs = tokenizer(
    "summarize: Shields a business entity from civil liability relating to any injury or death occurring at a facility of that entity in connection with a use of such facility by a nonprofit organization if: (1) the use occurs outside the scope of business of the business entity; (2) such injury or death occurs during a period that such facility is used by such organization; and (3) the business entity authorized the use of such facility by the organization. Makes this Act inapplicable to an injury or death that results from an act or omission of a business entity that constitutes gross negligence or intentional misconduct, including misconduct that: (1) constitutes a hate crime or a crime of violence or act of international terrorism for which the defendant has been convicted in any court; or (2) involves a sexual offense for which the defendant has been convicted in any court or misconduct for which the defendant has been found to have violated a Federal or State civil rights law. Preempts State laws to the extent that such laws are inconsistent with this Act, except State law that provides additional protection from liability. Specifies that this Act shall not be construed to supersede any Federal or State health or safety law. Makes this Act inapplicable to any civil action in a State court against a business entity in which all parties are citizens of the State if such State, citing this Act's authority and containing no other provision, enacts a statute declaring the State's election that this Act shall not apply to such action in the State.",
    return_tensors="pt").input_ids
inputs = inputs.to(model.device)
'''
repetition_penalty (`float`, *optional*, defaults to 1.0):
    The parameter for repetition penalty. 1.0 means no penalty. See [this
    paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
'''
outputs = model.generate(inputs,
                         repetition_penalty=1.0,
                         # The minimum length of the sequence to be generated.默认min_length=0
                         min_length=0,
                         # The maximum length the generated tokens can have.默认max_length=20
                         max_length=50)  # 贪心搜索
outputs

tensor([[   0,  736,  989,    7,    8, 5034, 4336, 1081,   12,    3,    9, 1015,
            3,    9,  568,    3,    9,    3,    9,    3,    9,    3,    9, 1015,
           42,    3,    9,    3,    9,    3,    9,    3,    9,    3,    9,    3,
            9,    3,    9,    3,    9,    3,    9,    3,    9,  568,    3,    9,
          568,   24]], device='cuda:0')

In [18]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Amends the Federal criminal code to a State a person a a a a State or a a a a a a a a a a person a person that


In [19]:
# 集束搜索:选择了概率最大的前k个(本质上也是贪心的思想,只不过它考虑了更多的候选搜索空间,因此可以得到更多的生成结果)
'''
length_penalty (`float`, *optional*, defaults to 1.0):
    Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
    the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
    likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
    `length_penalty` < 0.0 encourages shorter sequences.
'''
print(tokenizer.decode(model.generate(inputs,
                                      length_penalty=1.0,
                                      # Number of beams for beam search. 1 means no beam search.默认num_beams=1
                                      num_beams=5)[0], skip_special_tokens=True))

(Sec. 3) Directs the Secretary of the Interior to provide for a
