In [1]:
import glob, os

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

from pathlib import Path
from tqdm import tqdm

import easydict

from finetune import SummarizationTrainer

In [2]:
args = easydict.EasyDict({
    "adam_epsilon":1e-08,
    "cache_dir":'',
    "config_name":'',
    "data_dir":'./data/cnn_dm/cnn_dm',
    "do_lower_case":False,
    "do_predict":False,
    "do_train":True,
    "eval_batch_size":4,
    "fp16":False,
    "fp16_opt_level":'O1',
    "gradient_accumulation_steps":1,
    "learning_rate":3e-05,
    "max_grad_norm":1.0,
    "max_source_length":1024,
    "max_target_length":56,
    "model_name_or_path":'t5-small',
    "model_type":'t5',
    "n_gpu":1,
    "n_tpu_cores":0,
    "num_train_epochs":3,
    "output_dir":'./output',
    "seed":42,
    "server_ip":'',
    "server_port":'',
    "tokenizer_name":'',
    "train_batch_size":4,
    "warmup_steps":0,
    "weight_decay":0.0
})

In [3]:
trainer = SummarizationTrainer(args)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json from cache at C:\Users\Daeyoung\.cache\torch\transformers\26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.75aa1adfaef9acd7cb4b5e3aca5831c9b362b5940c2cc86a1f2dc353283117b0
INFO:transformers.configuration_utils:Model config T5Config {
  "_num_labels": 2,
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "bad_words_ids": null,
  "bos_token_id": null,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "do_sample": false,
  "dropout_rate": 0.1,
  "early_stopping": false,
  "eos_token_id": 1,
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_factor": 1.0,
  "is_decoder": false,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_epsilon": 1e-06,
  "length_penalty": 1.0,
  "max_length": 20,
  "min_length": 0,
 

In [4]:
# Finetuned weight
checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "checkpointepoch=*.ckpt"), recursive=True)))
checkpoints

['./output\\checkpointepoch=0.ckpt',
 './output\\checkpointepoch=1.ckpt',
 './output\\checkpointepoch=2.ckpt']

In [5]:
text_input = './data/CORD19/abstract_ndq.txt'

source_lns = [x.rstrip() for x in open(text_input, encoding="UTF8").readlines()]

In [7]:
device = "cuda"

for i in range(150):
    dct = trainer.tokenizer.batch_encode_plus([source_lns[i]], max_length=512, return_tensors="pt", pad_to_max_length=True)
    input_ids = dct["input_ids"].to(device)
    dec = [trainer.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in input_ids][0]
    src = source_lns[i]
    if len(src) - len(dec) > 15:
        print(i, end=" ")
        print("[diff]: ", len(src)-len(dec))
        del source_lns[i]

print(len(source_lns))

8 [diff]:  227
13 [diff]:  777
15 [diff]:  300
30 [diff]:  248
35 [diff]:  1263
128 [diff]:  255
130 [diff]:  231
131 [diff]:  34
35798


In [8]:
source_lns = source_lns[:100]

In [9]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]

In [10]:
# From Not Finetuned Model
trainer.model.to(device)

batch_size = 8
max_length = 140
min_length = 55

output_file = Path('./data/CORD19/no_ft_generated_summary.txt').open("w", encoding="UTF8")
for batch in tqdm(list(chunks(source_lns, batch_size))):
    batch = ["summarize: " + text for text in batch]
    
    dct = trainer.tokenizer.batch_encode_plus(batch, max_length=512, return_tensors="pt", pad_to_max_length=True)
    input_ids = dct["input_ids"].to(device)
    attention_mask = dct["attention_mask"].to(device)
    
    summaries = trainer.model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=4,
        length_penalty=2.0,
        max_length=max_length + 2,  # +2 from original because we start at step=1 and stop before max_length
        min_length=min_length + 1,  # +1 from original because we start at step=1
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    dec = [trainer.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
    
    for hypothesis in dec:
        output_file.write('.'.join(hypothesis.split('.')[:-1]) + '.' + "\n")
        output_file.flush()

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [02:21<00:00, 10.89s/it]


In [11]:
# From finetuned model
for i, ckpt in enumerate(checkpoints):
    trainer = SummarizationTrainer.load_from_checkpoint(checkpoints[i])
    trainer.model.to("cuda")
    
    output_file = Path(f"./data/CORD19/ft_ckpt_{str(i)}_generated_summary.txt").open("w", encoding="UTF8")
    for batch in tqdm(list(chunks(source_lns, batch_size))):
        batch = ["summarize: " + text for text in batch]

        dct = trainer.tokenizer.batch_encode_plus(batch, max_length=512, return_tensors="pt", pad_to_max_length=True)
        input_ids = dct["input_ids"].to(device)
        attention_mask = dct["attention_mask"].to(device)

        summaries = trainer.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=4,
            length_penalty=2.0,
            max_length=max_length + 2,  # +2 from original because we start at step=1 and stop before max_length
            min_length=min_length + 1,  # +1 from original because we start at step=1
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
        dec = [trainer.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]

        for hypothesis in dec:
            output_file.write('.'.join(hypothesis.split('.')[:-1]) + '.' + "\n")
            output_file.flush()

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json from cache at C:\Users\Daeyoung\.cache\torch\transformers\26561bc9e840d8945f475d0d4c4b9df32025eadd79894b867b570cb1d09e67a9.75aa1adfaef9acd7cb4b5e3aca5831c9b362b5940c2cc86a1f2dc353283117b0
INFO:transformers.configuration_utils:Model config T5Config {
  "_num_labels": 2,
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "bad_words_ids": null,
  "bos_token_id": null,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "do_sample": false,
  "dropout_rate": 0.1,
  "early_stopping": false,
  "eos_token_id": 1,
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_factor": 1.0,
  "is_decoder": false,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_epsilon": 1e-06,
  "length_penalty": 1.0,
  "max_length": 20,
  "min_length": 0,
 