In [2]:
import pandas as pd
from tqdm import tqdm
import torch
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
dataset_name = 'dmacres/mimiciii-hospitalcourse-cossim-pagerank-batched-extractive-summ-v2'
mimiciii_dataset = load_dataset(dataset_name)
mimiciii_dataset

Downloading readme:   0%|          | 0.00/886 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/107M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/5356 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5356 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/24993 [00:00<?, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['subject_id', 'hadm_id', 'target_text', 'extractive_notes_summ', 'n_notes'],
        num_rows: 5356
    })
    test: Dataset({
        features: ['subject_id', 'hadm_id', 'target_text', 'extractive_notes_summ', 'n_notes'],
        num_rows: 5356
    })
    train: Dataset({
        features: ['subject_id', 'hadm_id', 'target_text', 'extractive_notes_summ', 'n_notes'],
        num_rows: 24993
    })
})

In [6]:
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [7]:
model_ckpt = "google/pegasus-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [8]:
def chunks(list_of_elements, batch_size):
    """Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               note_text="extractive_notes_summ",
                               note_summary="target_text"):
    note_batches = list(chunks(dataset[note_text], batch_size))
    target_batches = list(chunks(dataset[note_summary], batch_size))

    for note_batch, target_batch in tqdm(
        zip(note_batches, target_batches), total=len(note_batches)):

        inputs = tokenizer(note_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score

In [9]:
#hide_output
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["extractive_notes_summ"], max_length=1024,
                                truncation=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["target_text"], max_length=128,
                                     truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}


In [10]:

columns = ["input_ids", "attention_mask", "labels"]

mimiciii_dataset_pt = mimiciii_dataset.map(convert_examples_to_features,
                                       batched=True)

mimiciii_dataset_pt.set_format(type="torch", columns=columns)


Map:   0%|          | 0/5356 [00:00<?, ? examples/s]



Map:   0%|          | 0/5356 [00:00<?, ? examples/s]

Map:   0%|          | 0/24993 [00:00<?, ? examples/s]

In [11]:
mimiciii_dataset_pt

DatasetDict({
    validation: Dataset({
        features: ['subject_id', 'hadm_id', 'target_text', 'extractive_notes_summ', 'n_notes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5356
    })
    test: Dataset({
        features: ['subject_id', 'hadm_id', 'target_text', 'extractive_notes_summ', 'n_notes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5356
    })
    train: Dataset({
        features: ['subject_id', 'hadm_id', 'target_text', 'extractive_notes_summ', 'n_notes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 24993
    })
})

In [12]:
mimiciii_dataset_pt['train'][0]

{'input_ids': tensor([  353,  2085,   181,  ..., 10373, 11301,     1]),
 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]),
 'labels': tensor([15216,   140,  6305,   112,   109, 87903,   292,   111, 18793,   142,
         39979,   840, 86106, 54284,   122, 10053,   113,   169,  2672,   252,
         17696, 55256,   107,  2355, 15362,   445,   178,   140,  5766,   112,
           109,  7934,   406,  1451,  4026, 28307,  8389,   111,  3908,   107,
           285,   140,  4099,   124,  4218, 30517,   386,   118, 23725, 74892,
           108,   169,  1458,  1367,   140,  3922,   111,   178,   140, 10129,
           122,   686, 12263,  5723,   107,   285,   140,   163,  4099,   124,
         17354,   454,   134, 33823,   118,  5498,   113,  3170,  9644,  2635,
          1532,   131,   116,  1225,  3818,   689,   107,   202, 42459,  6773,
          5499,  5994, 42459,   852,   111,   142,  2757,   115,   169,  2672,
           252, 17696,   107,   285,  4615,  3908,   108,   169,  2330,   1

In [13]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='../pegasus-large-mimiciii-v2', num_train_epochs=3, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10, push_to_hub=True,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16)

In [15]:
# hide_output
trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=mimiciii_dataset_pt['train'],
                  eval_dataset=mimiciii_dataset_pt['validation'])

In [16]:
# hide_output
trainer.train()


You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,3.5001,3.344688
1000,3.1713,3.076767
1500,3.1364,2.957438
2000,3.1029,2.892024
2500,2.775,2.845342
3000,2.8836,2.811464
3500,2.9035,2.79142
4000,2.836,2.779357
4500,2.777,2.772719


TrainOutput(global_step=4686, training_loss=3.100408427897448, metrics={'train_runtime': 21276.3841, 'train_samples_per_second': 3.524, 'train_steps_per_second': 0.22, 'total_flos': 2.0227264751569306e+17, 'train_loss': 3.100408427897448, 'epoch': 3.0})

In [17]:
trainer.push_to_hub("Training complete!")

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

events.out.tfevents.1700053754.dae270b4f28d.730.0:   0%|          | 0.00/82.9k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

'https://huggingface.co/dmacres/pegasus-large-mimiciii-v2/tree/main/'

In [18]:
model_ckpt_cust = "../pegasus-large-mimiciii-v2"
tokenizer_cust = AutoTokenizer.from_pretrained(model_ckpt)
model_cust = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

test_sample = mimiciii_dataset['test'].shuffle(seed = 42).select([3])
test_sample_text = test_sample['extractive_notes_summ']
print(test_sample_text)
test_sample_target = test_sample['target_text']
print('\n\n\n\n')
print(test_sample_target)

inputs = tokenizer_cust(test_sample_text, max_length=1024,  truncation=True,
                padding="max_length", return_tensors="pt")


summaries = model_cust.generate(input_ids=inputs["input_ids"].to(device),
                 attention_mask=inputs["attention_mask"].to(device),
                 length_penalty=0.8, num_beams=8, max_length=1024)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['There is some patchy opacity at the right lung base -- ? Again seen is opacification of left hemithorax. Sternotomy wires and riht paratracheal/suprahilar sutures are noted. Possible prior inferior myocardial infarction. Premature ventricularcontractions. Opacity at the right cardiophrenic angle could reflect a small effusion. The left hemithorax is opacified, with, as noted, shift of the mediastinum. FINDINGS:  The endotracheal tube, NG tube, right central line and the left pneumonectomy site appear unchanged. The extreme right costophrenic angle is excluded from the film. chest, 1 vw The patient is status post sternotomy. NG tube present, tip extending beneath diaphragm off film. Rotated positioning, which limits assessment of the central line tip. An NG tube is present, tip extending beneath the diaphragm. The right chest shows some atelectasis, but is otherwise grossly clear. A right IJ central line is present, tip probably overlies the SVC, though difficult to confirm due to lef

In [19]:
decoded_summaries = [tokenizer_cust.decode(s, skip_special_tokens=True,
                        clean_up_tokenization_spaces=True)
       for s in summaries]
decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
decoded_summaries

['FINDINGS: The endotracheal tube, NG tube, right central line and the left pneumonectomy site appear unchanged. NG tube present, tip extending beneath diaphragm off film. A right IJ central line is present, tip probably overlies the SVC, though difficult to confirm due to leftward shift of the mediastinum and rotated positioning. IMPRESSION: 1) Right IJ central line tip may overlie the distal SVC, but exact location cannot be confirmed on this film due to distortion of usual thoracic anatomy.']

In [20]:
score = evaluate_summaries_pegasus(
    mimiciii_dataset['test'], rouge_metric, trainer.model, tokenizer,
    batch_size=2, note_text="extractive_notes_summ", note_summary="target_text")

rouge_methods = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_dict = dict((rm, score[rm].mid.fmeasure) for rm in rouge_methods)
pd.DataFrame(rouge_dict, index=[f"pegasus-large"])

100%|██████████| 2678/2678 [3:06:09<00:00,  4.17s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus-large,0.004913,0.002295,0.003949,0.003925


In [21]:
rouge_methods = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_dict = dict((rm, score[rm].mid.fmeasure) for rm in rouge_methods)
pd.DataFrame(rouge_dict, index=[f"pegasus-large"]).to_csv('pegasus-large-mimiciii-v2-rogue-metrics.csv', index = False)