In [21]:
import os, warnings, torch, json, random, gc
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments, 
    pipeline
)
from datasets import load_dataset, concatenate_datasets
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    TaskType,
    PeftModel,
    PeftConfig
)
from langchain.llms import HuggingFacePipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document

warnings.filterwarnings("ignore")

os.environ["LLM_REPOSITORY"] = "google/flan-t5-small"
os.environ["TOKENIZER_REPOSITORY"] = "google/flan-t5-small"
os.environ["MAX_TOKENS"] = "4096"
os.environ["DEVICE"] = "cuda:0" if torch.cuda.is_available() else "cpu"
os.environ["DATASET_PATH"] = "data/doc_summary_data"
os.environ["TOKENS_DATA_PATH"] = "data/doc_summary_tokens"
os.environ["SUMMARY_DATA_PATH"] = "data/doc_summary_pair.json"
os.makedirs(os.environ["DATASET_PATH"], exist_ok=True)
os.makedirs(os.environ["TOKENS_DATA_PATH"], exist_ok=True)

In [2]:
TRAIN_SIZE = 0.8
VALIDATION_SIZE = 0.1
TEST_SIZE = 0.1

with open(os.environ["SUMMARY_DATA_PATH"]) as f:
    doc_summary_data = json.load(f)
f.close()

train_size = int(len(doc_summary_data) * TRAIN_SIZE)
val_size = int(len(doc_summary_data) * VALIDATION_SIZE)
test_size = int(len(doc_summary_data) * TEST_SIZE)

train_data = doc_summary_data[:train_size]
val_data = doc_summary_data[train_size:train_size+val_size]
test_data = doc_summary_data[train_size+val_size:]

data_list = [
    ("train", train_data),
    ("validation", val_data),
    ("test", test_data),
]

for data_tuple in data_list:
    with open(os.path.join(os.environ["DATASET_PATH"], f"{data_tuple[0]}.json"), "w") as f:
        json.dump(data_tuple[1], f, indent=4)
    f.close()

del doc_summary_data, train_data, val_data, test_data, data_list, data_tuple, train_size, val_size, test_size
gc.collect()

99

In [3]:
dataset = load_dataset(path=os.environ["DATASET_PATH"])
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")
print(f"Test dataset size: {len(dataset['test'])}")
dataset["train"][random.randint(0, len(dataset["train"]))]

Downloading and preparing dataset json/doc_summary_data to /home/ubuntu/.cache/huggingface/datasets/json/doc_summary_data-a190e415bec51eec/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/doc_summary_data-a190e415bec51eec/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Train dataset size: 1507
Validation dataset size: 188
Test dataset size: 189


{'id': 133,
 'document': 'PURPOSE: This randomized phase III trial is studying chemotherapy and pelvic radiation therapy to see how well they work when given with or without additional chemotherapy in treating patients with high-risk early-stage cervical cancer after radical hysterectomy.\nStudy Results: NO\nConditions: Cervical Cancer\nInterventions: DRUG: carboplatin|DRUG: cisplatin|DRUG: paclitaxel\nPrimary Outcome Measures: Disease-free survival, From randomization to date of first failure (local, regional, or distant metastases failure or death due to any cause) or last follow-up. Analysis occurs after 43 disease-free survival failure events on Cisplatin/RT Arm.\nSecondary Outcome Measures: Overall survival, From randomization to date of death or last follow-up. Analysis occurs after all patients have been potentially followed for 4 years.|Chemotherapy-induced neuropathy as measured by FACT-GOG/NTX4, From completion of concurrent chemoradiation to 12 months.|Quality of life as mea

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
    os.environ["TOKENIZER_REPOSITORY"],
    model_max_length=int(os.environ["MAX_TOKENS"])
)

In [5]:
concatenated_dataset = concatenate_datasets(
    [dataset["train"], dataset["validation"], dataset["test"]]
)
tokenized_inputs = concatenated_dataset.map(
    lambda x: tokenizer(x["document"], truncation=True), batched=True, remove_columns=["document", "summary"])

tokenized_targets = concatenated_dataset.map(
    lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["document", "summary"])

max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])

print(f"Max source length: {max_source_length}")
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

Max source length: 1363
Max target length: 218


In [6]:
def preprocess_function(sample, max_source_length: int, max_target_length: int, padding: str="max_length"):
    inputs = [f"summarize this document: {item}"  for item in sample["document"]]

    model_inputs = tokenizer(
        inputs, 
        max_length=max_source_length, 
        padding=padding, 
        truncation=True,
    )
    labels = tokenizer(
        text_target=sample["summary"], 
        max_length=max_target_length,
        padding=padding, 
        truncation=True,
    )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else tokenizer.pad_token_id) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

preprocess_lambda = lambda dataset : preprocess_function(dataset, max_source_length, max_target_length)
tokenized_dataset = dataset.map(preprocess_lambda, batched=True, remove_columns=["document", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "train"))
tokenized_dataset["validation"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "validation"))
tokenized_dataset["test"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "test"))

Map:   0%|          | 0/1507 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/1507 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/188 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/189 [00:00<?, ? examples/s]

In [7]:
# define model
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=os.environ["LLM_REPOSITORY"],
    load_in_8bit=True,
    device_map="auto"
)

In [8]:
# Define LoRA Config 
lora_config = LoraConfig(
 r=16, 
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 688,128 || all params: 77,649,280 || trainable%: 0.8862001038515747


In [9]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = tokenizer.pad_token_id

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [10]:
OUTPUT_DIR = F"lora-{os.environ['LLM_REPOSITORY'].split('/')[-1]}"
NUM_EPOCHS = 5
LEARNING_RATE = 1e-3

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
	auto_find_batch_size=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False  # to be set to True for inference

# finetune model
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,3.2136


TrainOutput(global_step=945, training_loss=2.578261731667493, metrics={'train_runtime': 619.1577, 'train_samples_per_second': 12.17, 'train_steps_per_second': 1.526, 'total_flos': 3785013953740800.0, 'train_loss': 2.578261731667493, 'epoch': 5.0})

In [11]:
# Save our LoRA model & tokenizer results
PEFT_MODEL_ID="finetuned_results"
trainer.model.save_pretrained(PEFT_MODEL_ID)
tokenizer.save_pretrained(PEFT_MODEL_ID)

('finetuned_results/tokenizer_config.json',
 'finetuned_results/special_tokens_map.json',
 'finetuned_results/spiece.model',
 'finetuned_results/added_tokens.json',
 'finetuned_results/tokenizer.json')

In [15]:
# Load peft config for pre-trained checkpoint etc. 
config = PeftConfig.from_pretrained(PEFT_MODEL_ID)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path,  load_in_8bit=True,  device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, PEFT_MODEL_ID, device_map="auto")

In [20]:
# switch model to eval mode
model.eval()

# define model pipeline
hgf_pipeline = pipeline(
    task="text2text-generation", 
    model=model, 
    tokenizer=tokenizer,
    temperature=0.1, 
    max_length=int(os.environ["MAX_TOKENS"]),
    top_p=0.15,
    top_k=0,
    repetition_penalty=1.1,
)

llm = HuggingFacePipeline(pipeline=hgf_pipeline)

The model 'PeftModelForSeq2SeqLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


In [25]:
summary_chain = load_summarize_chain(llm, chain_type="map_reduce")

# summarise first 5 documens in the testing data
n_ducs = 5
for i, document in enumerate(dataset["test"]["document"][:n_ducs]):
    document = Document(page_content=document)
    print(f"Document: {document}\n")
    print(f"SUMARY: {summary_chain.run([document])}\n\n")

Document: page_content='NCT Number: NCT02289898\nStudy Title: Study of Gemcitabine, Abraxane® Plus Placebo Versus Gemcitabine, Abraxane® Plus 1 or 2 Truncated Courses of Demcizumab in Subjects With 1st-Line Metastatic Pancreatic Ductal Adenocarcinoma\nStudy URL: https://beta.clinicaltrials.gov/study/NCT02289898\nAcronym: YOSEMITE\nStudy Status: COMPLETED\nBrief Summary: This is a randomized, double blind, 3 arm (1:1:1) study in subjects with 1st-line metastatic pancreatic ductal adenocarcinoma.' metadata={}

SUMARY: This study aims to study the effectiveness of Demcizumab in treating 1st-line metastatic pancreatic ductal adenocarcinoma. The study is expected to be completed by June 2022.


Document: page_content='The purpose is to test the efficacy and safety of demcizumab, when given in combination with gemcitabine and Abraxane® compared to placebo. The administration of gemcitabine and Abraxane® is a standard treatment for patients with metastatic pancreatic ductal adenocarcinoma.\nS