### Instructions

This notebook can be run on [Google Colab Notebooks](https://colab.research.google.com/) using a GPU. 

* Change Runtime>Change Runtime Type to T4 GPU
* Install `peft`, `evaluate`, `transformers`, `datasets` and `rouge_score` using `! pip install` (uncomment the install lines in the first code cell)
* Upload the two files from the `dataset` directory into a folder called `dataset` in the notebook's working directory

In [1]:
## Install packages (uncomment the lines below on a fresh runtime, e.g. Google Colab)

# ! pip install peft
# ! pip install evaluate
# ! pip install transformers
# ! pip install datasets
# ! pip install rouge_score

# Import libs

In [1]:
import json
from typing import Dict

import numpy as np
import pandas as pd
import typer
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_int8_training,
)
import evaluate
from datasets import load_from_disk
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import Dataset, concatenate_datasets


  from .autonotebook import tqdm as notebook_tqdm


: 

: 

# Pre-process dataset

In [None]:
def get_dataset(data_path: str) -> Dict[str, Dataset]:
    """Load the question/answer JSON file and split it into train and test sets.

    The file must contain a JSON array of objects, each with a ``"question"``
    and a ``"gpt4_replies_target"`` key. These are renamed to ``input_text``
    and ``output_text`` respectively.

    Args:
        data_path: Path to the JSON file.

    Returns:
        ``{"train": Dataset, "test": Dataset}`` where 95 % of the rows go to
        ``train``. The split is reproducible (``random_state=42``).

    Raises:
        KeyError: If a record lacks one of the two expected keys.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    format_data = pd.DataFrame(
        [
            {
                "input_text": sample["question"],
                "output_text": sample["gpt4_replies_target"],
            }
            for sample in data
        ]
    )
    train, test = train_test_split(format_data, random_state=42, train_size=0.95)

    # The split frames carry a shuffled, non-contiguous index. Without
    # preserve_index=False it would be materialised as an extra
    # `__index_level_0__` column in every dataset derived from these.
    dataset_train = Dataset.from_pandas(train, preserve_index=False)
    dataset_test = Dataset.from_pandas(test, preserve_index=False)
    return {"train": dataset_train, "test": dataset_test}
    
# Build the train/test splits from the uploaded JSON file.
print("Get data")

# Relative path: the file must already sit in a `dataset` folder next to the
# notebook's working directory.
data_path = "dataset/dagster-support-dataset.json"
dataset = get_dataset(data_path=data_path)

# Quick sanity check on the 95/5 split performed by `get_dataset`.
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

In [None]:
model_id_tokenizer = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id_tokenizer)

# Length statistics are computed over train and test together so that the
# chosen maximum lengths fit both splits.
full_dataset = concatenate_datasets([dataset["train"], dataset["test"]])
raw_text_columns = ["input_text", "output_text"]


def _tokenize_text_column(column_name):
    """Tokenize one text column of the combined dataset (truncation on, no padding)."""
    return full_dataset.map(
        lambda batch: tokenizer(batch[column_name], truncation=True),
        batched=True,
        remove_columns=raw_text_columns,
    )


# Source side: 95th percentile of the tokenized question lengths.
tokenized_inputs = _tokenize_text_column("input_text")
input_lenghts = [len(token_ids) for token_ids in tokenized_inputs["input_ids"]]
max_source_lengths = int(np.percentile(input_lenghts, 95))
print(f"Max source lengths: {max_source_lengths}")

# Target side: 95th percentile of the tokenized answer lengths.
tokenized_targets = _tokenize_text_column("output_text")
target_lenghts = [len(token_ids) for token_ids in tokenized_targets["input_ids"]]
max_target_lengths = int(np.percentile(target_lenghts, 95))
print(f"Max target lengths: {max_target_lengths}")

def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of question/answer pairs into model inputs and labels.

    Relies on the module-level ``tokenizer``, ``max_source_lengths`` and
    ``max_target_lengths``.

    Args:
        sample: Batch mapping with ``"input_text"`` and ``"output_text"``
            sequences of strings.
        padding: Padding strategy forwarded to the tokenizer. Only when it is
            ``"max_length"`` are pad tokens in the labels replaced by ``-100``.

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry
        holding the target token ids.
    """
    # Questions are passed to the model unchanged; no task prefix is prepended.
    source_texts = list(sample["input_text"])
    model_inputs = tokenizer(
        source_texts,
        max_length=max_source_lengths,
        padding=padding,
        truncation=True,
    )

    # `text_target` tells the tokenizer these strings are decoder targets.
    target_encoding = tokenizer(
        text_target=sample["output_text"],
        max_length=max_target_lengths,
        padding=padding,
        truncation=True,
    )
    label_ids = target_encoding["input_ids"]

    # With fixed-length padding the pad positions are part of the labels;
    # mark them with -100 so the loss ignores them.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize both splits the same way, dropping the raw text columns afterwards.
tokenized_dataset = {
    split_name: dataset[split_name].map(
        preprocess_function,
        batched=True,
        remove_columns=["input_text", "output_text"],
    )
    for split_name in ("train", "test")
}
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# Persist the tokenized splits so later cells can reload them without
# re-tokenizing. Note that the test split is stored under "data/eval".
for split_name, target_dir in (("train", "data/train"), ("test", "data/eval")):
    tokenized_dataset[split_name].save_to_disk(target_dir)

# Train LoRA model with Trainer. 


In [None]:
# Base checkpoint the LoRA adapter is trained on top of.
model_id_model = "google/flan-t5-small"
# Forwarded to `from_pretrained`; when True the model is additionally passed
# through `prepare_model_for_int8_training` before LoRA is applied.
load_in_8bit = False
# Per-device batch sizes handed to Seq2SeqTrainingArguments.
per_device_eval_batch_size = 8
per_device_train_batch_size = 8
# Passed as `gradient_accumulation_steps` to Seq2SeqTrainingArguments.
gradient_accumulation_steps = 1
# Serves as the Trainer output directory and as the directory the adapter and
# tokenizer are saved to and later reloaded from.
peft_model_id = "notebook-result"
# Number of passes over the training split.
num_train_epochs = 2

In [None]:

# Load the base seq2seq checkpoint; `device_map="auto"` leaves device
# placement to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id_model, device_map="auto", load_in_8bit=load_in_8bit)

# Define LoRA Config
# Adapters are attached to the modules named "q" and "v" only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.2,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
# NOTE(review): newer peft releases appear to have replaced
# `prepare_model_for_int8_training` with `prepare_model_for_kbit_training`
# - confirm against the installed peft version.
if load_in_8bit:
    model = prepare_model_for_int8_training(model)
# Wrap the base model so that only the LoRA parameters are trainable, then
# print how many parameters that is.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)
# Define training args
# Evaluation, logging and checkpointing all happen once per epoch.
# NOTE(review): `evaluation_strategy` may be spelled `eval_strategy` in newer
# transformers releases - confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=per_device_eval_batch_size,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    output_dir=peft_model_id,
    auto_find_batch_size=True,
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=num_train_epochs,
    logging_dir=f"{peft_model_id}/logs",
    report_to="none",
)

print(f"training_args = {training_args}")
# Create Trainer instance
# The held-out test split doubles as the evaluation set during training.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)
model.config.use_cache = (
    False  # silence the warnings. Please re-enable for inference!
)

## Run training 

In [None]:
# Fine-tune with the arguments configured above, then run one more pass over
# the evaluation split. As the last expression of the cell, the result of
# `evaluate()` is what the notebook displays.
trainer.train()
trainer.evaluate()

## Save the trained model

In [None]:
# Save our LoRA model & tokenizer results
# All three calls write into the same `peft_model_id` directory.
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# NOTE(review): this additionally saves the wrapped base model next to the
# adapter files. The evaluation cell reloads the base weights from
# `config.base_model_name_or_path`, so confirm this extra copy is needed.
trainer.model.base_model.save_pretrained(peft_model_id)

# Evaluate

In [None]:
def evaluate_peft_model(sample, model, tokenizer, max_target_length=512):
    """Generate a reply for one tokenized sample and decode it next to its reference.

    Args:
        sample: Mapping with ``"input_ids"`` and ``"labels"``. ``input_ids``
            must support ``.unsqueeze(0)`` and ``.to(device)`` (e.g. a torch
            tensor); ``labels`` must be comparable element-wise with ``-100``.
        model: Object exposing ``generate(...)``. When it has a ``device``
            attribute the inputs are moved to that device; otherwise they are
            moved to CUDA, as this function did originally.
        tokenizer: Object exposing ``decode(...)`` and ``pad_token_id``.
        max_target_length: Passed to ``generate`` as ``max_new_tokens``.

    Returns:
        Tuple ``(prediction, reference)`` of decoded strings.
    """
    # Add the batch dimension and place the inputs on the model's device
    # instead of assuming a CUDA device is present.
    input_ids = sample["input_ids"].unsqueeze(0)
    device = getattr(model, "device", None)
    input_ids = input_ids.to(device) if device is not None else input_ids.cuda()

    # Nucleus sampling: repeated calls may yield different predictions.
    outputs = model.generate(
        input_ids=input_ids,
        do_sample=True,
        top_p=0.9,
        max_new_tokens=max_target_length,
    )
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

    # -100 marks ignored label positions and cannot be decoded; swap it for
    # the pad token, which `skip_special_tokens=True` then drops.
    label_ids = np.where(sample["labels"] != -100, sample["labels"], tokenizer.pad_token_id)
    reference = tokenizer.decode(label_ids, skip_special_tokens=True)

    return prediction, reference



In [None]:
# Read the adapter configuration saved under `peft_model_id`; its
# `base_model_name_or_path` tells us which base checkpoint to load.
config = PeftConfig.from_pretrained(peft_model_id)
# load base LLM model and tokenizer
# (8-bit variant kept for reference:)
# model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path,  load_in_8bit=True,  device_map={"":0})
# `device_map={"":0}` places the whole model on device 0. Note that this
# rebinds the `model` and `tokenizer` names used by the training cells.
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
# Switch to evaluation mode before generating.
model.eval()

print("Peft model loaded")

metric = evaluate.load("rouge")

# load test dataset from disk (written by the preprocessing cell)
test_dataset = load_from_disk("data/eval/").with_format("torch")

# run predictions
# this can take ~45 minutes
# NOTE(review): generation uses sampling, so scores can vary between runs
# unless a seed is fixed beforehand.
predictions, references = [], []
for sample in tqdm(test_dataset):
    p, l = evaluate_peft_model(sample=sample, model=model, tokenizer=tokenizer, max_target_length=512)
    predictions.append(p)
    references.append(l)
    print("#" * 10)
    print(f"p = {p}, l = {l}")

# compute metric
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# print results
# `:.2f` gives two decimals. The previous `:2f` only set a minimum width of 2
# and therefore printed six decimals.
print(f"rouge1: {rogue['rouge1'] * 100:.2f}%")
print(f"rouge2: {rogue['rouge2'] * 100:.2f}%")
print(f"rougeL: {rogue['rougeL'] * 100:.2f}%")
print(f"rougeLsum: {rogue['rougeLsum'] * 100:.2f}%")