In [None]:
# Import dependencies
import os
import mlflow
import nvidia
import time

# Directory of the CUDA runtime libraries that ship inside the `nvidia` pip package
# (the package directory plus 'cuda_runtime/lib/').
cuda_install_dir = os.path.dirname(nvidia.__file__) + '/cuda_runtime/lib/'

# Prepend that directory to LD_LIBRARY_PATH instead of overwriting the variable, so
# library paths already configured in the environment are preserved. The membership
# check keeps the cell idempotent when it is re-run.
# NOTE(review): presumably consumed by bitsandbytes when it looks for libcudart -- confirm.
ld_library_paths = [p for p in os.environ.get('LD_LIBRARY_PATH', '').split(os.pathsep) if p]
if cuda_install_dir not in ld_library_paths:
    ld_library_paths.insert(0, cuda_install_dir)
os.environ['LD_LIBRARY_PATH'] = os.pathsep.join(ld_library_paths)

In [None]:
# Load the LLM

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig

model_id = "tiiuae/falcon-7b"

# Ask for the weights to be loaded in 8-bit.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# trust_remote_code=True allows the modelling code shipped in the model repo to run.
# Per the original author, Falcon needs this because its architecture was not part of
# transformers yet and the code is provided by the model authors in the repo.
#
# cache_dir is where the download is cached -- change it to a location you have access to.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
    cache_dir='/mnt/code/',
)

In [None]:
# Use the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because the Falcon tokenizer defines no pad token of its own -- confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Prepare the quantized model for PEFT (parameter-efficient fine-tuning)

from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing before handing the model to PEFT's k-bit preparation helper.
# NOTE(review): presumably trades extra compute for lower activation memory -- confirm in the transformers docs.
model.gradient_checkpointing_enable()
# `model` is rebound to whatever the helper returns; later cells use the prepared model.
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once, totals the element counts, and prints
    the trainable count, the overall count, and the trainable share as a percentage.
    """
    # (element count, requires_grad) for every parameter, collected in a single pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
# Set the parameters for the LoRA adapter

from peft import LoraConfig, get_peft_model

# Names of the modules that LoRA is applied to.
lora_target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the model with the LoRA adapter and report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
from datasets import load_dataset

# Fetch the "samsum" dataset from the hub
dataset = load_dataset("samsum")

# Report the size of the two splits used in this notebook
for split_label, split_key in (("Train", "train"), ("Test", "test")):
    print(f"{split_label} dataset size: {len(dataset[split_key])}")

In [None]:
from random import randint

# Instruction prompt used for fine-tuning: the dialogue, a separator, then the
# reference summary followed by the EOS token.
# This is a plain string rather than an f-string because the placeholders are
# filled in later by str.format().
prompt_template = "Summarize the chat dialogue:\n{dialogue}\n---\nSummary:\n{summary}{eos_token}"


def template_dataset(sample):
    """Add a ``text`` entry holding the fully formatted training prompt.

    Args:
        sample: A mapping with ``dialogue`` and ``summary`` entries.

    Returns:
        The same ``sample`` object, with ``sample["text"]`` set to the prompt
        built from ``prompt_template`` and the tokenizer's EOS token.
    """
    sample["text"] = prompt_template.format(
        dialogue=sample["dialogue"],
        summary=sample["summary"],
        eos_token=tokenizer.eos_token,
    )
    return sample

# Apply the prompt template to every training sample and drop the original columns.
train_dataset = dataset["train"].map(template_dataset, remove_columns=list(dataset["train"].features))
# Print one random formatted sample as a sanity check.
# randint() includes both endpoints, so the upper bound must be len - 1 to stay in range.
print(train_dataset[randint(0, len(train_dataset) - 1)]["text"])

In [None]:
# Format the test split with the same prompt template, dropping its original columns.
test_columns = list(dataset["test"].features)
test_dataset = dataset["test"].map(template_dataset, remove_columns=test_columns)

In [None]:
def tokenize_text(batch):
    """Run the tokenizer over the ``text`` column of a batch of samples."""
    return tokenizer(batch["text"])


# Tokenize both splits in batches; the columns present before tokenization are removed.
lm_train_dataset = train_dataset.map(
    tokenize_text,
    batched=True,
    batch_size=24,
    remove_columns=list(train_dataset.features),
)

lm_test_dataset = test_dataset.map(
    tokenize_text,
    batched=True,
    remove_columns=list(test_dataset.features),
)

# Report how many samples each tokenized split contains
for split_name, tokenized_split in (("train", lm_train_dataset), ("test", lm_test_dataset)):
    print(f"Total number of {split_name} samples: {len(tokenized_split)}")


In [None]:
import transformers

# Training runs for a single epoch with per-device batch sizes of 4.
# Change these values to improve accuracy and throughput.
training_args = transformers.TrainingArguments(
    output_dir="/mnt/code/falcon_7b_8bit_lora_outputs",  # adapter checkpoints are stored here
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-4,
    bf16=True,
    save_strategy="epoch",
    logging_dir='/mnt/hf_logs',
    logging_steps=5,
    report_to=["mlflow"],
)

# mlm=False: collate for language modeling without masked-LM masking.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_train_dataset,
    eval_dataset=lm_test_dataset,
    data_collator=causal_lm_collator,
)

# Turn the cache off while training to silence the warnings; re-enable it for inference.
model.config.use_cache = False

In [None]:
# Select, by name, the MLflow experiment to use for the run started below.
experiment_name = 'falcon-7b-8bit-lora-ft'
exp = mlflow.set_experiment(experiment_name)
print("Fine-tuning model:")
# Train inside an explicit MLflow run; the trainer was configured with report_to=["mlflow"].
with mlflow.start_run() as run:
    # Start training
    trainer.train()

In [None]:
# Evaluate on the eval dataset that was passed to the trainer (lm_test_dataset).
# As the last expression of the cell, the returned metrics are displayed.
trainer.evaluate()

In [None]:
# Load the raw (untemplated) test split for inference. It gets its own name so the
# templated `test_dataset` built earlier is not overwritten, which would break a
# re-run of the tokenization cell.
raw_test_dataset = load_dataset("samsum", split="test")

# Select a random test sample. randint() includes both endpoints, so the upper
# bound must be len - 1 to avoid an IndexError.
sample = raw_test_dataset[randint(0, len(raw_test_dataset) - 1)]

# Inference prompt: same layout as the training prompt, but it stops after
# "Summary:" so the model writes the summary itself. It is kept under a separate
# name so that `prompt_template`, which template_dataset() reads, is not clobbered.
inference_prompt_template = "Summarize the chat dialogue:\n{dialogue}\n---\nSummary:\n"

test_sample = inference_prompt_template.format(dialogue=sample["dialogue"])

print(test_sample)

In [None]:
# Tokenize the prompt into PyTorch tensors and move the token ids to the 'cuda' device.
encoded_prompt = tokenizer(test_sample, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to('cuda')

In [None]:
from transformers import GenerationConfig

# Minimum number of new tokens the model must be allowed to generate for the summary.
tokens_for_summary = 30
# Total length needed for this prompt: prompt tokens plus room for the summary.
output_tokens = input_ids.shape[1] + tokens_for_summary

model.config.use_cache = True  # re-enable the cache for inference (it was disabled for training)

generation_config = GenerationConfig(
    do_sample=True,
    top_k=10,
    pad_token_id=tokenizer.pad_token_id,
    # max_length counts the prompt tokens as well. A fixed limit of 500 leaves no
    # room to generate when the prompt itself is about 500 tokens or longer, so the
    # limit grows with the prompt to always allow at least `tokens_for_summary` new tokens.
    max_length=max(500, output_tokens),
)

# Time only the generate() call; decoding happens after end_time is taken.
start_time = time.time()

with torch.no_grad():
    outputs = model.generate(inputs=input_ids, generation_config=generation_config)

end_time = time.time()
# Decode the first (only) sequence of the batch and show it.
gen_text = tokenizer.batch_decode(outputs)[0]
print(gen_text)


In [None]:
# Report the elapsed time between start_time and end_time, rounded to 3 decimals.
elapsed_seconds = round(end_time - start_time, 3)
print(f'\nTook {elapsed_seconds} s')