In [None]:
# Import all the dependencies
import argparse
import json
import os
import subprocess
import time
from random import randint

import ctranslate2
import nvidia
import torch
import transformers
from datasets import load_dataset
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Directory of the imported `nvidia` package plus its CUDA runtime library folder.
cuda_install_dir = os.path.dirname(nvidia.__file__) + '/cuda_runtime/lib/'

# Prepend the CUDA runtime directory instead of overwriting LD_LIBRARY_PATH, so
# library locations that were already configured are not lost. The membership
# check keeps the value stable when this cell is re-run.
# NOTE(review): the dynamic loader usually reads LD_LIBRARY_PATH at process
# start, so this presumably targets child processes - confirm.
_ld_entries = [entry for entry in os.environ.get('LD_LIBRARY_PATH', '').split(os.pathsep) if entry]
if cuda_install_dir not in _ld_entries:
    _ld_entries.insert(0, cuda_install_dir)
os.environ['LD_LIBRARY_PATH'] = os.pathsep.join(_ld_entries)

In [None]:
# Merge a fine-tuned LoRA adapter into its Hugging Face base model and convert the result to a CTranslate2 model for fast inference

# Please change the paths here according to your project setup
def convert_qlora2ct2(adapter_path: str = "/mnt/artifacts/falcon_7b_8bit_lora_outputs/checkpoint-3683",
                      offload_path:str ="/mnt/artifacts/ct2offload/",
                      full_model_path:str="/mnt/artifacts/falcon_7b_model_adapter",
                      ct2_path:str="/mnt/artifacts/ct2_int8",
                      quantization:str="int8"):
    """Merge a PEFT adapter into its base model and convert the result to CTranslate2.

    Steps:
      1. Read the adapter config to find the base model it was trained on.
      2. Load the base model in float16 and attach the adapter.
      3. Merge the adapter weights into the base model and save the merged
         model together with the tokenizer to ``full_model_path``.
      4. Run ``ct2-transformers-converter`` on the saved model, writing the
         converted model to ``ct2_path``.

    Args:
        adapter_path: Directory containing the trained PEFT adapter checkpoint.
        offload_path: Folder passed as ``offload_folder`` when loading the base model.
        full_model_path: Directory where the merged model and tokenizer are saved.
        ct2_path: Output directory for the converted CTranslate2 model.
        quantization: Value passed to the converter's ``--quantization`` option.
            Any falsy value (``False``, ``None``, ``""``) skips quantization.

    Raises:
        subprocess.CalledProcessError: If the converter exits with a non-zero status.
    """
    # The adapter config records which base checkpoint the adapter belongs to.
    peft_config = PeftConfig.from_pretrained(adapter_path)
    base_model_name_or_path = peft_config.base_model_name_or_path

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name_or_path,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        offload_folder=offload_path,
        resume_download=True,
        cache_dir='/mnt',
        local_files_only=False,
    )
    base_model.config.use_cache = True

    tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)

    peft_model = PeftModel.from_pretrained(base_model, adapter_path, device_map='auto')
    print("Peft model loaded")

    # Merge the adapter weights into the base model
    merged_model = peft_model.merge_and_unload()

    # Save the merged model and the tokenizer side by side so the converter
    # finds both in the same directory
    merged_model.save_pretrained(full_model_path)
    tokenizer.save_pretrained(full_model_path)

    # Build the converter command as an argument list (no shell), so paths with
    # spaces or shell metacharacters are passed through verbatim.
    # --trust_remote_code is passed in both modes, matching how the model was
    # loaded above.
    command = [
        "sudo", "ct2-transformers-converter",
        "--model", full_model_path,
        "--output_dir", ct2_path,
        "--trust_remote_code",
        "--force",
    ]
    if quantization:
        command += ["--quantization", quantization]

    # check=True raises on a non-zero exit status, so the success message below
    # is only printed when the conversion actually succeeded.
    subprocess.run(command, check=True)
    print(" Model Converted successfully")

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Call the function to convert the fine tuned Falcon-7b model to a ctranslate model.
# No arguments are passed, so the adapter, offload, merged-model and output paths
# and the quantization mode all come from the defaults in convert_qlora2ct2's signature.
convert_qlora2ct2()

In [None]:
# Load the converted model and the tokenizer
# Please change the path below according to your project
# Use the device detected earlier in the notebook ('cuda' or 'cpu') instead of
# hard-coding the GPU, so this cell also runs on machines without CUDA.
generator = ctranslate2.Generator("/mnt/artifacts/ct2_int8", device=device.type)
tokenizer = transformers.AutoTokenizer.from_pretrained("tiiuae/falcon-7b")

In [None]:
# Load dataset from the hub
test_dataset = load_dataset("samsum", split="test")

# Select a random test sample. randint() includes both endpoints, so the upper
# bound must be len - 1; using len would occasionally index past the end.
sample = test_dataset[randint(0, len(test_dataset) - 1)]

# Prompt template; the {dialogue} placeholder is filled in by str.format below
prompt_template = "Summarize the chat dialogue:\n{dialogue}\n---\nSummary:\n"

test_sample = prompt_template.format(dialogue=sample["dialogue"])

print(test_sample)

In [None]:
# Summarize the test sample; the timed region covers tokenization, generation and decoding
start_time = time.time()

# The generator is fed token strings, so encode to ids first and map them back to tokens
token_ids = tokenizer.encode(test_sample)
tokens = tokenizer.convert_ids_to_tokens(token_ids)

results = generator.generate_batch(
    [tokens],
    sampling_topk=10,
    max_length=200,
    include_prompt_in_result=False,
)

# Decode the first returned sequence of the single batch entry
output = tokenizer.decode(results[0].sequences_ids[0])

end_time = time.time()
print(output)

In [None]:
# Report the wall-clock duration measured around the generation cell, rounded to milliseconds
elapsed = round(end_time - start_time, 3)
print(f'\n Generating the summary took {elapsed} s')