## Post-process a finetuned LLM

Test and upload a finetuned language model

In [None]:
!pip install -q -U huggingface_hub peft transformers torch accelerate

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub: later cells load the base model with
# authentication enabled and push the tokenizer and adapter to the Hub.
notebook_login()

In [None]:
# Configure git to use its "store" credential helper globally
# (presumably so Hub pushes do not re-prompt for credentials; confirm).
!git config --global credential.helper store

## Setup

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Print the GPU status report for this runtime.
!nvidia-smi

In [None]:
# Build a per-GPU memory budget: each visible device's free memory minus 2 GB of
# headroom, as a {device_index: "<N>GB"} mapping.
n_gpus = torch.cuda.device_count()

# Query every device individually: GPUs can have different amounts of free
# memory, so the current device's figure must not be reused for all of them.
free_per_gpu_GB = [
    int(torch.cuda.mem_get_info(i)[0] / 1024**3) for i in range(n_gpus)
]

# Free memory on the first GPU, kept under its original name (0 when no GPU is visible).
free_in_GB = free_per_gpu_GB[0] if free_per_gpu_GB else 0

# Clamp at zero so a nearly full GPU never yields a negative budget such as "-1GB".
max_memory = {i: f"{max(free - 2, 0)}GB" for i, free in enumerate(free_per_gpu_GB)}
print("max memory: ", max_memory)

## Loss curve

During training, the model converged nicely as follows:

![image](https://raw.githubusercontent.com/daniel-furman/sft-demos/main/assets/jul_24_23_1_14_00_log_loss_curves_llama-2-70b-dolphin.png)


## Basic testing

With a supervised finetuned (SFT) model in hand, we can test it on some basic prompts and then upload it to the Hugging Face hub either as a public or private model repo, depending on the use case.

In [None]:
# Adapter to test. Point this at a local checkpoint directory instead
# (e.g. "results/checkpoint-12500") to test weights that are not on the Hub yet.
peft_model_id = "dfurman/llama-2-70b-dolphin-peft"

# The adapter config records which base model the adapter weights belong to.
config = PeftConfig.from_pretrained(peft_model_id)

# Load the base model weights in 4-bit NF4, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    token=True,  # replaces the deprecated `use_auth_token=True`
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The tokenizer comes from the same base repo, so authenticate the same way.
tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    token=True,
)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Load the Lora model on top of the quantized base model
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
# Echo the loaded model object as the cell output for inspection.
model

In [None]:
# Re-check the per-GPU memory budget now that the model has been loaded: each
# visible device's free memory minus 2 GB of headroom, as {device_index: "<N>GB"}.
n_gpus = torch.cuda.device_count()

# Query every device individually: the model may occupy the GPUs unevenly, so
# the current device's figure must not be reused for all of them.
free_per_gpu_GB = [
    int(torch.cuda.mem_get_info(i)[0] / 1024**3) for i in range(n_gpus)
]

# Free memory on the first GPU, kept under its original name (0 when no GPU is visible).
free_in_GB = free_per_gpu_GB[0] if free_per_gpu_GB else 0

# Clamp at zero so a nearly full GPU never yields a negative budget such as "-1GB".
max_memory = {i: f"{max(free - 2, 0)}GB" for i, free in enumerate(free_per_gpu_GB)}
print("max memory: ", max_memory)

In [None]:
# text generation function


def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 1.0,
) -> str:
    """
    Generate a completion for a single prompt and return only the new text.

    Uses Hugging Face GenerationConfig defaults for everything not set here
        https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig

    Note: this function forwards ``temperature`` but does not set ``do_sample``
    itself. Per the GenerationConfig documentation, temperature only affects
    decoding in sampling modes, so whether it takes effect depends on the
    model's own generation config.

    Args:
        model (transformers.AutoModelForCausalLM): Causal language model used for text generation
        tokenizer (transformers.AutoTokenizer): Tokenizer for model
        prompt (str): Prompt for text generation
        max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.
        temperature (float, optional): The value used to modulate the next token probabilities.
            Defaults to 1.0

    Returns:
        str: The decoded generation with the prompt removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # tokenize the prompt as a batch of one and move the tensors to the device
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    generate_kwargs = dict(
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    if device.type == "cuda":
        # when running Torch modules in lower precision, it is best practice to
        # use the torch.autocast context manager.
        with torch.autocast("cuda", dtype=torch.bfloat16):
            response = model.generate(**inputs, **generate_kwargs)
    else:
        # No GPU: skip the CUDA autocast context instead of entering it needlessly.
        response = model.generate(**inputs, **generate_kwargs)

    sequence = response["sequences"][0]
    decoded_output = tokenizer.decode(
        sequence,
        skip_special_tokens=True,
    )  # grab output in natural language

    # Usual case: the decoded text starts with the prompt, so drop that prefix.
    if decoded_output.startswith(prompt):
        return decoded_output[len(prompt) :]

    # The decoded text does not reproduce the prompt verbatim, so slicing by
    # character count would cut at the wrong place. Drop the prompt by token
    # position instead and decode only the newly generated tokens.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)

In [None]:
# Smoke test 1: open-ended list generation.
prompt = "You are a helpful assistant. Write me a numbered list of things to do in New York City.\n"

response = llama_generate(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=250,
    temperature=0.92,
)
print(response)

In [None]:
# Smoke test 2: short logical-reasoning question with a constrained answer format.
prompt = "You are a helpful assistant. Daniel is in need of a haircut. His barber works Mondays, Wednesdays, and Fridays. So, Daniel went in for a haircut on Sunday. Does this make logical sense? Respond only with a yes or no answer in as few words as possible.\n"

response = llama_generate(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=100,
    temperature=0.92,
)
print(response)

In [None]:
# Smoke test 3: short-form writing task.
prompt = "You are a helpful assistant. Write a short email inviting my friends to a dinner party on Friday. Respond succinctly.\n"

response = llama_generate(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=200,
    temperature=0.92,
)
print(response)

In [None]:
# Smoke test 4: longer structured answer (recipe), with the largest token budget.
prompt = "You are a helpful assistant. Tell me a recipe for vegan banana bread.\n"

response = llama_generate(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=500,
    temperature=0.92,
)
print(response)

## Inference runtime test

In [None]:
import tqdm
import time

# Time 25 generations of a fixed prompt with a fixed token budget.
prompt = "You are a helpful assistant. Write me a long list of things to do in San Francisco:\n"

runtimes = []
for _ in tqdm.tqdm(range(25)):
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    response = llama_generate(
        model,
        tokenizer,
        prompt,
        max_new_tokens=50,
        temperature=0.92,
    )
    end = time.perf_counter()
    runtimes.append(end - start)
    # Check that the run produced the expected amount of text, so the timings
    # are comparable across runs.
    # NOTE(review): 52 is presumably the 50 new tokens plus tokens added when
    # the text is re-encoded; confirm against the tokenizer in use.
    assert (
        len(tokenizer.encode(response)) == 52
    ), "generation did not re-encode to the expected 52 tokens"

In [None]:
# Mean wall-clock time per generation call over all timed runs.
avg_runtime = torch.tensor(runtimes).mean().item()
print(f"Runtime avg in seconds: {avg_runtime}")  # time in seconds

## Upload model to Hugging Face
1. Before running the cells below, create a model repo on your Hugging Face account. It can be either private or public; the code below works with both.

In [None]:
# push to hub
# Target repo on the Hub; it must already exist under your account.
model_id_load = "dfurman/llama-2-70b-dolphin-peft"

# `token=True` replaces the deprecated `use_auth_token=True` and uses the
# credentials saved at login.

# tokenizer
tokenizer.push_to_hub(model_id_load, token=True)
# safetensors
model.push_to_hub(model_id_load, token=True, safe_serialization=True)
# torch tensors: request the non-safetensors format explicitly, because the
# library default may already be safetensors and would repeat the push above.
model.push_to_hub(model_id_load, token=True, safe_serialization=False)