## Post-process a finetuned LLM

Test and upload dfurman/mpt-7b-dolphin, a finetuned language model for short-form instruction following.

<a target="_blank" href="https://colab.research.google.com/github/daniel-furman/sft-demos/blob/main/src/sft/one_gpu/mpt-7b/dolphin/postprocessing-instruction-mpt-7b-dolphin.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Setup

In [None]:
!pip install transformers accelerate einops
!pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python

In [None]:
!git clone https://github.com/daniel-furman/sft-demos.git

In [None]:
import transformers
import torch
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

In [None]:
!nvidia-smi

In [None]:
# budget per GPU: free memory on the current device (whole GB) minus 2 GB of headroom
free_in_GB = torch.cuda.mem_get_info()[0] // 1024**3
n_gpus = torch.cuda.device_count()
# the same budget string is assigned to every visible GPU index
max_memory = {gpu_index: f"{free_in_GB-2}GB" for gpu_index in range(n_gpus)}
print("max memory: ", max_memory)

## Loss curve

In [None]:
# assumes you have already run `sft-instruction-llm.py`
# see documentation and comments therein for details
# recommendation: run `sft-instruction-llm.py` directly from the terminal
# by default, the mpt-7b instruction model demo is used


# load the train and val loss from the finetuning events log
# (single pass over the file: both tags live in the same events log)
path_to_events_file = "/content/sft-demos/runs/jul_5_23_3_15_00_sft-mpt-7b-dolphin/events.out.tfevents.1688487599.209-20-156-79.36094.0"

train_steps, train_loss = [], []
val_steps, val_loss = [], []

for event in tf.compat.v1.train.summary_iterator(path_to_events_file):
    for value in event.summary.value:
        if value.tag == "train/loss":
            train_loss.append(value.simple_value)
            train_steps.append(event.step)
        elif value.tag == "eval/loss":
            val_loss.append(value.simple_value)
            val_steps.append(event.step)

In [None]:
# plot train and eval loss against the global training step
# plt.style.use("seaborn")
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_loss, label="train/loss")
plt.plot(val_steps, val_loss, label="eval/loss", color="orange")
plt.title('Cross entropy loss for "dfurman/mpt-7b-dolphin"', fontsize=17)
plt.ylabel("Cross entropy loss")
plt.xlabel("Train global step")  # fixed typo in the axis label ("setp")
plt.grid(which="major", axis="y")
plt.legend()
plt.show()

With a supervised finetuned (sft) model in hand, we can test it on some basic prompts and then upload it to the Hugging Face hub either as a public or private model repo, depending on the use case.

In [None]:
# load assets: tokenizer, config and model for the finetuned checkpoint

model_id = "dfurman/mpt-7b-dolphin"

# mpt tokenizer load
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# mpt llm load
config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=True)

# custom options
# NOTE(review): "torch" presumably selects the plain PyTorch attention path rather
# than custom CUDA/triton kernels -- confirm against the MPT model card
config.attn_config["attn_impl"] = "torch"
# NOTE(review): "meta" presumably defers real weight allocation until the checkpoint
# is loaded (it is not a GPU device) -- confirm against the MPT model card
config.init_device = "meta"
# config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
config.torch_dtype = "float16"  # Set float16 data type

# load the weights in float16 with the customized config; device_map="auto" is
# passed so that device placement is chosen automatically
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# last expression of the cell: shows the final config in the notebook output
config

## Basic instruction tests

In [None]:
# text generation function


def mpt_generate(
    model: transformers.AutoModelForCausalLM,
    tokenizer: transformers.AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 1.0,
) -> str:
    """
    Generate a completion for `prompt` and return only the newly generated text.

    Generation options not set explicitly below use the Hugging Face GenerationConfig defaults
        https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig
    Args:
        model (transformers.AutoModelForCausalLM): Model for text generation
        tokenizer (transformers.AutoTokenizer): Tokenizer for model
        prompt (str): Prompt for text generation
        max_new_tokens (int, optional): Max new tokens after the prompt to generate.
            Defaults to 128.
        temperature (float, optional): The value used to modulate the next token probabilities.
            Defaults to 1.0. This function does not enable sampling itself; per the
            Hugging Face generation docs, temperature only matters when sampling is on.
    Returns:
        str: The decoded continuation, with the prompt tokens removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # tokenize inputs, load on device
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    # number of prompt tokens; used below to cut the prompt off the generated sequence
    prompt_length = inputs["input_ids"].shape[1]

    # when running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.
    with torch.autocast("cuda", dtype=torch.float16):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Strip the prompt at the token level rather than slicing the decoded string by
    # len(prompt): decoding is not guaranteed to reproduce the prompt character for
    # character, so a character-based slice can cut into (or leak into) the answer.
    generated_tokens = response["sequences"][0][prompt_length:]

    # grab output in natural language
    return tokenizer.decode(
        generated_tokens,
        skip_special_tokens=True,
    )

In [None]:
# instruction test: open-ended numbered list
prompt = "You are a helpful assistant. Write me a numbered list of things to do in New York City.\n"

response = mpt_generate(model=model, tokenizer=tokenizer, prompt=prompt, max_new_tokens=300, temperature=0.92)

print(response)

In [None]:
# instruction test: step-by-step logical reasoning
prompt = "You are a helpful assistant. Daniel is in need of a haircut. His barber works Mondays, Wednesdays, and Fridays. So, Daniel went in for a haircut on Sunday. Does this make logical sense? Let's work this out in a step by step fashion to make sure we have the right answer.\n"

response = mpt_generate(model=model, tokenizer=tokenizer, prompt=prompt, max_new_tokens=300, temperature=0.92)

print(response)

In [None]:
# instruction test: short-form writing
prompt = "You are a helpful assistant. Write a short email inviting my friends to a dinner party on Friday. Respond succinctly.\n"

response = mpt_generate(model=model, tokenizer=tokenizer, prompt=prompt, max_new_tokens=175, temperature=0.92)

print(response)

In [None]:
# instruction test: completing a recipe from its opening line
prompt = "You are a helpful assistant. Here is a recipe for vegan banana bread:\n"

response = mpt_generate(model=model, tokenizer=tokenizer, prompt=prompt, max_new_tokens=150, temperature=0.92)

print(response)

## Upload model to Hugging Face
1. Before running the cells below, create a model repo on your Hugging Face account. The repo can be either private or public; the code below works with both.

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!git config --global credential.helper store

In [None]:
# push to hub: tokenizer first, then the weights in both serialization formats
model_id_load = "dfurman/mpt-7b-dolphin"

# tokenizer
tokenizer.push_to_hub(model_id_load, use_auth_token=True)
# safetensors
model.push_to_hub(model_id_load, use_auth_token=True, safe_serialization=True)
# torch tensors
# safe_serialization is set explicitly because its library default differs across
# transformers versions; without it this push may upload safetensors a second time
model.push_to_hub(model_id_load, use_auth_token=True, safe_serialization=False)