# 4-bit (NF4) mixed-precision inference

The model is loaded with [`load_in_4bit=True`](https://huggingface.co/docs/transformers/perf_infer_gpu_one#running-fp4-models-single-gpu-setup-quickstart), using the NF4 quantization type and float16 as the compute dtype. Compare to the baseline performance [here](https://e2-dogfood.staging.cloud.databricks.com/?o=6051921418418893#notebook/418210139975057).

- Low memory usage; reduced speed compared to baseline but faster than int8

In [0]:
%pip install --upgrade torch transformers accelerate huggingface_hub bitsandbytes
# Restart the Python process after the upgrade; presumably needed so the cells
# below import the freshly installed package versions (Databricks-specific call).
dbutils.library.restartPython()

In [0]:
from utils import generate_text, clear_model, torch_profile_to_dataframe, wrap_module_with_profiler
import huggingface_hub
import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer, pipeline
import os
import datetime
import time
import accelerate



# Benchmark prompts, ordered from shortest to longest, shared by every
# generation and profiling cell in this notebook.
prompts = [
    "Dreams are",
    "The future of technology is",
    "In a world where magic exists,",
    "The most influential person in history is",
    "One of the most intriguing mysteries of the universe is",
    "When humans finally ventured out into the cosmos, they discovered",
    "The relationship between artificial intelligence and humanity has always been",
    "As the boundaries of science and fiction blur, the implications for society become",
    "In the depths of the enchanted forest, ancient creatures and forgotten tales come to life, revealing",
    "While many believe that technological advancements will be the key to solving humanity's greatest challenges, others argue that it will only exacerbate existing inequalities, leading to",
]

In [0]:
# Authenticate this session with the Hugging Face Hub before downloading the
# checkpoint. NOTE(review): presumably required because the Llama 2 weights are
# access-gated — confirm.
huggingface_hub.login()

In [0]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

# Single source of truth for the checkpoint so the tokenizer and the model
# cannot drift apart.
MODEL_ID = "meta-llama/Llama-2-7b-hf"

# Left padding keeps the end of every prompt adjacent to the tokens being
# generated, which is what a decoder-only model needs for batched generation.
# `use_cache` is intentionally not passed here: it is a model/generation
# option, not a tokenizer option, so it had no effect on tokenization.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

# bitsandbytes 4-bit quantization: NF4 quant type, no double quantization,
# float16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# `trust_remote_code` is deliberately left at its default (False): the Llama
# architecture ships with transformers, so no repository-provided code is
# needed, and enabling the flag would allow arbitrary code from the Hub
# repository to execute in this environment.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=nf4_config,
)

## Inspect Model

In [0]:
# Bare expression: the notebook renders the model's repr as the cell output.
model

# Throughput and Memory

## Serial inputs

In [0]:
# Serial run: `batch=False` is passed to the project helper, generating up to
# 50 new tokens per prompt; the results are shown as a DataFrame.
out = generate_text(
    prompts,
    model,
    tokenizer,
    batch=False,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=50,
)
pd.DataFrame(out)

## Batch inputs

In [0]:
# Batched run: same settings as the serial cell except `batch=True`, so the
# two result tables can be compared directly.
out = generate_text(
    prompts,
    model,
    tokenizer,
    batch=True,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=50,
)
pd.DataFrame(out)

# Torch Profiling -- Basic

In [0]:
import torch.profiler as profiler

# Profile a short generation run (10 new tokens) on both CPU and CUDA,
# recording operator input shapes and memory usage.
with profiler.profile(
    activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
) as prof:
    output = generate_text(
        prompts,
        model,
        tokenizer,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=10,
    )

# Show the profile as a table, largest "Self CUDA %" first.
torch_profile_to_dataframe(prof).sort_values("Self CUDA %", ascending=False)