In [1]:
from transformers import pipeline
from peft import PeftModel
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)
import torch

  from .autonotebook import tqdm as notebook_tqdm
2024-01-29 13:03:50.425357: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-29 13:03:50.427655: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-29 13:03:50.478824: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

def load_model_with_adapters(base_model_id: str,
                             adapter_dict: dict[str, str] = None):
    """Creates a base llama model with the correct quantization settings.
    Also returns the tokenizer."""
    
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=getattr(torch, "float16"),
        bnb_4bit_use_double_quant=False,
    )

    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True
    )
    
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id,
        add_bos_token=True, trust_remote_code=True)

    if adapter_dict:
        for adapter_name, adapter_path in adapter_dict.items():
            model = PeftModel.from_pretrained(model=model, 
                                            model_id=adapter_path,
                                            adapter_name=adapter_name,
                                            offload_folder="offload")
        # This basically creates a llama model and not a Peft Model.
        # model = model.merge_and_unload()
    
    return model, tokenizer

In [3]:
llama, llama_tokenizer = load_model_with_adapters(
    base_model_id="meta-llama/Llama-2-7b-chat-hf"
)

inference_pipeline = pipeline(
    task="text-generation",
    model=llama,
    tokenizer=llama_tokenizer
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.18s/it]


In [4]:
conversation = [
    {"role": "user", "content": "hi, how are you?"},
    {"role": "system", "content": "I'm good, how are you?"},
    {"role": "user", "content": "I love to eat meat"},
]

conversation = llama_tokenizer.apply_chat_template(conversation, tokenize=False)

In [5]:
inference_pipeline(conversation, max_new_tokens=64, prompt_lookup_num_tokens=10, return_full_text=False)

[{'generated_text': "  I'm just an AI, I don't have personal preferences or taste buds, but I can provide information on different types of meat and their nutritional values if you're interested! 😊\n\nMeat is a good source of protein, vitamins,"}]