<h1>Local inference with HF Transformers Llama 3.2</h1>
This notebook uses the HF Transformers Llama 3.2 16-bit and 4-bit quantized models for inference.<br>
This is done to confirm that when running inference locally we see the same errors as with Ollama,<br>
and to compare the results of the 16-bit and 4-bit quantized models.<br>

In [None]:
import os
import sys
import gc
import base64
from pathlib import Path
import pandas as pd
from PIL import Image
from torchvision import transforms
from io import BytesIO
from sklearn.model_selection import train_test_split
from transformers import MllamaForConditionalGeneration, AutoTokenizer, AutoProcessor, BitsAndBytesConfig
import torch

In [None]:
# Hugging Face Hub id of the checkpoint the tokenizer, processor and models are loaded from
hf_model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Prefer CUDA when torch reports it as available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# bitsandbytes settings describing how the quantized copy of the model is loaded
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",  # NF4 quantization data type
    bnb_4bit_use_double_quant=True,  # nested ("double") quantization
    load_in_4bit=True,  # load the weights in 4-bit precision
)

In [None]:
# Cleanup: drop the references left over from a previous run so that their memory
# can be reclaimed before the models are (re)loaded.
model = None
model16 = None  # the 16-bit model holds memory too and must be released as well
tokenizer = None
processor = None
gc.collect()  # collect the now-unreferenced objects first ...
torch.cuda.empty_cache()  # ... then hand the cached CUDA memory back

In [None]:
# Load the tokenizer and processor, then two copies of the model:
#   model   - 4-bit quantized (uses quantization_config)
#   model16 - not quantized, bfloat16 weights
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
processor = AutoProcessor.from_pretrained(hf_model_name)

# Keyword arguments common to both loads
_shared_load_kwargs = dict(
    device_map="auto",  # let the loader place the model layers on the available devices
    torch_dtype=torch.bfloat16,  # bfloat16 precision
    low_cpu_mem_usage=True,
)
model = MllamaForConditionalGeneration.from_pretrained(
    hf_model_name,
    quantization_config=quantization_config,  # only the 4-bit copy is quantized
    **_shared_load_kwargs,
)
model16 = MllamaForConditionalGeneration.from_pretrained(
    hf_model_name,
    **_shared_load_kwargs,
)

In [None]:
# Example: attach the LoRA adapters produced by a fine-tuning session
from peft import PeftModel

lora_adapter_path = f"{os.getcwd()}/trained/watchman_model_last_loras"
print(f"Path to the adapters: {lora_adapter_path}")
# NOTE(review): only `model` is wrapped here; `model16` stays without adapters — confirm intended.
model = PeftModel.from_pretrained(
    model=model,
    model_id=lora_adapter_path,
    is_trainable=False,  # inference only; True would allow fine-tuning to continue
)

In [None]:
# Settings used to locate the collected samples and to build the detection prompt
dataset = "../.data/dataset"  # root folder of the dataset
chans = ["porch"]  # channels whose data is loaded
objs = ["person"]  # objects whose data is loaded
model_name = "ollama-complex"  # key of the model interface used for the inference experiments
c_desc = dict(porch="Porch")  # channel key -> description
o_desc = dict(person="a person")  # object key -> description

In [None]:
# Put the parent folder and its orchestrator folder on the import path, then pull in
# the model interfaces we can use for testing
print(f"Working dir: {os.getcwd()}")
sys.path.extend(os.path.abspath(rel) for rel in ("..", "../orchestrator"))
from shared_settings import *
from model_interfaces import *
print(MODELS)

In [None]:
# Create an instance of the interface registered under model_name in MODELS.
# Author's note: the "connection refused" error can be ignored here, since only the
# prompt from MODEL_INTERFACE is needed.
MODEL_INTERFACE = MODELS[model_name]()

In [None]:
# Helper: build an image from base64-encoded image data
def load_image_from_base64(base64_string):
    """Decode ``base64_string`` and return the image it contains, converted to RGB."""
    raw_bytes = base64.b64decode(base64_string)
    return Image.open(BytesIO(raw_bytes)).convert("RGB")

# Helper: read an image from a file on disk
def load_image_from_file(pname):
    """Open the image stored at ``pname`` and return it converted to RGB."""
    return Image.open(pname).convert("RGB")

def test_inf(model, s, c, o, res):
    """Run one detection inference on a sample and show the image with the answer.

    Args:
        model: model to run; the processed inputs are moved to ``model.device``.
        s: sample directory containing ``image.jpg``.
        c: channel key, looked up in ``c_desc``.
        o: object key, looked up in ``o_desc``.
        res: expected answer; only printed for reference.
    """
    print(f"Model: {model.device} Subdir: {s} Expecting: {res}")
    image_pname = f"{s}/image.jpg"
    # Build the detection prompt from the object and channel descriptions
    prompt = MODEL_INTERFACE.gen_detect_prompt(o_desc[o], c_desc[c])
    # Combine the image with the text input (this tokenizes the text too)
    image = load_image_from_file(image_pname)
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    # Generate the model's completion
    output = model.generate(
        **inputs,
        max_length=200,  # length budget; it covers the prompt tokens as well as the new ones
        temperature=None,  # sampling parameters are unset because sampling is off
        top_p=None,
        do_sample=False,  # greedy decoding
    )
    # Decode only the newly generated tokens. Slicing at the token level avoids
    # relying on the decoded prompt being an exact string prefix of the decoded output.
    prompt_len = inputs.input_ids.shape[-1]
    answer = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    # Show the already loaded image at a quarter of its size
    w, h = image.size
    display(image.resize((w // 4, h // 4)))
    print("Inference result: ", answer)
    print("---------------------------")

In [None]:
# Try inference for both the 4-bit and 16-bit models on one true positive and one
# false positive sample from the dataset.
# A sample folder containing a "skip" entry is ignored; one containing a "no" entry
# is a false positive (expected answer "no"); any other is a true positive ("yes").
for c in chans:
    for o in objs:
        data_dir = f"{dataset}/{c}/{o}"  # not called `dir`, which would shadow the builtin
        # Sorted so the same samples are picked on every run (os.scandir order is arbitrary)
        subdirs = sorted(f.path for f in os.scandir(data_dir) if f.is_dir())
        true_pos = False
        false_pos = False
        for s in subdirs:
            if true_pos and false_pos:
                break  # one sample of each kind has been shown
            if os.path.exists(f"{s}/skip"):
                continue
            is_negative = os.path.exists(f"{s}/no")
            if is_negative and not false_pos:
                test_inf(model, s, c, o, "no")
                test_inf(model16, s, c, o, "no")
                false_pos = True
            elif not is_negative and not true_pos:
                test_inf(model, s, c, o, "yes")
                test_inf(model16, s, c, o, "yes")
                true_pos = True

Surprisingly, the 4-bit quantized model does not produce the false positives that the 16-bit one does.
Let's run a full sweep over the collected data to compare how the two models perform.

In [None]:
def check_inf(model, s, c, o, res):
    """Run one detection inference on a sample and score it against the expected answer.

    Args:
        model: model to run; the processed inputs are moved to ``model.device``.
        s: sample directory containing ``image.jpg``.
        c: channel key, looked up in ``c_desc``.
        o: object key, looked up in ``o_desc``.
        res: expected answer (the callers pass "yes" or "no").

    Returns:
        0 if ``res`` occurs as a whole word in the lower-cased generated text,
        1 (an error) otherwise.
    """
    import re  # local import keeps this function self-contained

    image_pname = f"{s}/image.jpg"
    # Build the detection prompt from the object and channel descriptions
    prompt = MODEL_INTERFACE.gen_detect_prompt(o_desc[o], c_desc[c])
    # Combine the image with the text input (this tokenizes the text too)
    image = load_image_from_file(image_pname)
    inputs = processor(image, prompt, return_tensors="pt").to(model.device)
    # Generate the model's completion
    output = model.generate(
        **inputs,
        max_length=200,  # length budget; it covers the prompt tokens as well as the new ones
        temperature=None,  # sampling parameters are unset because sampling is off
        top_p=None,
        do_sample=False,  # greedy decoding
    )
    # Decode only the newly generated tokens. Slicing at the token level avoids
    # relying on the decoded prompt being an exact string prefix of the decoded output.
    prompt_len = inputs.input_ids.shape[-1]
    answer = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).lower()
    # Whole-word match: a plain substring search would accept "no" inside words
    # such as "not", "know" or "notice" and hide real errors.
    found = re.search(rf"\b{re.escape(res)}\b", answer) is not None
    return 0 if found else 1

# Full sweep over the collected samples, counting the errors of each model.
# The *_no counters count samples where "no" was expected but not found in the answer;
# the *_yes counters count samples where "yes" was expected but not found.
# NOTE(review): the summary line labels the *_no counters "false No" and the *_yes
# counters "false Yes" — confirm the labels carry the intended meaning.
total = 0
total_4b_no = 0
total_16b_no = 0
total_4b_yes = 0
total_16b_yes = 0
for c in chans:
    for o in objs:
        data_dir = f"{dataset}/{c}/{o}"  # not called `dir`, which would shadow the builtin
        # Sorted so the progress output is comparable between runs (os.scandir order is arbitrary)
        subdirs = sorted(f.path for f in os.scandir(data_dir) if f.is_dir())
        for s in subdirs:
            if os.path.exists(f"{s}/skip"):
                continue
            # Only the expected answer is scored; the counters of the other answer stay at 0
            cur_err_no = cur_err16_no = 0
            cur_err_yes = cur_err16_yes = 0
            if os.path.exists(f"{s}/no"):
                cur_err_no = check_inf(model, s, c, o, "no")
                cur_err16_no = check_inf(model16, s, c, o, "no")
            else:
                cur_err_yes = check_inf(model, s, c, o, "yes")
                cur_err16_yes = check_inf(model16, s, c, o, "yes")
            total_4b_no += cur_err_no
            total_16b_no += cur_err16_no
            total_4b_yes += cur_err_yes
            total_16b_yes += cur_err16_yes
            total += 1
            dn = os.path.basename(s)
            # The status line is overwritten in place ("\r"); it is kept on screen
            # (newline) only when the two models disagree on this sample.
            print(f"\rTotal:{total} err16b_no:{cur_err16_no} err4b_no:{cur_err_no} err16b_yes:{cur_err16_yes} err4b_yes:{cur_err_yes} Subdir:{c}/{o}/{dn}      ", end=("\n" if (cur_err16_no != cur_err_no or cur_err16_yes != cur_err_yes) else ""))

print("\n----------------------------------------------------------")
print(f"\nSummary: out of {total} false No 4bit:{total_4b_no} 16bit:{total_16b_no}, false Yes 4bit:{total_4b_yes} 16bit:{total_16b_yes}")


It looks like the 4-bit quantized model was a bit better at negative detection. It missed some true positive cases, though.<br>
It's quite possible that this is just a side effect of the dataset containing only the "positives" that the 16-bit model detected (including its errors).<br>
The 4-bit model might have made more false positive mistakes, just on different images (not represented here, since the 16-bit model rejected them correctly).

Note on GPU parallelism: MllamaForConditionalGeneration.from_pretrained(..., device_map="auto", ...) should not be combined with .to(device), as it manages GPU placement internally. The "The model weights are not tied..." message can be ignored (it's incorrect).