In [1]:
import os

# Third-party imports are kept in their original relative order on purpose:
# Unsloth patches other libraries at import time, so reordering is avoided.
import torch
import pandas as pd
from unsloth import FastLanguageModel
from datasets import load_from_disk
from tqdm import tqdm
import numpy as np
from PIL import Image

# Define paths
# MODEL_OUTPUT_DIR = "outputs/llama-3.2-11b-surgical-cholecT50---10frames_proc+task"
# BASE_MODEL_NAME = "nvidia/Llama-3.2-11B-Vision-Surgical-CholecT50"

# MODEL_OUTPUT_DIR = "outputs/llama-3.2-11b---10frames_proc+task"
MODEL_OUTPUT_DIR = "outputs/llama-3.2-11b---10frames_proc+task_strict_instruction"
BASE_MODEL_NAME = "nvidia/Llama-3.2-11B-Vision"
LORA_ADAPTERS_PATH = f"{MODEL_OUTPUT_DIR}/lora_model"
# Derived from MODEL_OUTPUT_DIR instead of repeating the directory literal, so
# switching the run above cannot silently evaluate against another run's split.
# Resolves to exactly the path that was previously hard-coded here.
# NOTE(review): if every run is meant to share one fixed validation split,
# pin this to a literal path instead.
VALIDATION_DATA_PATH = f"{MODEL_OUTPUT_DIR}/val_ds"

# Inference configuration
BATCH_SIZE = 4  # Adjust based on your GPU memory
MAX_NEW_TOKENS = 100  # Max tokens to generate for each response

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Restore the fine-tuned model and its tokenizer from the saved LoRA adapters
_load_options = dict(
    model_name=LORA_ADAPTERS_PATH,  # directory the LoRA adapters were saved to
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)

# Switch to evaluation mode; as the cell's last expression, the returned
# module is also echoed as the cell output.
model.eval()

==((====))==  Unsloth 2025.8.5: Fast Mllama patching. Transformers: 4.55.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.635 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MllamaForConditionalGeneration(
  (model): MllamaModel(
    (vision_model): MllamaVisionModel(
      (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
      (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
        (tile_embedding): Embedding(9, 8197120)
      )
      (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
        (embedding): Embedding(9, 5120)
      )
      (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
        (embedding): Embedding(9, 5120)
      )
      (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (transformer): MllamaVisionEncoder(
        (layers): ModuleList(
          (0-31): 32 x MllamaVisionEncoderLayer(
            (self_attn): MllamaVisionAttention(
              (q_proj): Linear4bit(in_features=1280, out_features=1280, bias=False)
       

In [3]:
# Read the saved validation split back from disk
val_ds = load_from_disk(VALIDATION_DATA_PATH)

# Header line followed by the dataset summary, one per line
print("Validation dataset loaded:", val_ds, sep="\n")

Validation dataset loaded:
Dataset({
    features: ['answer', 'cvid', 'instrument', 'action', 'tissue', 'procedure', 'task', 'i', 'second', 'image', 'text'],
    num_rows: 8420
})


In [4]:
# Instruction and question templates; these are meant to mirror the ones used during training.
# Previous instruction wording, kept for reference:
# instruction = 'Respond with 1-2 sentences of actionable feedback focused on the most teachable instrument-action-tissue event.'
#
# NOTE(review): "actionabl" below looks like a typo, but it is part of the
# runtime prompt. It is left untouched so the prompt stays identical to the
# training prompt -- confirm against the training notebook before correcting it.
instruction = (
    "Generate actionabl surgical feedback to a trainee surgeon in the context of robot-assisted urology surgery. "
    "Your output must be maximum 1 sentence. "
    "The surgical feedback must focus on the most teachable instrument-action-tissue event. "
    "The feedback must be concise and to the point. "
    "Do not generate any other information or context. "
    "Do not generate any additional text or explanation."
)

# Two-line question block filled in per example with its procedure and task
question_template = "\n".join(("Procedure: {procedure}", "Task: {task}"))

def create_inference_prompt(example):
    """
    Build the text prompt for one dataset row, ready for generation.

    Args:
        example: Mapping with at least the keys "procedure" and "task".

    Returns:
        The chat-templated prompt string containing a single user turn
        (instruction text, an image placeholder, then the question text),
        with the generation prompt appended.
    """
    # The content layout must match the one used in `formatting_prompts_func` during training
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image"},
            {
                "type": "text",
                "text": question_template.format(
                    procedure=example["procedure"], task=example["task"]
                ),
            },
        ],
    }

    # add_generation_prompt=True is what makes the template suitable for inference
    return tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )


print("Example inference prompt:\n", create_inference_prompt(val_ds[0]))

Example inference prompt:
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>

Generate actionabl surgical feedback to a trainee surgeon in the context of robot-assisted urology surgery. Your output must be maximum 1 sentence. The surgical feedback must focus on the most teachable instrument-action-tissue event. The feedback must be concise and to the point. Do not generate any other information or context. Do not generate any additional text or explanation.<|image|>Procedure: radical prostatectomy
Task: Endopelvic fascia<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [5]:
def save_results(cvids, ground_truths, predictions, procedures, tasks, idxs, seconds):
    """
    Write the collected inference results to CSV and preview the first rows.

    Each argument is a sequence holding one value per sample; together they
    become the columns 'cvid', 'ground_truth', 'prediction', 'procedure',
    'task', 'idx' and 'seconds' of the saved table.

    Side effects:
        - Overwrites ``{MODEL_OUTPUT_DIR}/inference_results.csv`` (no index column).
        - Prints a completion message and a header for the preview.
        - Sets the pandas option 'display.max_colwidth' to None.
        - Displays the first 10 rows via ``display``.
    """
    column_names = ('cvid', 'ground_truth', 'prediction', 'procedure', 'task', 'idx', 'seconds')
    column_values = (cvids, ground_truths, predictions, procedures, tasks, idxs, seconds)
    results_df = pd.DataFrame(dict(zip(column_names, column_values)))

    # Persist the table next to the model outputs
    output_csv_path = f"{MODEL_OUTPUT_DIR}/inference_results.csv"
    results_df.to_csv(output_csv_path, index=False)
    print(f"Inference complete. Results saved to {output_csv_path}")

    # Quick visual check: first 10 rows with untruncated text columns
    print("\nSample Inference Results:")
    pd.options.display.max_colwidth = None
    display(results_df.head(10))

In [None]:
# Run batched generation over the validation set, collecting each prediction
# together with the metadata needed to line it up with its ground truth.
# (`tqdm` and `os` come from the notebook's top import cell.)
predictions = []
ground_truths = []
cvids = []
procedures = []
tasks = []
idxs = []
seconds = []

os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Disable parallelism to avoid warnings

# Batched generation with a decoder-only model needs LEFT padding: with right
# padding, shorter prompts in a batch would be continued after their pad
# tokens. The slice `generated_ids[:, prompt_len:]` below also assumes every
# prompt ends at the same column. `tokenizer` may be a processor wrapping the
# text tokenizer, hence the getattr fallback.
# This setting persists on the tokenizer object; reset it to "right" before
# reusing the same object for training.
_text_tokenizer = getattr(tokenizer, "tokenizer", tokenizer)
_text_tokenizer.padding_side = "left"

# How often (in batches) partial results are flushed to disk, so a crash late
# in a long run does not lose everything generated so far.
CHECKPOINT_EVERY_N_BATCHES = 50
_checkpoint_csv_path = f"{MODEL_OUTPUT_DIR}/inference_results.csv"


def _write_checkpoint():
    """Quietly write everything collected so far to the results CSV (no printing, no display)."""
    pd.DataFrame({
        'cvid': cvids,
        'ground_truth': ground_truths,
        'prediction': predictions,
        'procedure': procedures,
        'task': tasks,
        'idx': idxs,
        'seconds': seconds
    }).to_csv(_checkpoint_csv_path, index=False)


# Process the dataset in batches
batch_starts = range(0, len(val_ds), BATCH_SIZE)
for batch_number, start in enumerate(tqdm(batch_starts, desc="Running Inference"), start=1):
    # Slicing the dataset yields a dict of column -> list; turn it into one dict per row
    batch_columns = val_ds[start : start + BATCH_SIZE]
    n_rows = len(batch_columns['image'])
    batch_data = [
        {key: values[row] for key, values in batch_columns.items()}
        for row in range(n_rows)
    ]

    # 1. Prepare batch inputs
    prompts = [create_inference_prompt(ex) for ex in batch_data]

    # One image per prompt, passed as a nested list (one inner list per sample);
    # images are converted to numpy arrays first.
    images_nested = [[np.array(ex['image'])] for ex in batch_data]

    # 2. Tokenize text and prepare inputs for the model
    inputs = tokenizer(
        text=prompts,
        images=images_nested,
        return_tensors="pt",
        padding=True,
    ).to("cuda")

    # 3. Generate text
    # NOTE(review): use_cache=False disables the KV cache, which makes
    # generation much slower. It is kept as-is because it may be a deliberate
    # workaround -- confirm before enabling the cache.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=False,
        )

    # 4. Decode only the newly generated tokens, skipping special tokens
    prompt_len = inputs.input_ids.shape[1]
    decoded_outputs = tokenizer.batch_decode(generated_ids[:, prompt_len:], skip_special_tokens=True)

    # 5. Store results
    predictions.extend(decoded_outputs)
    ground_truths.extend([ex['answer'] for ex in batch_data])
    cvids.extend([ex['cvid'] for ex in batch_data])
    procedures.extend([ex['procedure'] for ex in batch_data])
    tasks.extend([ex['task'] for ex in batch_data])
    idxs.extend([ex['i'] for ex in batch_data])
    seconds.extend([ex['second'] for ex in batch_data])

    # 6. Periodic silent checkpoint. The full save (with its "Inference
    # complete" message and preview table) happens once, after the loop.
    if batch_number % CHECKPOINT_EVERY_N_BATCHES == 0:
        _write_checkpoint()

# Final save + preview, once every batch has been processed
save_results(
    cvids=cvids,
    ground_truths=ground_truths,
    predictions=predictions,
    procedures=procedures,
    tasks=tasks,
    idxs=idxs,
    seconds=seconds
)

In [None]:
# Gather the per-sample lists into a single table, one column per list.
# NOTE(review): this cell repeats the body of `save_results`. It is kept
# because it also leaves `results_df` and `output_csv_path` bound at notebook scope.
results_df = pd.DataFrame(
    dict(
        cvid=cvids,
        ground_truth=ground_truths,
        prediction=predictions,
        procedure=procedures,
        task=tasks,
        idx=idxs,
        seconds=seconds,
    )
)

# Write the table next to the model outputs as CSV (no index column)
output_csv_path = f"{MODEL_OUTPUT_DIR}/inference_results.csv"
results_df.to_csv(output_csv_path, index=False)
print(f"Inference complete. Results saved to {output_csv_path}")

# Preview the first 10 rows with untruncated text columns
print("\nSample Inference Results:")
pd.options.display.max_colwidth = None
display(results_df.head(10))