In [2]:
%load_ext autoreload
%autoreload 2
from transformers import AutoModel, AutoTokenizer
import torch
from model.openllama import OpenLLAMAPEFTModel

# Model setup. Every entry of `args` is forwarded to OpenLLAMAPEFTModel as a
# keyword argument; 'delta_ckpt_path' is read a second time below to locate
# the checkpoint file that is loaded on top of the freshly built model.
args = {
    'model': 'openllama_peft',
    'imagebind_ckpt_path': '../pretrained_ckpt/imagebind_ckpt',
    'vicuna_ckpt_path': '../pretrained_ckpt/vicuna_ckpt/converted/vicuna_full',
    'delta_ckpt_path': '../pretrained_ckpt/pandagpt_ckpt/7b/pandagpt_7b_max_len_1024/pytorch_model.pt',
    'stage': 2,
    'max_tgt_len': 128,
    'lora_r': 32,
    'lora_alpha': 32,
    'lora_dropout': 0.1,
}

print("Initializing model...")
model = OpenLLAMAPEFTModel(**args)

# Deserialize the checkpoint onto the CPU first, then copy it into the model.
# strict=False: keys that do not line up between checkpoint and model are
# tolerated instead of raising, so a wrong checkpoint can go unnoticed.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
delta_ckpt = torch.load(
    args['delta_ckpt_path'],
    map_location=torch.device('cpu'),
)
model.load_state_dict(delta_ckpt, strict=False)

# Switch to evaluation mode, cast to half precision, and move to CUDA.
model = (
    model
    .eval()
    .half()
    .cuda()
)
print("Model initialized.")

def generate_response(prompt_text, image_path=None, audio_path=None, video_path=None, thermal_path=None, top_p=0.01, temperature=1.0, max_length=128):
    """Issue a single ``model.generate`` call and hand back its result.

    Relies on the notebook-level ``model`` object; it is not passed in.

    Each ``*_path`` argument is optional. A falsy value (``None`` or ``""``)
    is sent as an empty list; any other value is sent as a one-element list.

    Args:
        prompt_text: Value stored under the ``'prompt'`` key.
        image_path: Optional path forwarded under ``'image_paths'``.
        audio_path: Optional path forwarded under ``'audio_paths'``.
        video_path: Optional path forwarded under ``'video_paths'``.
        thermal_path: Optional path forwarded under ``'thermal_paths'``.
        top_p: Forwarded unchanged under ``'top_p'``.
        temperature: Forwarded unchanged under ``'temperature'``.
        max_length: Forwarded under ``'max_tgt_len'``.

    Returns:
        Whatever ``model.generate`` returns for the assembled request.
    """
    def _as_list(path):
        # A falsy path means "no input for this modality".
        return [path] if path else []

    request = {
        'prompt': prompt_text,
        'image_paths': _as_list(image_path),
        'audio_paths': _as_list(audio_path),
        'video_paths': _as_list(video_path),
        'thermal_paths': _as_list(thermal_path),
        'top_p': top_p,
        'temperature': temperature,
        'max_tgt_len': max_length,
        # A new empty list is supplied on every call.
        'modality_embeds': [],
    }
    return model.generate(request)



Initializing model...
Initializing visual encoder from ../pretrained_ckpt/imagebind_ckpt ...
Visual encoder initialized.
Initializing language decoder from ../pretrained_ckpt/vicuna_ckpt/converted/vicuna_full ...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


trainable params: 33554432 || all params: 6771978240 || trainable%: 0.49548936530546206
Language decoder initialized.
Model initialized.


In [7]:
# Demo: ask the model to describe one image.
prompt_text = "Can you describe the image?"
# Machine-specific absolute path -- point this at an image on your own system.
image_path = "/home/tz362/Desktop/usenix_artifact_eval/adversarial_illusions/outputs/assets/street.png"

# Only the image is supplied; every other argument keeps its default value.
response = generate_response(prompt_text=prompt_text, image_path=image_path)
print("Response:", response)


Response: The image shows a man standing in a field, holding a gun in his hand. He is wearing a white shirt and black pants, and appears to be pointing the gun towards the camera. The man's face is obscured by the gun, but it is clear that he is holding it with a serious expression. The background of the image is a grassy field with some trees visible in the distance.
