# Using QVision MultiModel from Hugging Face Hub

This notebook demonstrates how to load and use the QVision MultiModel directly from Hugging Face Hub for image captioning tasks.

The model is available at: [verma75preetam/qvision-mutlimodel-base](https://huggingface.co/verma75preetam/qvision-mutlimodel-base)

## 1. Setup and Installation

First, let's install the required packages:

In [None]:
# Install required packages
!pip install huggingface_hub safetensors transformers matplotlib torch pillow

## 2. Import Required Libraries

In [None]:
import os
import sys
import torch
import json
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import transforms
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

## 3. Download Model Files from Hugging Face Hub

Now, we'll download the model files directly from Hugging Face Hub:

In [None]:
# Hugging Face Hub repository that hosts the weights and helper scripts
model_name = "verma75preetam/qvision-mutlimodel-base"


def _fetch(filename):
    """Download `filename` from the `model_name` repository and return the result of hf_hub_download."""
    return hf_hub_download(repo_id=model_name, filename=filename)


print("Downloading model files...")

# Configuration and weight files
config_path = _fetch("config.json")
encoder_path = _fetch("encoder.safetensors")
decoder_path = _fetch("decoder.safetensors")
gpt_decoder_path = _fetch("gpt_decoder.safetensors")

# Python helper modules published alongside the weights
encoder_py_path = _fetch("encoder.py")
decoder_model_py_path = _fetch("decoder_model.py")
generate_py_path = _fetch("generate.py")
utils_py_path = _fetch("utils.py")

# Report where the config and weight files ended up
print(f"Config path: {config_path}")
print(f"Encoder model path: {encoder_path}")
print(f"Decoder model path: {decoder_path}")
print(f"GPT decoder model path: {gpt_decoder_path}")

## 4. Import Model Classes

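Add the directory containing the downloaded scripts to the Python path and import the model classes. Note that importing these modules executes code downloaded from the repository, so only run this cell for repositories you trust: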

In [None]:
# Make the downloaded helper scripts importable. Only the directory of
# encoder.py is added; this assumes decoder_model.py, generate.py and utils.py
# were downloaded into the same directory -- TODO confirm.
scripts_dir = os.path.dirname(encoder_py_path)
if scripts_dir not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(scripts_dir)

# NOTE(review): the imports below execute Python code fetched from the Hub
# repository; only run this against a repository you trust.
try:
    from encoder import CLIPEncoder
    from decoder_model import ResnetGPT2Wrapper
    from transformers import GPTNeoForCausalLM, AutoTokenizer
    from generate import generate_caption, visualize_caption
    print("Successfully imported model classes!")
except Exception as e:
    print(f"Error importing model classes: {e}")
    # Every later cell needs these names, so stop here with the real cause
    # instead of failing further down with an unrelated NameError.
    raise

## 5. Initialize the Tokenizer

In [None]:
# Load the tokenizer and register the caption delimiter tokens
print("Loading tokenizer...")
# NOTE(review): "GPT-NEO-350M" has no organisation prefix, so it presumably
# refers to a local directory of that name -- confirm the intended model id.
tokenizer = AutoTokenizer.from_pretrained("GPT-NEO-350M")
special_tokens = {"additional_special_tokens": ["<START>", "<END>"]}
tokenizer.add_special_tokens(special_tokens)

# Fall back to the end-of-sequence token when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

# len(tokenizer) counts the special tokens added above, whereas
# tokenizer.vocab_size reports only the base vocabulary.
print(f"Tokenizer ready with vocabulary size: {len(tokenizer)}")

## 6. Load Model Configuration

In [None]:
# Read config.json (downloaded earlier) into a plain dictionary
print("Loading model configuration...")
with open(config_path) as config_file:
    config = json.load(config_file)

# Echo every setting so the reader can see what the model expects
print("Model configuration:")
for key in config:
    print(f"  - {key}: {config[key]}")

## 7. Initialize Model Components

In [None]:
# Build the image encoder; its only argument is the embedding size from config.json
print("Initializing encoder...")
encoder = CLIPEncoder(config['embed_size'])

# Load the GPT-Neo language model that is handed to the decoder wrapper below.
# NOTE(review): "GPT-NEO-350M" has no organisation prefix, so it presumably
# refers to a local directory of that name -- confirm the intended model id.
print("Initializing GPT model...")
gpt_model = GPTNeoForCausalLM.from_pretrained("GPT-NEO-350M")
gpt_model.resize_token_embeddings(gpt_model.get_input_embeddings().num_embeddings + 2)  # +2 rows for the <START> and <END> special tokens added to the tokenizer

# Wrap the language model in the decoder class from decoder_model.py.
# NOTE(review): config['vocab_size'] presumably has to match the resized
# embedding size above -- confirm against config.json.
print("Initializing decoder...")
decoder = ResnetGPT2Wrapper(
    gpt_decoder=gpt_model,
    embed_size=config['embed_size'],
    vocab_size=config['vocab_size'],
    num_img_tokens=config['num_img_tokens'],
    pad_token_id=pad_token_id
)

print("Model components initialized successfully!")

## 8. Load Model Weights

In [None]:
# Read the three safetensors checkpoints from disk
print("Loading model weights from safetensors...")
encoder_state_dict, decoder_state_dict, gpt_decoder_state_dict = [
    load_file(weights_path)
    for weights_path in (encoder_path, decoder_path, gpt_decoder_path)
]

# Copy the tensors into the already-constructed modules: encoder first, then
# the decoder wrapper, then the language model held by the wrapper.
print("Applying weights to model components...")
encoder.load_state_dict(encoder_state_dict)
decoder.load_state_dict(decoder_state_dict)
decoder.gpt_decoder.load_state_dict(gpt_decoder_state_dict)

print("Model weights loaded successfully!")

## 9. Prepare Model for Inference

In [None]:
# Choose the compute device: Apple MPS first, then CUDA, otherwise the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

# Place each model on the chosen device, then cast it to bfloat16
encoder = encoder.to(device)
encoder = encoder.to(torch.bfloat16)
decoder = decoder.to(device)
decoder = decoder.to(torch.bfloat16)

# Switch both models to evaluation mode before running inference
encoder.eval()
decoder.eval()

print("Model ready for inference!")

## 10. Define Helper Functions for Inference

In [None]:
def process_image(image_path, device):
    """Load an image from disk and turn it into a model input tensor.

    Args:
        image_path: Path of the image file to open.
        device: Device the resulting tensor is moved to.

    Returns:
        A ``(image, image_tensor)`` tuple: the RGB PIL image, and the same
        image resized to 224x224, converted with ``ToTensor`` and given a
        leading batch dimension of size 1, placed on ``device``.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixel data has been copied into the converted RGB image.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Fixed-size resize followed by conversion to a tensor
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    # unsqueeze(0) adds the batch dimension in front
    image_tensor = transform(image).unsqueeze(0).to(device)

    return image, image_tensor

def display_image_with_caption(image, caption):
    """Show `image` in the left panel and its caption text in the right panel."""
    fig, (image_ax, text_ax) = plt.subplots(1, 2, figsize=(10, 6))

    # Left panel: the picture itself
    image_ax.imshow(image)
    image_ax.set_title("Input Image")
    image_ax.axis("off")

    # Right panel: the caption rendered as wrapped text on blank axes
    text_ax.text(0.1, 0.5, f"Caption: {caption}", fontsize=12, wrap=True)
    text_ax.axis("off")
    text_ax.set_title("Generated Caption")

    fig.tight_layout()
    plt.show()

## 11. Download a Sample Image for Testing

We'll download a sample image from Hugging Face Hub for testing purposes:

In [None]:
# Fetch the example picture stored in the model repository
sample_image_path = hf_hub_download(repo_id=model_name, filename="sample_image.jpg")
print(f"Sample image downloaded to: {sample_image_path}")

# Preview the picture on a single set of axes without ticks
sample_image = Image.open(sample_image_path)
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(sample_image)
ax.set_title("Sample Image")
ax.axis("off")
plt.show()

## 12. Generate Caption for the Sample Image

In [None]:
# Load the sample picture and build the tensor fed to the model
image, image_tensor = process_image(sample_image_path, device)

# Sampling settings forwarded to generate_caption
sampling_options = {"temperature": 0.7, "repetition_penalty": 1.5}

print("Generating caption...")
with torch.no_grad():  # inference only, so gradient tracking is switched off
    caption, generated_ids, _ = generate_caption(
        image_tensor, encoder, decoder, tokenizer, device, **sampling_options
    )

print(f"\nGenerated caption: {caption}")

# Show the picture next to the generated text
display_image_with_caption(image, caption)

## 13. Try with Your Own Images

You can upload your own images and generate captions for them:

In [None]:
def _uploaded_bytes(uploaded_file):
    """Extract the raw bytes from the supported kinds of upload object."""
    payload = uploaded_file
    # ipywidgets FileUpload entries are dict-like and keep the data under "content"
    if isinstance(payload, dict):
        payload = payload["content"]
    if isinstance(payload, (bytes, bytearray, memoryview)):
        return bytes(payload)
    # File-like objects such as io.BytesIO
    if hasattr(payload, "getvalue"):
        return payload.getvalue()
    if hasattr(payload, "read"):
        return payload.read()
    raise TypeError(
        f"Unsupported upload object of type {type(uploaded_file).__name__}; "
        "expected bytes, a file-like object, or a dict with a 'content' entry"
    )


def caption_uploaded_image(uploaded_file):
    """Save an uploaded image, caption it, display the result and return the caption.

    Args:
        uploaded_file: The uploaded image. Accepted forms are an object with
            ``getvalue()`` or ``read()`` (e.g. ``io.BytesIO``), a bytes-like
            object, or a dict-like entry holding the data under ``"content"``
            (the form produced by the ipywidgets ``FileUpload`` widget).

    Returns:
        The generated caption string.

    Raises:
        TypeError: If ``uploaded_file`` is none of the accepted forms.
    """
    # Resolve the bytes first so an unsupported object fails before any file is written
    image_bytes = _uploaded_bytes(uploaded_file)

    # Save the uploaded file (written to the current working directory)
    with open("uploaded_image.jpg", "wb") as f:
        f.write(image_bytes)

    # Process the image
    image, image_tensor = process_image("uploaded_image.jpg", device)

    # Generate caption
    print("Generating caption...")
    with torch.no_grad():
        caption, generated_ids, _ = generate_caption(
            image_tensor,
            encoder,
            decoder,
            tokenizer,
            device,
            temperature=0.7,
            repetition_penalty=1.5
        )

    # Display the image with its caption
    display_image_with_caption(image, caption)

    return caption

# Note: If running in Jupyter Notebook, you can use the following code for file upload
# from IPython.display import display
# from ipywidgets import FileUpload
# uploader = FileUpload(accept='image/*', multiple=False)
# display(uploader)

# After upload, you can call (uploader.value is a tuple of entries in
# ipywidgets 8 and a dict keyed by filename in ipywidgets 7):
# if uploader.value:
#     entries = uploader.value
#     first = entries[0] if isinstance(entries, (tuple, list)) else next(iter(entries.values()))
#     caption = caption_uploaded_image(first)

## 14. Generate Captions for Multiple Images

If you have a directory of images, you can generate captions for several of them at once (the helper below processes at most `limit` images, 5 by default):

In [None]:
def generate_captions_for_directory(image_dir, limit=5):
    """Generate and display captions for the image files in a directory.

    Args:
        image_dir: Directory to scan (not recursively) for files ending in
            .jpg, .jpeg, .png, .bmp or .webp (case-insensitive).
        limit: Maximum number of images to process; ``None`` processes every
            image found.

    Returns:
        A list of ``(image_path, caption, image)`` tuples, one per image that
        was captioned successfully. An image whose processing raises is
        reported on stdout and skipped.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".webp")

    # os.listdir returns entries in arbitrary order; sorting makes the subset
    # kept under `limit` the same on every run. The isfile check skips
    # directories whose names merely look like image files.
    image_files = [
        os.path.join(image_dir, name)
        for name in sorted(os.listdir(image_dir))
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(image_dir, name))
    ]

    # Limit the number of images processed
    if limit is not None and len(image_files) > limit:
        print(f"Found {len(image_files)} images, limiting to {limit} as specified.")
        image_files = image_files[:limit]
    else:
        print(f"Found {len(image_files)} images.")

    # Caption each image; one failing image must not abort the whole batch
    results = []
    for i, img_path in enumerate(image_files):
        print(f"\nProcessing image {i+1}/{len(image_files)}: {os.path.basename(img_path)}")

        try:
            image, image_tensor = process_image(img_path, device)

            with torch.no_grad():
                caption, generated_ids, _ = generate_caption(
                    image_tensor,
                    encoder,
                    decoder,
                    tokenizer,
                    device,
                    temperature=0.7,
                    repetition_penalty=1.5
                )

            print(f"Caption: {caption}")
            results.append((img_path, caption, image))

        except Exception as e:
            print(f"Error processing {img_path}: {e}")

    # Display results: one row per image, picture on the left, caption on the right
    n_images = len(results)
    if n_images > 0:
        plt.figure(figsize=(15, 5 * n_images))

        for i, (img_path, caption, image) in enumerate(results):
            plt.subplot(n_images, 2, 2*i + 1)
            plt.imshow(image)
            plt.title(f"Image: {os.path.basename(img_path)}")
            plt.axis("off")

            plt.subplot(n_images, 2, 2*i + 2)
            plt.text(0.1, 0.5, f"Caption: {caption}", fontsize=12, wrap=True)
            plt.axis("off")

        plt.tight_layout()
        plt.show()

    return results

# Example usage (uncomment to run):
# image_directory = "path/to/your/images"
# results = generate_captions_for_directory(image_directory, limit=3)

## 15. Conclusion

In this notebook, we've demonstrated how to:

1. Load the QVision MultiModel directly from Hugging Face Hub
2. Initialize all model components
3. Generate captions for individual images
4. Process multiple images in a directory

The model uses a Q-Former architecture that efficiently connects CLIP's visual representations with GPT-Neo's text generation capabilities, making it suitable for various vision-language tasks beyond just captioning.