# **llava-1.5-7b-hf**

**Before each run, use "Restart & Clear Cell Outputs" to free GPU memory.**

**Dependencies**

In [None]:
!pip install -q -U transformers==4.37.2 bitsandbytes==0.41.3 accelerate==0.25.0

## Load llava-1.5-7b-hf

In [None]:
import torch
from transformers import BitsAndBytesConfig, pipeline

# Checkpoint to load, and the 4-bit quantization settings (float16 compute)
# that are forwarded to the model loader through model_kwargs.
model_id = "llava-hf/llava-1.5-7b-hf"

quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Image-to-text pipeline used by the captioning cell below.
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs=dict(quantization_config=quantization_config),
)


## Caption
**You can set the image path, whether to overwrite existing txt files, and even the prompt.**

In [None]:
import os
import re
from PIL import Image

# Parameters
# User-editable settings read by generate_caption() and the processing loop below.
images_dir = 'set_image_path'  # Change this to your image directory; caption .txt files are written next to the images
my_prompt = "Describe the image in detail and concisely, but without leaving anything out."  # Inserted into the USER turn of the chat prompt
overwrite_existing = True  # True: regenerate and overwrite existing .txt files; False: skip images that already have one
print_captions = True  # Set to True to print captions (and skip/overwrite notices) to console
debug_mode = False  # Set to True to print the raw model output and intermediate caption text

# Modify caption to remove some prefix. Credit: ProGamerGov
# Boilerplate opening phrases paired with the text that replaces them.
# Built once at definition time instead of on every call.
_CAPTION_VERBS = ('showcases ', 'portrays ', 'appears to be ', 'is ', 'depicts ', 'features ')
_CAPTION_PREFIXES = (
    [("The image " + verb, '') for verb in _CAPTION_VERBS]
    + [("This image " + verb, '') for verb in _CAPTION_VERBS]
    + [("In this " + noun, '') for noun in ("picture, ", "depiction, ", "piece, ", "image, ", "scene, ")]
    + [
        ('In this artwork, ', 'Artwork of '),
        ('In this illustration, ', 'Illustration of '),
        ('In this art piece, ', 'Art of '),
    ]
)
# One capture group per phrase, so the alternative that matched is identified
# by its position (match.lastindex) rather than by the matched text. The
# pattern is case-insensitive, so the matched text may not equal the phrase
# as spelled above and cannot be used as a lookup key.
_CAPTION_PREFIX_RE = re.compile(
    '|'.join(f'({re.escape(opening)})' for opening, _ in _CAPTION_PREFIXES),
    flags=re.IGNORECASE,
)


def modify_caption(caption: str) -> str:
    """Strip a boilerplate opening phrase from a caption.

    The first occurrence (case-insensitive) of a phrase such as
    "The image features " or "In this scene, " is removed, or rewritten
    for the art-specific phrases (e.g. "In this artwork, " becomes
    "Artwork of "). The first character of the result is then upper-cased;
    the rest of the text keeps its original casing so proper nouns and
    acronyms survive.

    Args:
        caption: Caption text produced by the model.

    Returns:
        The caption with at most one opening phrase replaced and its
        first character upper-cased. An empty caption is returned as-is.
    """
    def replace_fn(match):
        # lastindex is the 1-based number of the capture group that matched.
        return _CAPTION_PREFIXES[match.lastindex - 1][1]

    cleaned = _CAPTION_PREFIX_RE.sub(replace_fn, caption, count=1)
    # Upper-case only the first character; str.capitalize() would also
    # lower-case everything after it.
    return cleaned[:1].upper() + cleaned[1:]


# Function to generate caption for a single image
def generate_caption(image, model, max_new_tokens=200):
    """Run the captioning pipeline on one image and return the cleaned caption.

    Builds a USER/ASSISTANT chat prompt around the module-level ``my_prompt``,
    calls ``model`` on the image, keeps only the text after the last
    "ASSISTANT:" marker, and passes it through ``modify_caption``. When the
    module-level ``debug_mode`` is true, each intermediate string is printed.

    Args:
        image: Image handed to ``model`` as its first argument.
        model: Callable pipeline; its result is indexed as
            ``result[0]['generated_text']``.
        max_new_tokens: Generation limit forwarded via ``generate_kwargs``.

    Returns:
        The caption string after prefix clean-up.
    """
    chat_prompt = f"USER: <image>\n{my_prompt}\nASSISTANT:"
    generation_options = {"max_new_tokens": max_new_tokens}
    result = model(image, prompt=chat_prompt, generate_kwargs=generation_options)
    raw_text = result[0]['generated_text']

    if debug_mode:
        print(f"Full model output: {raw_text}")

    # Everything after the last "ASSISTANT:" marker (the whole text if absent).
    reply = raw_text.rpartition("ASSISTANT:")[2].strip()

    if debug_mode:
        print(f"Extracted assistant response: {reply}")

    # Drop any leftover line-leading "ASSISTANT:" label (any casing).
    leftover_label = re.compile(r'(?:^|\n)\s*ASSISTANT:\s*', re.IGNORECASE)
    caption = modify_caption(leftover_label.sub('', reply).strip())

    if debug_mode:
        print(f"Final caption text after modification: {caption}")

    return caption


# Process images
# Caption every .png/.jpg/.jpeg file in images_dir and write the caption to a
# .txt file with the same base name in the same directory.
# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
for image_filename in sorted(os.listdir(images_dir)):
    if not image_filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    image_path = os.path.join(images_dir, image_filename)
    txt_filename = os.path.splitext(image_filename)[0] + '.txt'
    txt_path = os.path.join(images_dir, txt_filename)

    if os.path.exists(txt_path):
        if not overwrite_existing:
            if print_captions:
                print(f"Skipping {image_filename} as caption file already exists.")
            continue
        if print_captions:
            print(f"Overwriting existing caption for {image_filename}.")

    # The context manager closes the image's file handle once captioning is
    # done, so handles do not accumulate over a large directory.
    with Image.open(image_path) as image:
        caption = generate_caption(image, pipe)

    # Explicit UTF-8 so captions with non-ASCII characters are written the
    # same way on every platform instead of using the locale default.
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(caption)

    if print_captions:
        print(f"Caption for {image_filename}: {caption}")

    print(f"Caption saved to {txt_filename}")

print("Processing complete.")
