# **Florence-2-base-PromptGen-v1.5**

**Dependencies**

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel
%pip install -q -U git+https://github.com/huggingface/transformers bitsandbytes accelerate flash_attn einops

## Load Florence-2-base-PromptGen-v1.5
Run this cell only once per session; the model and processor it loads are reused by the cells below.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# Choose the compute device first: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hugging Face repository that hosts the checkpoint
model_id = "MiaoshouAI/Florence-2-base-PromptGen-v1.5"

# Fetch the processor and the model from that repository
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# Move the model onto the chosen device (as the last expression, the cell also displays it)
model.to(device)

## Caption
**You can set the image path, whether to overwrite existing txt files, and even the prompt.**

In [None]:
import os
import shutil
from PIL import Image

# Parameters

# Captioning options
my_prompt = "Describe the image in detail and concisely, but without leaving anything out."
# Alternative prompt (a task token instead of free text): "<MORE_DETAILED_CAPTION>"
overwrite_existing = True  # True: regenerate captions even when a .txt file already exists
print_captions = True  # True: echo each caption to the console
debug_mode = False  # True: also print the raw and parsed model output

# Folder holding the original images (point this at your own dataset)
source_path = '/kaggle/input/x'

# Writable folder that receives the image copies
images_dir = '/kaggle/working/x'

# Make sure the working folder is there before copying into it
os.makedirs(images_dir, exist_ok=True)

# Bring every regular .png/.jpg/.jpeg file from source_path over to images_dir
for entry in os.listdir(source_path):
    candidate = os.path.join(source_path, entry)
    has_image_suffix = entry.lower().endswith(('.png', '.jpg', '.jpeg'))
    if has_image_suffix and os.path.isfile(candidate):
        shutil.copy(candidate, images_dir)

# Function to generate caption for a single image
def generate_caption(image, model, processor, prompt, max_new_tokens=1024, device=None, debug=None):
    """Generate a caption for a single image.

    Args:
        image: Image object; it is converted to RGB before being processed.
        model: Model whose ``generate`` method produces the token ids.
        processor: Processor used to encode the inputs, decode the output and
            post-process the decoded text.
        prompt: Text passed to the processor; it is also used as the ``task``
            for post-processing and as the key looked up in the parsed answer.
        max_new_tokens: Upper bound on the number of generated tokens.
        device: Device the encoded inputs are moved to. When omitted, the
            device the model reports (or that of its first parameter) is used,
            so the function no longer needs a notebook-level ``device``.
        debug: When true, print the raw and parsed model output. When omitted,
            a notebook-level ``debug_mode`` flag is honoured if one exists;
            otherwise nothing is printed.

    Returns:
        The value stored under ``prompt`` in the parsed answer, or ``""`` when
        there is no such entry.
    """
    # Resolve the optional settings without requiring globals to be defined
    if device is None:
        device = getattr(model, "device", None)
        if device is None:
            device = next(model.parameters()).device
    if debug is None:
        debug = globals().get("debug_mode", False)

    # Convert image to RGB to ensure compatibility
    image = image.convert("RGB")

    # Process the image and prompt
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=3
    )
    # Special tokens are kept here; the post-processing step receives the raw decoded text
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))

    if debug:
        print(f"Full model output: {generated_text}")
        print(f"Parsed answer: {parsed_answer}")

    # Extract the caption text from the dictionary
    caption = parsed_answer.get(prompt, "")

    return caption

# Process images
# sorted() gives a stable processing order; os.listdir returns entries in arbitrary order
for image_filename in sorted(os.listdir(images_dir)):
    # Only image files are captioned; existing .txt files and anything else are ignored
    if not image_filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    image_path = os.path.join(images_dir, image_filename)
    # The caption is stored next to the image under the same base name
    txt_filename = os.path.splitext(image_filename)[0] + '.txt'
    txt_path = os.path.join(images_dir, txt_filename)

    if os.path.exists(txt_path):
        if not overwrite_existing:
            if print_captions:
                print(f"Skipping {image_filename} as caption file already exists.")
            continue
        if print_captions:
            print(f"Overwriting existing caption for {image_filename}.")

    # The context manager closes the image file once the caption has been generated,
    # so file handles do not pile up over a large directory
    with Image.open(image_path) as image:
        caption = generate_caption(image, model, processor, my_prompt)

    # Explicit UTF-8 so captions containing non-ASCII characters are written
    # the same way regardless of the platform's default encoding
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(caption)

    if print_captions:
        print(f"Caption for {image_filename}: {caption}")

    print(f"Caption saved to {txt_filename}")

print("Processing complete.")

## Zip the output

In [None]:
# Archive the directory configured as images_dir in the caption cell instead of repeating its path
!zip -r x.zip "{images_dir}"