**ONLY RUN IN COLAB WITH GPU ENABLED**

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import requests

In [None]:
# Install required packages
!pip install auto-gptq transformers accelerate pillow optimum qwen-vl-utils



Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting optimum
  Downloading optimum-1.23.3-py3-none-any.whl.metadata (20 kB)
Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.8-py3-none-any.whl.metadata (3.6 kB)
Collecting datasets (from auto-gptq)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.2.1-py3-none-any.whl.metadata (3.0 kB)
Collecting coloredlogs (from optimum)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->optimum)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (f

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
from qwen_vl_utils import process_vision_info

# Pixel-count bounds handed to the processor (each expressed as a multiple of 28 * 28).
min_pixels = 28 * 28 * 256
max_pixels = 28 * 28 * 1280

# Instantiate the vision-language model; dtype and device placement are
# both left to the library ("auto").
model_options = {"torch_dtype": "auto", "device_map": "auto"}
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", **model_options
)

# Instantiate the matching processor from the same checkpoint with the bounds above.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [None]:
# Load the image. Copying inside the `with` block reads the pixel data and
# lets the underlying file handle be closed right away.
image_path = '/content/bio_image.png'
with Image.open(image_path) as image_file:
    image = image_file.copy()

prompt = """
Generate a detailed paragraph description of this scientific figure that:
1. Starts with the complete figure number and reference
2. Flows naturally while incorporating these key elements:
   - The type of visualization and technical specifications (e.g., microscopy type, magnification, scale)
   - Description of what is being shown and how it's organized in the image
   - Any staining, coloring, or visual techniques used
   - Quantitative measurements or comparisons present
   - Key features or differences being demonstrated
   - The biological/chemical concept being illustrated
   - Educational purpose of the figure
3. Ends with any figure credits or attributions

Write as a single cohesive paragraph that naturally integrates the caption information with visual details. Focus on details that would help distinguish this figure from similar ones in a textbook. Avoid bullet points or sectioned formatting. Use natural transitions and maintain a scientific tone.
"""

# Build a single-turn chat message holding the image followed by the text prompt
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image,  # Pass the PIL Image directly
            },
            {"type": "text", "text": prompt},
        ],
    }
]

# Render the chat template to text and collect the vision inputs
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt"
)
# The model was placed with device_map="auto", so follow the model's device
# instead of hard-coding "cuda" (same approach as process_single_image).
inputs = inputs.to(model.device)

# Generate the response. `early_stopping` is not passed: it only applies to
# beam-based generation, which is not requested here.
generated_ids = model.generate(**inputs, max_new_tokens=512)

# Drop the prompt tokens so only the newly generated tokens are decoded
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)[0]

print(response)

Figure 4.4, titled "These uterine cervix cells, viewed through a light microscope, are from a Pap smear. Normal cells are on the left. The cells on the right are infected with human papillomavirus (HPV). Notice that the infected cells are larger. Also, two of these cells each have two nuclei instead of one, the normal number." This figure is a light microscopic visualization of uterine cervix cells from a Pap smear. The cells are stained blue, indicating the presence of a specific stain or dye. The cells are organized in a two-column format, with the left column representing normal cells and the right column representing infected cells. The infected cells are larger and have two nuclei instead of the normal number of one. The scale bar at the bottom of the image indicates that each unit represents 10 micrometers. The figure is a modification of work by Ed Uthman, MD, and the scale bar data is from Matt Russell. This figure serves as an educational tool to illustrate the differences bet

In [None]:
import os
import json
import time
from pathlib import Path
from datetime import datetime
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
from qwen_vl_utils import process_vision_info
import torch

def format_time(seconds):
    """Render a duration given in seconds as an '<H>h <M>m <S>s' string.

    Each component is truncated to an integer, so fractional seconds are dropped.
    """
    whole_hours, within_hour = divmod(seconds, 3600)
    whole_minutes = within_hour // 60
    leftover_seconds = seconds % 60
    parts = (int(whole_hours), int(whole_minutes), int(leftover_seconds))
    return "{}h {}m {}s".format(*parts)

def process_single_image(image_path, model, processor, prompt):
    """Generate a text description for one image.

    Args:
        image_path: Path of the image file to describe.
        model: Loaded model exposing `.device` and `.generate(...)`.
        processor: Matching processor; used for `apply_chat_template`, for
            building the model inputs, and for `batch_decode`.
        prompt: Instruction text sent to the model together with the image.

    Returns:
        The generated description with surrounding whitespace stripped, or
        None if any step fails (the error is printed, not raised, so a batch
        run can continue with the next image).
    """
    try:
        # Copy the pixels out inside the `with` block so the file handle is
        # released immediately, even if a later step raises.
        with Image.open(image_path) as image_file:
            image = image_file.copy()

        # Single-turn chat message: the image followed by the text prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Prepare inputs for inference
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        if image_inputs is None:
            raise ValueError("process_vision_info returned None for image inputs")

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )
        inputs = inputs.to(model.device)

        # `early_stopping` is intentionally not passed: it only applies to
        # beam-based generation, which is not requested here.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512
        )

        # Strip the prompt tokens so only newly generated tokens are decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        response = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        return response.strip()

    except Exception as e:
        # Deliberate best effort: report the failure and let the caller move on.
        print(f"Error processing {image_path}: {e}")
        return None

# Instruction sent to the model together with every figure.
_FIGURE_DESCRIPTION_PROMPT = """Generate a detailed paragraph description of this scientific figure that:
1. Starts with the complete figure number and reference
2. Flows naturally while incorporating these key elements:
   - The type of visualization and technical specifications (e.g., microscopy type, magnification, scale)
   - Description of what is being shown and how it's organized in the image
   - Any staining, coloring, or visual techniques used
   - Quantitative measurements or comparisons present
   - Key features or differences being demonstrated
   - The biological/chemical concept being illustrated
   - Educational purpose of the figure
3. Ends with any figure credits or attributions

Write as a single cohesive paragraph that naturally integrates the caption information with visual details. Focus on details that would help distinguish this figure from similar ones in a textbook. Avoid bullet points or sectioned formatting. Use natural transitions and maintain a scientific tone."""

# File suffixes (lower-case) that are treated as images.
_IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp'}


def _find_image_files(input_folder):
    """Return the image files directly inside input_folder, sorted by path."""
    # Sorting makes the processing order (and a max_images cut-off) repeatable.
    return sorted(
        f for f in Path(input_folder).glob('*')
        if f.suffix.lower() in _IMAGE_EXTENSIONS
    )


def _load_processed_paths(output_jsonl):
    """Return the set of 'img_path' values already recorded in output_jsonl."""
    processed = set()
    if not os.path.exists(output_jsonl):
        return processed
    with open(output_jsonl, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                processed.add(json.loads(line)['img_path'])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Skip blank, truncated or unrelated lines instead of
                # aborting the resume; anything else is a real error.
                continue
    return processed


def _load_model_and_processor():
    """Load the Qwen2-VL model and its processor, returned as (model, processor)."""
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct",
        torch_dtype="auto",
        device_map="auto"
    )

    min_pixels = 256 * 28 * 28
    max_pixels = 1280 * 28 * 28

    processor = AutoProcessor.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct",
        min_pixels=min_pixels,
        max_pixels=max_pixels
    )
    return model, processor


def process_images_in_folder(input_folder, output_jsonl, batch_size=1, max_images=None):
    """
    Process images in a folder and save descriptions to a JSONL file

    Each successfully described image is appended to output_jsonl as one JSON
    object with the keys 'img_path' and 'description'. Paths already present
    in that file are skipped, so an interrupted run can be resumed.

    Args:
        input_folder: Folder whose direct children are scanned for images.
        output_jsonl: Path of the JSONL results file (appended to).
        batch_size: Number of images handled between progress reports and
            CUDA cache clears. Images are still processed one at a time.
        max_images: Optional cap on the number of recorded results, counting
            entries already present in output_jsonl; None means no cap.

    Returns:
        None. Progress and a final summary are printed.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print("\n=== Starting Image Processing Pipeline ===")
    start_time = time.time()

    # Get list of image files
    print("\nScanning for images...")
    image_files = _find_image_files(input_folder)
    total_images = len(image_files)
    print(f"Found {total_images} images to process")

    if total_images == 0:
        print(f"\nNo images found in {input_folder}")
        return

    # Check for existing progress
    existing_processed = _load_processed_paths(output_jsonl)
    remaining_images = [f for f in image_files if str(f) not in existing_processed]

    # Apply max_images limit if specified
    if max_images is not None:
        remaining_to_process = max_images - len(existing_processed)
        if remaining_to_process <= 0:
            print(f"\nAlready processed {len(existing_processed)} images, which meets or exceeds the requested max_images={max_images}")
            return
        remaining_images = remaining_images[:remaining_to_process]
        print(f"Will process {len(remaining_images)} more images to reach max_images={max_images}")

    print(f"Found {len(existing_processed)} previously processed images")
    print(f"{len(remaining_images)} images remaining to process")

    if not remaining_images:
        print("\nAll images have already been processed!")
        return

    # Load the model only now that there is known work to do.
    print("\nInitializing model and processor...")
    model, processor = _load_model_and_processor()
    print("Model initialization complete!")

    prompt = _FIGURE_DESCRIPTION_PROMPT

    processed_count = len(existing_processed)  # overall count, incl. earlier runs
    session_processed = 0                      # successes in this run only
    errors_count = 0
    images_to_attempt = len(remaining_images)
    total_batches = (images_to_attempt + batch_size - 1) // batch_size

    print("\n=== Starting Processing ===")
    print(f"Processing in batches of {batch_size} images")

    # Timing for averages starts here so model loading does not skew them.
    processing_start = time.time()

    # Process images in batches
    for batch_number, offset in enumerate(range(0, images_to_attempt, batch_size), start=1):
        batch_files = remaining_images[offset:offset + batch_size]
        batch_start_time = time.time()

        print(f"\nProcessing batch {batch_number}/{total_batches}")
        print(f"Batch contains {len(batch_files)} images")

        # Process each image in the batch
        for img_path in batch_files:
            print(f"\nProcessing {img_path.name}")
            response = process_single_image(img_path, model, processor, prompt)

            if response is not None:
                # Save successful result
                result = {
                    'img_path': str(img_path),
                    'description': response
                }

                # Append immediately so progress survives an interruption.
                with open(output_jsonl, 'a', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False)
                    f.write('\n')

                processed_count += 1
                session_processed += 1
                print(f"Successfully processed {img_path.name}")
            else:
                errors_count += 1

        # Calculate and display progress statistics. Averages are based on
        # the images attempted in this run, not on results from earlier runs.
        now = time.time()
        batch_time = now - batch_start_time
        elapsed_time = now - start_time
        attempted = session_processed + errors_count

        if attempted > 0:
            avg_time_per_image = (now - processing_start) / attempted
            estimated_remaining_time = (images_to_attempt - attempted) * avg_time_per_image

            print(f"\nBatch Progress Update:")
            print(f"├── Time for this batch: {format_time(batch_time)}")
            print(f"├── Average time per image: {format_time(avg_time_per_image)}")
            print(f"├── Total progress: {processed_count}/{total_images} ({(processed_count/total_images*100):.1f}%)")
            print(f"├── Total elapsed time: {format_time(elapsed_time)}")
            print(f"└── Estimated time remaining: {format_time(estimated_remaining_time)}")

        # Clear CUDA cache after each batch if using GPU
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Print final summary
    end_time = time.time()
    total_time = end_time - start_time
    attempted = session_processed + errors_count
    print("\n=== Processing Complete ===")
    print(f"Total time: {format_time(total_time)}")
    print(f"Images processed: {processed_count}/{total_images}")
    print(f"Errors encountered: {errors_count}")
    if session_processed > 0:
        print(f"Average time per image: {format_time((end_time - processing_start)/attempted)}")
        print(f"Results saved to: {output_jsonl}")
    else:
        print("Average time per image: N/A - no images were processed")
        print("No results were saved as no images were successfully processed")

if __name__ == "__main__":
    # Placeholder settings used when this cell is executed as the main module.
    output_jsonl = "image_descriptions.jsonl"
    input_folder = "path/to/your/images"

    # Make sure an (empty) results file is present before any processing starts.
    if not os.path.exists(output_jsonl):
        with open(output_jsonl, 'w'):
            pass

In [None]:
  # Example usage
input_folder = "/content/Images"
output_jsonl = "image_descriptions.jsonl"

# Start from an empty results file when none exists yet.
if not os.path.exists(output_jsonl):
    with open(output_jsonl, 'w'):
        pass

# Describe the images in the folder and append the results to the JSONL file.
# Images are processed one at a time; batch_size sets how many are grouped
# between progress reports and CUDA cache clears.
# max_images=None places no cap on the number of images.
process_images_in_folder(
    input_folder,
    output_jsonl,
    batch_size=5,
    max_images=None,
)


=== Starting Image Processing Pipeline ===

Initializing model and processor...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model initialization complete!

Scanning for images...
Found 177 images to process
Found 10 previously processed images
167 images remaining to process

=== Starting Processing ===
Processing in batches of 5 images

Processing batch 1/34
Batch contains 5 images

Processing chem_18.50.png




Successfully processed chem_18.50.png

Processing bio4.7.png
Successfully processed bio4.7.png

Processing chem_7.18.png
Successfully processed chem_7.18.png

Processing chem_7.11.png
Successfully processed chem_7.11.png

Processing chem_6.19.png
Successfully processed chem_6.19.png

Batch Progress Update:
├── Time for this batch: 0h 2m 2s
├── Average time per image: 0h 0m 8s
├── Total progress: 15/177 (8.5%)
├── Total elapsed time: 0h 2m 11s
└── Estimated time remaining: 0h 23m 41s

Processing batch 2/34
Batch contains 5 images

Processing chem_7.28.png
Successfully processed chem_7.28.png

Processing chem_7.24.png
Successfully processed chem_7.24.png

Processing chem_6.25.png
Successfully processed chem_6.25.png

Processing chem_6.22.png
Successfully processed chem_6.22.png

Processing chem_7.13.png
Successfully processed chem_7.13.png

Batch Progress Update:
├── Time for this batch: 0h 1m 59s
├── Average time per image: 0h 0m 12s
├── Total progress: 20/177 (11.3%)
├── Total elapsed 

In [None]:
from PIL import Image
# Sanity check: confirm that the first entry in the input folder opens as an image.
# `next(..., None)` avoids an IndexError when the folder is empty or missing.
test_path = next(Path(input_folder).glob('*'), None)
if test_path is None:
    print(f"No files found in {input_folder}")
else:
    try:
        # The context manager closes the file handle once the details are printed.
        with Image.open(test_path) as img:
            print(f"Successfully loaded test image: {test_path}")
            print(f"Image size: {img.size}")
            print(f"Image mode: {img.mode}")
    except Exception as e:
        print(f"Error loading test image: {e}")

Successfully loaded test image: /content/Images/bio4.5.png
Image size: (1480, 920)
Image mode: RGBA
