In [1]:
%pip install transformers==4.37.2


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install torch torchvision Pillow peft accelerate sentencepiece timm flash_attn einops tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
import os
import base64
import pandas as pd
import tqdm


#### 2. Constants for ImageNet Mean and Standard Deviation
These constants are used to normalize images according to the ImageNet dataset statistics.



In [3]:
# Mean and standard deviation triples (ImageNet statistics) used by
# build_transform to normalize image tensors.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

#### 3. Building the Transform Function
This function builds a transform that converts images to RGB, resizes them, converts them to tensors, and normalizes them.

In [4]:
def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    The returned transform converts the image to RGB when it is in another
    mode, resizes it to ``input_size`` x ``input_size`` with bicubic
    interpolation, converts it to a tensor and normalizes it with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Side length of the square output image.

    Returns:
        A ``torchvision.transforms.Compose`` instance.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)


#### 4. Finding the Closest Aspect Ratio
This function finds the target ratio that is closest to a given aspect ratio from a set of candidate ratios.

In [5]:
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the candidate ratio whose value is closest to ``aspect_ratio``.

    Args:
        aspect_ratio: Aspect ratio to match, compared against ``r[0] / r[1]``.
        target_ratios: Iterable of candidates, each indexable as ``r[0]``, ``r[1]``.
        width: Width used (together with ``height``) to compute the area for tie-breaking.
        height: Height used to compute the area for tie-breaking.
        image_size: Side length of one square tile.

    Returns:
        The best matching element of ``target_ratios``; ``(1, 1)`` when
        ``target_ratios`` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            # Strictly better match: take it.
            smallest_gap, chosen = gap, candidate
            continue
        # On an exact tie, switch to the later candidate only when the image
        # area exceeds half of the area that candidate's tile grid would cover.
        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen


In [6]:
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    """Split an image into square tiles laid out on the best-fitting grid.

    Every ``(columns, rows)`` grid whose tile count lies between ``min_num``
    and ``max_num`` is a candidate. The grid closest to the image's aspect
    ratio is chosen via ``find_closest_aspect_ratio``, the image is resized
    to exactly cover that grid, and the tiles are cropped in row-major order.

    Args:
        image: Image object exposing ``size``, ``resize`` and ``crop``.
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        image_size: Side length of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to ``image_size`` x ``image_size``.

    Returns:
        List of tile images, optionally followed by the thumbnail.
    """
    src_width, src_height = image.size
    src_ratio = src_width / src_height

    # Enumerate the candidate grids, ordered by their total tile count.
    grids = set()
    for n in range(min_num, max_num + 1):
        for cols in range(1, n + 1):
            for rows in range(1, n + 1):
                if min_num <= cols * rows <= max_num:
                    grids.add((cols, rows))
    grids = sorted(grids, key=lambda grid: grid[0] * grid[1])

    # Choose the grid whose shape best matches the source image.
    best_grid = find_closest_aspect_ratio(
        src_ratio, grids, src_width, src_height, image_size)

    # Pixel dimensions of the resized image and the number of tiles it holds.
    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    tile_count = best_grid[0] * best_grid[1]

    canvas = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    tiles = []
    for idx in range(tile_count):
        # Row-major position of this tile inside the grid.
        row, col = divmod(idx, tiles_per_row)
        crop_box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(canvas.crop(crop_box))
    assert len(tiles) == tile_count

    # A single tile already shows the whole image, so no thumbnail is added then.
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles


#### 6. Loading and Transforming the Image

The `load_image` function loads an image from a file, preprocesses it, and applies the transformations to it.



In [7]:
# Hyperparameters
# dtype passed as torch_dtype when loading the model and used to cast the image tensors.
parameter_size = torch.float16
# Prompt text handed to model.batch_chat for the images in each batch.
questions = "Given the image, analyze and provide the following details in a continuous text format if the details are not visible in the picture, please leave them out of the description:  1.General description: Provide a general description of the image.  2. Driving side: Determine if vehicles are driving on the left or right side of the road. Note the position of any visible traffic signs. 3. Road line color: Identify the color of the center and edge lines on the road. 4. Sign color: Describe the color and shape of any visible traffic signs. 5. Road condition: Describe the condition of the road surface. Note any signs of wear, damage, or maintenance. 6. Biome: Describe the biome visible in the image (e.g., desert, forest, grassland, urban). 7. Languages: Look at the picture, are there letters, and if so, which language do they come from? 7a. If you see letters, which alphabet do they come from and can you say what they say? Return the results in the following text format: \"The image shows a scene [general description of the image]. Vehicles are driving on the [left/right] side of the road. The road has [yellow/white/green, solid/dashed/no lines] center lines and [yellow/white/green, solid/dashed/no lines] edge lines. Visible traffic signs are [color and shape, distinctive features]. License plates are [present/absent], characterized by their [color], [shape], and positioned at the [front/back] of vehicles. The road surface appears [describe condition], indicating [describe maintenance or wear]. The biome visible in the image is [describe biome]. There  are letters from [language]. The letters are from [Latin, Greek, Cyrillic, Chinese, Japanese, Korean, Devanagari, Arabic, Hebrew, Bengali] and it says [].\""

In [8]:
def load_image(image_file, input_size=448, max_num=6):
    """Load an image file and turn it into a stacked tensor of tiles.

    The image is opened and converted to RGB, split into tiles by
    ``dynamic_preprocess`` (with the thumbnail enabled), and each tile is
    passed through the transform returned by ``build_transform``.

    Args:
        image_file: Path or file object accepted by ``Image.open``.
        input_size: Side length of each square tile.
        max_num: Maximum number of tiles forwarded to ``dynamic_preprocess``.

    Returns:
        A tensor made by stacking the transformed tiles along a new first dimension.
    """
    rgb_image = Image.open(image_file).convert('RGB')
    tile_transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(
        rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack(list(map(tile_transform, tiles)))


In [9]:
# Identifier of the pretrained checkpoint handed to from_pretrained.
path = "OpenGVLab/Mini-InternVL-Chat-4B-V1-5"

# Load the model in the configured dtype.
# NOTE: trust_remote_code=True lets transformers run model code shipped with the checkpoint.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=parameter_size,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
# Switch to evaluation mode and move the model to the GPU.
model = model.eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### 8. Loading Image and Running Inference
This section loads the images, preprocesses them, and queries the model in batches to generate responses.

In [15]:
# Generation config: single beam, no sampling, at most 256 newly generated tokens.
generation_config = dict(
    num_beams=1,
    max_new_tokens=256,
    do_sample=False,
)

# Folder containing the input images and folder receiving the CSV results.
folder_path = '00'
result_folder = 'results'
# os.listdir returns entries in arbitrary order; sort the names so that
# start_index / end_index select the same images on every run.
image_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.jpg'))

# hyperparameters
batch_size = 2                 # number of images sent to the model per call
num_images = len(image_files)
counter = 0                    # batches processed since the last CSV was written
responses_to_save = []         # records not yet written to disk
num_files = 0                  # running index used in the CSV file names
# Indices for start and end (end is exclusive) into image_files
start_index = 0
end_index = 120

In [11]:
# Ask PyTorch to release cached GPU memory that is currently unused.
torch.cuda.empty_cache()

In [16]:
# Inference loop: batch inference with a single image per sample.

# Create the output folder up front so the first to_csv call cannot fail on a missing directory.
os.makedirs(result_folder, exist_ok=True)

for i in range(start_index, min(end_index, num_images), batch_size):
    # File names belonging to the current batch (the last batch may be shorter).
    current_batch_files = image_files[i:i + batch_size]

    # Load every image of the batch as a tensor of tiles on the GPU.
    batch_pixel_values = []
    for image_file in current_batch_files:
        image_path = os.path.join(folder_path, image_file)
        pixel_values = load_image(image_path, max_num=6).to(parameter_size).cuda()
        batch_pixel_values.append(pixel_values)

    # batch_chat receives all tiles concatenated plus the number of tiles per image.
    image_counts = [x.size(0) for x in batch_pixel_values]
    pixel_values = torch.cat(batch_pixel_values, dim=0)

    # batch_chat takes one question per sample; a bare string would be indexed
    # character by character, so repeat the prompt once per image.
    if isinstance(questions, str):
        batch_questions = [questions] * len(image_counts)
    else:
        batch_questions = questions

    # Prompt the model.
    responses = model.batch_chat(tokenizer, pixel_values,
                                 image_counts=image_counts,
                                 questions=batch_questions,
                                 generation_config=generation_config)

    # Add one record per image. extend() stores the dicts themselves, whereas
    # append() with a generator expression would store the generator object.
    responses_to_save.extend(
        {'image_file': img_file, 'response': resp}
        for img_file, resp in zip(current_batch_files, responses)
    )
    counter += 1

    # Every 50 batches, flush the collected answers to a numbered CSV file.
    if counter >= 50:
        df = pd.DataFrame(responses_to_save)
        df.to_csv(f'{result_folder}/{folder_path}_image_descriptions{num_files}.csv', index=False)
        # Start a fresh chunk.
        responses_to_save = []
        counter = 0
        num_files += 1

# Save whatever is left over from the last, incomplete chunk.
# `i` is defined here because a non-empty list implies the loop ran at least once
# (or records were carried over from an earlier run of this cell).
if responses_to_save:
    df = pd.DataFrame(responses_to_save)
    df.to_csv(f'{result_folder}/{folder_path}_image_descriptions{num_files} {i}.csv', index=False)

# Report the (exclusive) index up to which images were processed.
last_processed_index = min(end_index, num_images)
print("Last processed index:", last_processed_index)

Last processed index: 120
