In [27]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Three-channel mean and standard deviation handed to T.Normalize by
# build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    Args:
        input_size: Side length, in pixels, of the square each image is
            resized to.

    Returns:
        A ``T.Compose`` that converts non-RGB images to RGB, resizes to
        ``(input_size, input_size)`` with bicubic interpolation, converts
        the image to a tensor and normalizes it with ``IMAGENET_MEAN`` /
        ``IMAGENET_STD``.
    """
    def _ensure_rgb(img):
        # Images that are already RGB are passed through unchanged.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the candidate grid whose aspect ratio is nearest to the image's.

    Args:
        aspect_ratio: Width divided by height of the source image.
        target_ratios: Iterable of indexable pairs; element 0 divided by
            element 1 is the candidate's aspect ratio.
        width: Source image width.
        height: Source image height.
        image_size: Side length of one tile.

    Returns:
        The winning entry of ``target_ratios``, or ``(1, 1)`` when there are
        no candidates. When a later candidate ties the current best, it
        replaces it only if the source area exceeds half the area that
        candidate's grid would cover.
    """
    pixel_count = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap = gap
            chosen = candidate
        elif gap == smallest_gap and pixel_count > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            # Tie: prefer this grid only when the image is large enough for it.
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image onto a tile grid and cut it into square tiles.

    Args:
        image: Object exposing ``size`` (width, height), ``resize`` and
            ``crop`` (a PIL image in this notebook).
        min_num: Minimum number of tiles a candidate grid may have.
        max_num: Maximum number of tiles a candidate grid may have.
        image_size: Side length, in pixels, of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to one tile.

    Returns:
        List of tiles in row-major order, optionally followed by the
        thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Every grid whose tile count lies in [min_num, max_num], ordered by
    # tile count.
    candidate_grids = sorted(
        set(
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if min_num <= i * j <= max_num
        ),
        key=lambda grid: grid[0] * grid[1],
    )

    # Choose the grid that best matches the source aspect ratio.
    grid = find_closest_aspect_ratio(src_ratio, candidate_grids, src_w, src_h, image_size)

    # Pixel dimensions of the full grid and the number of tiles in it.
    target_width = image_size * grid[0]
    target_height = image_size * grid[1]
    blocks = grid[0] * grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    tiles = []
    for idx in range(blocks):
        row, col = divmod(idx, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(resized_img.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles

def load_image(image_file, input_size=448, max_num=12):
    """Load an image and turn it into a stacked tensor of preprocessed tiles.

    Args:
        image_file: Path (or file object) accepted by ``Image.open``.
        input_size: Side length, in pixels, of each tile.
        max_num: Maximum number of tiles the image is split into (a thumbnail
            tile is appended when more than one tile is produced).

    Returns:
        A tensor stacking the transformed tiles along a new first dimension.
    """
    # Open through a context manager so a file opened from a path is closed
    # as soon as the RGB copy exists; this function is called once per image
    # in a loop, so handles must not be left to the garbage collector.
    with Image.open(image_file) as source:
        image = source.convert('RGB')

    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(tile) for tile in tiles])


In [28]:
import math
import torch
from transformers import AutoTokenizer, AutoModel

def split_model(model_name):
    """Build a ``device_map`` spreading the language-model layers over GPUs.

    The first GPU also hosts the vision model, projector, embeddings, final
    norm, output heads and the last decoder layer, so it is counted as half
    a GPU when layers are distributed.

    Args:
        model_name: One of the InternVL2 names listed in the table below.

    Returns:
        Dict mapping module names to GPU indices.

    Raises:
        KeyError: If ``model_name`` is not in the table.
        RuntimeError: If no CUDA device is visible.
    """
    print(torch.cuda.is_available())
    world_size = torch.cuda.device_count()
    print(world_size)
    num_layers = {
        'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
        'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
    if world_size < 1:
        # Without this guard the arithmetic below ends in an opaque IndexError.
        raise RuntimeError('split_model requires at least one visible CUDA device')

    # Since the first GPU will be used for ViT, treat it as half a GPU.
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    layers_per_gpu = [per_gpu] * world_size
    layers_per_gpu[0] = math.ceil(per_gpu * 0.5)

    device_map = {}
    layer_cnt = 0
    for gpu_index, layer_budget in enumerate(layers_per_gpu):
        for _ in range(layer_budget):
            # Rounding up can leave more slots than layers; never emit keys
            # for layer indices the model does not have.
            if layer_cnt >= num_layers:
                break
            device_map[f'language_model.model.layers.{layer_cnt}'] = gpu_index
            layer_cnt += 1

    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    # The last decoder layer is pinned to GPU 0 alongside the output modules.
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

In [29]:
# Checkpoint directory and the layer placement computed for this model.
path = "pretrained/InternVL2-2B"
device_map = split_model('InternVL2-2B')

# Load the model with the custom device map, then switch it to eval mode.
model = AutoModel.from_pretrained(
    path,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True, trust_remote_code=True)



True
1


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
# `max_num` caps how many tiles the image is split into.
pixel_values = load_image('./images/3.jpg', input_size=448, max_num=12)
# Match the dtype the model was loaded with and move the tiles to the GPU.
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = {'max_new_tokens': 32, 'do_sample': False}
print(pixel_values.shape)

torch.Size([10, 3, 448, 448])


In [31]:
# Single-image, single-round conversation.
# Two candidate prompts are kept under separate names; only the one assigned
# to `question` is sent to the model (previously the first was assigned to
# `question` and immediately overwritten).
height_question = '<image>\nin the given image there is a product, find the height of the product from the text data available in the image, only use the text data from the image, the answer is written in a straighforward manner and there is no need to do arithmetic on the different numbers present in the image.'
weight_question = '<image>\nin the given image, from the text present in the image output only the numeric weight with units, do not perform any arithmetic operations on the numbers present in the image, just return the final answer, the answer may be a range of values like a-b <unit> so answer accordingly.'
question = weight_question
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

User: <image>
in the given image, from the text present in the image output only the numeric weight with units, do not perform any arithmetic operations on the numbers present in the image, just return the final answer, the answer may be a range of values like a-b <unit> so answer accordingly.
Assistant: The numeric weight with units in the image is 17.21 kg.


In [None]:
# Unit names shared by several entities. Each entity gets its own set copy
# below so the per-entity sets stay independent objects.
_LENGTH_UNITS = ("centimetre", "foot", "millimetre", "metre", "inch", "yard")
_WEIGHT_UNITS = ("milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound")
_VOLTAGE_UNITS = ("millivolt", "kilovolt", "volt")
_WATTAGE_UNITS = ("kilowatt", "watt")
_VOLUME_UNITS = (
    "cubic foot", "microlitre", "cup", "fluid ounce", "centilitre",
    "imperial gallon", "pint", "decilitre", "litre", "millilitre",
    "quart", "cubic inch", "gallon",
)

# Maps each entity name to the set of unit names listed for it.
entity_unit_map = {
    "width": set(_LENGTH_UNITS),
    "depth": set(_LENGTH_UNITS),
    "height": set(_LENGTH_UNITS),
    "item_weight": set(_WEIGHT_UNITS),
    "maximum_weight_recommendation": set(_WEIGHT_UNITS),
    "voltage": set(_VOLTAGE_UNITS),
    "wattage": set(_WATTAGE_UNITS),
    "item_volume": set(_VOLUME_UNITS),
}

In [None]:
import pandas as pd
import os

# Path to the folder where all images are stored
image_folder = "path/to/the/folder"  # Adjust this path based on where the images are stored

# Read the CSV file using pandas
csv_file = 'path/to/the/csvfile.csv'  # Replace with your actual CSV file name
df = pd.read_csv(csv_file)

# Parallel lists: image_paths[i] is queried for query_entities[i]
image_paths = []
query_entities = []

# Read the two needed columns by position with .iloc: column 0 holds the
# image index and column 3 holds the query entity. Iterating the columns
# directly (rather than row Series from iterrows) keeps each column's own
# dtype, so an integer index is never upcast to a float such as "12.0", and
# it avoids the deprecated positional fallback of `row[0]` / `row[3]`.
for image_index, query_entity in zip(df.iloc[:, 0], df.iloc[:, 3]):
    # Construct the image filename using the index (e.g., index.jpg)
    image_name = f"{image_index}.jpg"

    # All images are in a single folder
    image_path = os.path.join(image_folder, image_name)

    if os.path.exists(image_path):
        image_paths.append(image_path)
        query_entities.append(query_entity)
    else:
        print(f"Image {image_name} not found.")

In [None]:
# Keyword settings passed to model.chat for generation.
generation_config = {'max_new_tokens': 32, 'do_sample': False}

In [None]:
# One question template per entity. Every template is the same sentence with
# the entity name filled in, and each keeps the literal text "{prompt_type}"
# as a placeholder for the units.
_PROMPT_TEMPLATE = (
    "<image>\n In the given image, find the {entity} of the product using only the "
    "numerical text present in the image, the units are {{prompt_type}}, output only "
    "the number with it unit, the answer may be a range. "
)

prompts = {
    entity: _PROMPT_TEMPLATE.format(entity=entity)
    for entity in (
        "width",
        "depth",
        "height",
        "item_weight",
        "maximum_weight_recommendation",
        "voltage",
        "wattage",
        "item_volume",
    )
}

In [None]:
# Maps image name (file name without extension) to the model's answer.
responses_dict = {}

for image_path, query_entity in zip(image_paths, query_entities):
    # Strip only the extension, so names that themselves contain dots are
    # not truncated at the first dot.
    image_name = os.path.splitext(os.path.basename(image_path))[0]
    # `max_num` caps how many tiles the image is split into
    pixel_values = load_image(image_path, max_num=12, input_size=448).to(torch.bfloat16).cuda()
    # entity_unit_map stores a set of unit names, but str.replace needs a
    # string; join them, sorted so the prompt is reproducible between runs.
    allowed_units = ', '.join(sorted(entity_unit_map[query_entity]))
    # The templates in `prompts` contain the literal placeholder
    # "{prompt_type}"; substitute the unit list into it.
    question = prompts[query_entity].replace('{prompt_type}', allowed_units)
    # Getting the model's response
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    # Storing the model's response in a dictionary
    responses_dict[image_name] = response