Project page: https://internvl.github.io \
Source code: https://github.com/OpenGVLab/InternVL \
Model card: https://huggingface.co/OpenGVLab/InternVL3-8B

In [1]:
import os
import sys

In [2]:
# Restrict this process to GPU 0. This runs before `torch` is imported (next
# cell), so the restriction is already in place when CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# To expose GPUs 0, 1, 2 and 3 instead, use the following line:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [3]:
import math
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Per-channel normalization statistics (mean, std). `build_transform` passes
# them to `T.Normalize` after the image has been converted to RGB.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

In [5]:
def build_transform(input_size):
    """Build the per-tile preprocessing pipeline.

    The returned transform converts a PIL image to RGB (if it is not already),
    resizes it to ``input_size`` x ``input_size`` with bicubic interpolation,
    turns it into a tensor and normalizes it with ``IMAGENET_MEAN`` /
    ``IMAGENET_STD``.

    Args:
        input_size: Side length, in pixels, of the square output.

    Returns:
        A ``torchvision.transforms.Compose`` instance.
    """
    def _ensure_rgb(img):
        # Images that are already RGB are passed through unchanged.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    return T.Compose([
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

In [6]:
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of candidate grids; element 0 is divided by
            element 1 to get the candidate's aspect ratio.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of a single square tile.

    Returns:
        The candidate with the smallest absolute aspect-ratio difference, or
        ``(1, 1)`` when ``target_ratios`` is empty. On an exact tie, a later
        candidate replaces the current choice only if the source image area
        exceeds half the pixel area of that candidate's tile grid.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            # strictly better match: always take it
            smallest_gap, chosen = gap, candidate
            continue
        # exact tie: prefer this candidate only for sufficiently large images
        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate
    return chosen

In [7]:
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image to a tile grid and cut it into square tiles.

    A (columns, rows) grid is chosen via ``find_closest_aspect_ratio`` from all
    grids whose tile count lies in ``[min_num, max_num]``. The image is resized
    to exactly fill that grid and cropped into ``image_size`` x ``image_size``
    tiles in row-major order.

    Args:
        image: PIL image to split.
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        image_size: Side length of each square tile in pixels.
        use_thumbnail: If true and more than one tile was produced, append the
            whole image resized to ``image_size`` x ``image_size``.

    Returns:
        List of PIL images: the tiles, optionally followed by the thumbnail.
    """
    orig_width, orig_height = image.size
    # aspect ratio of the incoming image
    aspect_ratio = orig_width / orig_height

    # enumerate every (columns, rows) grid with an allowed number of tiles,
    # ordered by tile count
    grid_candidates = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    grid_candidates = sorted(grid_candidates, key=lambda grid: grid[0] * grid[1])

    # choose the grid whose aspect ratio best matches the image
    grid_cols, grid_rows = find_closest_aspect_ratio(
        aspect_ratio, grid_candidates, orig_width, orig_height, image_size)

    # pixel size of the resized image and number of tiles it will yield
    target_width = image_size * grid_cols
    target_height = image_size * grid_rows
    blocks = grid_cols * grid_rows

    # resize so the image exactly fills the grid, then crop tile by tile
    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size
    processed_images = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tile = resized_img.crop((left, top, left + image_size, top + image_size))
        processed_images.append(tile)
    assert len(processed_images) == blocks

    # a single tile already shows the whole image, so no thumbnail is added then
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


In [8]:
def load_image(image_file, input_size=448, max_num=12):
    """Load an image file and turn it into a stacked tensor of tiles.

    The image is opened, converted to RGB, split with ``dynamic_preprocess``
    (thumbnail enabled) and every resulting tile is run through the transform
    from ``build_transform``.

    Args:
        image_file: Path or file object accepted by ``PIL.Image.open``.
        input_size: Side length of each square tile in pixels.
        max_num: Maximum number of tiles (not counting the thumbnail).

    Returns:
        A tensor with one entry per tile along dimension 0.
    """
    source = Image.open(image_file).convert('RGB')
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(source, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])


In [9]:
# Load InternVL3-8B in bfloat16. `device_map="auto"` lets Accelerate decide
# where the weights go on the visible GPU(s), so the model must not be moved
# again with `.cuda()` afterwards: that call is redundant when everything
# already sits on one GPU, and conflicts with Accelerate's placement when the
# model is split across several GPUs or partly offloaded.
model = AutoModel.from_pretrained(
    "OpenGVLab/InternVL3-8B",
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map="auto")

# Inference only: put the model in evaluation mode.
model = model.eval()

Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.16s/it]


In [10]:
# Load the tokenizer from the same checkpoint as the model. `trust_remote_code=True`
# allows code shipped with the checkpoint to run; `use_fast=False` requests the
# slow (Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained("OpenGVLab/InternVL3-8B", trust_remote_code=True, use_fast=False)

#### Single Image Inference

In [12]:
# `max_num` caps how many tiles the image is split into
pixel_values = (
    load_image('../DeepSeek-VL2/images/visual_grounding_1.jpeg', max_num=12)
    .to(torch.bfloat16)
    .cuda()
)
# sampling is enabled, so repeated runs may produce different answers
generation_config = {'max_new_tokens': 1024, 'do_sample': True}

In [None]:
# Single-turn question about the image loaded above.
# NOTE(review): the `<image>` placeholder presumably marks where the image is
# inserted into the prompt — confirm against the InternVL documentation.
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: <image>
Please describe the image shortly.
Assistant: The image shows two giraffes standing in a field with green grass and trees in the background, under a clear blue sky.


In [14]:
# single-image multi-round conversation: `history=None` starts a new
# conversation and `return_history=True` makes `model.chat` also return the
# history, which can be passed back in on a follow-up call
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: <image>
Please describe the image in detail.
Assistant: The image features two giraffes standing on a grassy plain. The taller giraffe is prominently positioned in the foreground, with distinctive brown and white patterned fur, while the second giraffe appears slightly behind it. The scene is set against a backdrop of trees, indicating a natural habitat or wildlife reserve. The sky is clear and blue, suggesting a bright, sunny day. The overall setting gives off a serene and wide-open savanna atmosphere.


#### Multi-Image Multi-Round Conversation

In [18]:
# multi-image multi-round conversation with separate images: each image is
# tiled on its own, the tiles are concatenated, and the per-image tile counts
# are passed along in `num_patches_list`
pixel_values1 = (
    load_image('../DeepSeek-VL2/images/visual_grounding_1.jpeg', max_num=12)
    .to(torch.bfloat16)
    .cuda()
)
pixel_values2 = (
    load_image('../DeepSeek-VL2/images/visual_grounding_2.jpg', max_num=12)
    .to(torch.bfloat16)
    .cuda()
)
pixel_values = torch.cat([pixel_values1, pixel_values2], dim=0)
num_patches_list = [tiles.size(0) for tiles in (pixel_values1, pixel_values2)]

# round 1: start a fresh conversation (history=None) and keep the history
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(
    tokenizer,
    pixel_values,
    question,
    generation_config,
    num_patches_list=num_patches_list,
    history=None,
    return_history=True,
)
print(f'User: {question}\nAssistant: {response}')

# round 2: follow-up question that continues from the history of round 1
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(
    tokenizer,
    pixel_values,
    question,
    generation_config,
    num_patches_list=num_patches_list,
    history=history,
    return_history=True,
)
print(f'User: {question}\nAssistant: {response}')

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: Image-1: <image>
Image-2: <image>
Describe the two images in detail.
Assistant: Image-1: In the image, two giraffes stand in a verdant field with a backdrop of trees and a clear blue sky. The giraffe on the left is taller and closer to the camera, showcasing its distinctive brown and white spotted pattern and long neck. The giraffe on the right is slightly shorter and positioned behind the first, also displaying the characteristic spotted pattern. Their long legs extend down to the lush green grass, and they appear to be in a natural, open environment. In the distant background, other giraffes can be faintly seen, suggesting this might be a wildlife reserve or a protected area.

Image-2: In the image, two cartoon characters, labeled "我" (me) and "导" (director), are engaged in a conversation. The character labeled "我" appears calm and is smiling, while the character labeled "导" looks visibly annoyed, as indicated by the red lines on his face and the flames around his head. A speec

#### Batch Inference

In [24]:
# batch inference: one single-image prompt per image, answered by a single
# `batch_chat` call; `num_patches_list` carries the tile count of each image
pixel_values1 = (
    load_image('../DeepSeek-VL2/images/visual_grounding_1.jpeg', max_num=12)
    .to(torch.bfloat16)
    .cuda()
)
pixel_values2 = (
    load_image('../DeepSeek-VL2/images/visual_grounding_2.jpg', max_num=12)
    .to(torch.bfloat16)
    .cuda()
)
pixel_values = torch.cat([pixel_values1, pixel_values2], dim=0)
num_patches_list = [tiles.size(0) for tiles in (pixel_values1, pixel_values2)]

# the same question is asked once per image
questions = ['<image>\nDescribe the image in detail.' for _ in num_patches_list]

responses = model.batch_chat(
    tokenizer,
    pixel_values,
    num_patches_list=num_patches_list,
    questions=questions,
    generation_config=generation_config,
)

for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: <image>
Describe the image in detail.
Assistant: The image features two giraffes standing in a grassy field. These giraffes have tall, slender necks and distinct, patchy brown and white coats, which are characteristic of their species. They are positioned against a backdrop of lush, green grass and a line of trees in the distance. The sky is clear and blue, indicating what seems to be a sunny day. In the background, some other animals can also be faintly visible, adding to the natural setting of the scene. The lighting suggests it might be early morning or late afternoon, creating long shadows on the ground.
User: <image>
Describe the image in detail.
Assistant: The image is a cartoon illustration featuring two characters standing side by side, dressed in professional attire with suits and ties. 

1. **Person on the Left**:
   - Wears a dark blazer, white shirt, and no tie.
   - Smiles with a relaxed, calm expression.
   - A speech bubble next to the head of this character reads 