In [3]:
#!pip install --upgrade transformers==4.37.2

In [4]:
# These packages may not be strictly necessary here, but most models have required them
#!pip install einops accelerate timm

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel
import math
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

In [6]:
def split_model(num_layers=80, world_size=None):
    """Build a ``device_map`` that spreads the language-model layers over GPUs.

    GPU 0 also receives ``vision_model``, ``mlp1`` and the embedding / norm /
    output modules, so it is budgeted as half a GPU and is given roughly half
    as many decoder layers as each of the other devices.

    Args:
        num_layers: Number of ``language_model.model.layers.*`` entries to
            place. Defaults to 80, the value this function used to hard-code.
        world_size: Number of GPUs to split across. When ``None`` (default)
            ``torch.cuda.device_count()`` is used.

    Returns:
        dict mapping module-name strings to integer GPU indices.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
        RuntimeError: if there is no GPU to place the model on.
    """
    if num_layers < 1:
        raise ValueError(f'num_layers must be >= 1, got {num_layers}')
    if world_size is None:
        world_size = torch.cuda.device_count()
    if world_size < 1:
        # Previously this surfaced as an opaque IndexError further down.
        raise RuntimeError('split_model() needs at least one CUDA device, found none')

    device_map = {}
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    layers_per_gpu = [math.ceil(num_layers / (world_size - 0.5))] * world_size
    layers_per_gpu[0] = math.ceil(layers_per_gpu[0] * 0.5)

    layer_cnt = 0
    for gpu_index, layer_budget in enumerate(layers_per_gpu):
        for _ in range(layer_budget):
            # The budgets are rounded up, so their sum can exceed num_layers;
            # stop instead of emitting entries for layers that do not exist.
            if layer_cnt >= num_layers:
                break
            device_map[f'language_model.model.layers.{layer_cnt}'] = gpu_index
            layer_cnt += 1

    # Everything that is not a decoder layer is pinned to GPU 0.
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    # The last decoder layer is moved back to GPU 0 — presumably so its output
    # sits on the same device as the final norm / head (TODO confirm).
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

In [7]:
# Per-channel normalisation constants handed to T.Normalize below.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """Return the torchvision pipeline applied to every image tile.

    The pipeline converts the image to RGB when needed, resizes it to a
    square of ``input_size`` pixels with bicubic interpolation, turns it into
    a tensor and normalises it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Edge length, in pixels, of the square output.

    Returns:
        A ``T.Compose`` object.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

In [8]:
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the grid from ``target_ratios`` whose aspect ratio is nearest.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of ``(columns, rows)`` candidates, examined in
            the order given.
        width: Source image width, used only for tie-breaking.
        height: Source image height, used only for tie-breaking.
        image_size: Edge length of one square tile.

    Returns:
        The chosen candidate, or ``(1, 1)`` when ``target_ratios`` is empty.
    """
    source_area = width * height
    winner = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap = gap
            winner = candidate
        elif (gap == smallest_gap
              and source_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]):
            # Exact tie: a later candidate only takes over when the source
            # image covers more than half of that candidate's pixel area.
            winner = candidate

    return winner

In [9]:
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize ``image`` onto a tile grid and cut it into square crops.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (a PIL image
            in this notebook).
        min_num: Smallest number of tiles a candidate grid may contain.
        max_num: Largest number of tiles a candidate grid may contain.
        image_size: Edge length, in pixels, of every tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to a single tile.

    Returns:
        List of tiles in row-major order, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_aspect = src_w / src_h

    # Every (columns, rows) grid whose tile count lies in [min_num, max_num],
    # ordered by tile count.
    grid_options = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    grid_options = sorted(grid_options, key=lambda g: g[0] * g[1])

    # Grid whose shape best matches the source image.
    grid = find_closest_aspect_ratio(
        src_aspect, grid_options, src_w, src_h, image_size)

    target_width = image_size * grid[0]
    target_height = image_size * grid[1]
    blocks = grid[0] * grid[1]

    # Stretch the image to cover the grid exactly, then slice it up.
    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size
    processed_images = []
    for idx in range(blocks):
        row, col = divmod(idx, tiles_per_row)
        box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size
        )
        processed_images.append(resized_img.crop(box))
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

In [10]:
import requests
from io import BytesIO

def load_image(image_file, input_size=448, max_num=12, timeout=30):
    """Load an image from a URL or a local source and turn it into model input.

    Args:
        image_file: ``http://`` / ``https://`` URL, or anything
            ``PIL.Image.open`` accepts (path or binary file object).
        input_size: Edge length, in pixels, of each tile.
        max_num: Maximum number of tiles passed on to ``dynamic_preprocess``.
        timeout: Seconds to wait for the HTTP request before giving up; only
            used for URLs.

    Returns:
        Tensor stacking the transformed tiles along a new first dimension
        (tiles plus the thumbnail that ``use_thumbnail=True`` may add).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the download exceeds ``timeout``.
    """
    # Match the scheme explicitly so a local path that merely starts with
    # "http" is not mistaken for a URL.
    if isinstance(image_file, str) and image_file.startswith(("http://", "https://")):
        response = requests.get(image_file, timeout=timeout)
        # Fail on 4xx/5xx here rather than letting PIL choke on an error page.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file

    # convert() returns an independent, fully loaded image, so the handle
    # Pillow opened can be released as soon as the block exits.
    with Image.open(source) as opened:
        image = opened.convert('RGB')

    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = torch.stack([transform(img) for img in images])

    return pixel_values

In [11]:
# Hugging Face Hub repository the checkpoint is loaded from.
path = "nvidia/NVLM-D-72B"
# Placement of the model's sub-modules on the visible GPUs.
device_map = split_model()
# Loading options are collected first so the call itself stays short.
load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "low_cpu_mem_usage": True,
    "use_flash_attn": False,
    "trust_remote_code": True,
    "device_map": device_map,
}
model = AutoModel.from_pretrained(path, **load_kwargs)
# Kept as an assignment so the cell does not display the module repr.
model = model.eval()



config.json:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

configuration_nvlm_d.py:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

configuration_intern_vit.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/NVLM-D-72B:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nvidia/NVLM-D-72B:
- configuration_nvlm_d.py
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_nvlm_d.py:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

modeling_intern_vit.py:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/NVLM-D-72B:
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


conversation.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nvidia/NVLM-D-72B:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nvidia/NVLM-D-72B:
- modeling_nvlm_d.py
- modeling_intern_vit.py
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/151k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/46 [00:00<?, ?it/s]

model-00001-of-00046.safetensors:   0%|          | 0.00/3.73G [00:00<?, ?B/s]

model-00002-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00003-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00004-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00005-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00006-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00007-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00008-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00009-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00010-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00011-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00012-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00013-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00014-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00015-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00016-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00017-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00018-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00019-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00020-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00021-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00022-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00023-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00024-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00025-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00026-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00027-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00028-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00029-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00030-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00031-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00032-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00033-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00034-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00035-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00036-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00037-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00038-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00039-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00040-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00041-of-00046.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model-00042-of-00046.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00043-of-00046.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00044-of-00046.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00045-of-00046.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00046-of-00046.safetensors:   0%|          | 0.00/3.53G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/46 [00:00<?, ?it/s]

In [12]:
# Tokenizer from the same repository as the model, slow implementation requested.
tokenizer = AutoTokenizer.from_pretrained(
    path,
    trust_remote_code=True,
    use_fast=False,
)
# Generation settings: sampling disabled, at most 1024 newly generated tokens.
generation_config = {'max_new_tokens': 1024, 'do_sample': False}



tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# All sample images share one base URL and the .jpg extension; only the file
# stem differs, so the list is assembled from the stems.
_SAMPLEBOOK_IMG_BASE = 'https://samplebook.photos/img/'
_SAMPLEBOOK_IMG_IDS = (
    '4786a', '284v', '3193c', '948d', '3191i', '4788oo',
    '2086d', '4781r', '5521n', '2073cc', '2085b', '4782c',
    '4783v', '4912gg', '304h', '2078i', '2077q', '3192h',
    '904x', '4792b', '5301p', '5207j', '1007p', '17l',
    '286t', '287g', '3189j', '4911d', '4785a', '4793a',
)
urls = [f'{_SAMPLEBOOK_IMG_BASE}{img_id}.jpg' for img_id in _SAMPLEBOOK_IMG_IDS]

In [14]:
# Instruction sent with every image; split across lines for readability only.
prompt = (
    'There are two objects in this image. '
    'One is a color checker, and the other is a photographic sample book, '
    'open to a page containing a photograph. '
    'Give a detailed description of the photograph.'
)

In [15]:
def get_response(url):
    """Ask the model to describe the image at ``url`` using the global ``prompt``.

    Args:
        url: Image location forwarded to ``load_image``.

    Returns:
        Whatever ``model.chat`` returns for the image and prompt.
    """
    tiles = load_image(url, max_num=12)
    # Cast to bfloat16, the dtype the model was loaded with.
    tiles = tiles.to(torch.bfloat16)
    return model.chat(tokenizer, tiles, f'<image>\n{prompt}', generation_config)

In [16]:
# Maps each image URL to a dict of {model name: generated description}.
responses_nvlm = {}

# Images are processed one at a time, in the order listed in `urls`.
for url in urls:
    response = get_response(url)
    # Echo each result as soon as it is available.
    print(url,"\n",response,"\n\n")
    # Keyed by model name — presumably so results from other models can be
    # merged into the same structure later (TODO confirm).
    responses_nvlm[url] = {'nvidia/NVLM-D-72B':response}

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4786a.jpg 
 The photograph in the book is a black-and-white image of a landscape. The landscape is dominated by a large, rocky outcrop in the foreground, with a few smaller rocks scattered around it. The sky is clear, and there are no visible clouds. The photograph is labeled "GEVAERT RIDAX 8" in the bottom right corner. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/284v.jpg 
 The photograph depicts a power station, specifically a substation. The substation is equipped with high-voltage power lines and transformers. The power lines are supported by tall metal towers, and the transformers are large, cylindrical, and white. The substation is enclosed by a fence, and the background features a hilly landscape. The photograph is in black and white, and the image quality is sharp and clear. The photograph is labeled "VARILOR" and "WHITE GLOSSY," indicating the type of photographic paper used. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/3193c.jpg 
 The photograph depicts a room with a chandelier hanging from the ceiling. The chandelier is the focal point of the image, and it is surrounded by various pieces of furniture, including a sofa, chairs, and a table. The room appears to be well-decorated, with framed pictures hanging on the walls and a potted plant in the corner. The overall atmosphere of the room is one of elegance and sophistication. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/948d.jpg 
 The photograph depicts a serene landscape scene, likely taken in a forest or a park. The image is in black and white, giving it a timeless and classic feel. The photograph captures a grove of trees, with their trunks and branches prominently featured. The trees appear to be tall and slender, with some of them reaching up towards the sky, while others are slightly bent or angled. The foliage is dense, with leaves creating a canopy that filters the light, casting dappled shadows on the ground below. The ground is covered with grass and underbrush, and there are no visible signs of human presence, such as buildings or pathways. The overall composition of the photograph is balanced and harmonious, with the trees and foliage creating a natural frame around the scene. The photograph conveys a sense of tranquility and connection to nature. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/3191i.jpg 
 The photograph depicts a fountain in a public square. The fountain is constructed of stone and features a cascading water display. The water is captured in mid-flow, creating a sense of movement and energy in the image. The fountain is surrounded by a paved area, and in the background, there are buildings with classical architectural elements, such as columns and arches. The overall atmosphere of the photograph is one of tranquility and historical significance. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4788oo.jpg 
 The photograph depicts a man in a black and white image. He is wearing a hat and a suit, and he is smoking a pipe. The photograph is in a book, and the page is labeled "ADLUX." 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/2086d.jpg 
 The photograph depicts a man and a woman in a close embrace, with the man's head slightly tilted back and the woman's head resting on his chest. The man's left hand is positioned on the woman's back, while his right hand is raised, possibly to adjust his glasses. The woman's left hand is not visible, but her right hand is placed on the man's chest. The background of the photograph is not clearly visible, but it appears to be a dark, possibly blurred setting. The photograph is printed on Kodak Ektafine Paper X, as indicated by the label on the bottom right corner of the page. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4781r.jpg 
 The photograph depicts a man seated at a desk, with a pen in his right hand and a piece of paper in front of him. He is wearing a white shirt and a tie. The background is a plain wall, and there are no other objects in the image. The photograph is in black and white. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/5521n.jpg 
 The photograph depicts a wine barrel, a cylindrical container used to store wine. The barrel is made of wood and has metal bands around it to hold the staves together. The photograph is in black and white, and the lighting is such that the barrel casts a shadow on the ground behind it. The photograph is printed on a page of the sample book, which is open to this page. The page is labeled "Kodak Medalist Paper F" and indicates that the photograph is available in both SW (slow-speed) and DW (double-weight) formats. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/2073cc.jpg 
 The photograph in the book is a black-and-white aerial view of a prison. The prison is surrounded by a high wall, and there are several buildings within the compound. The buildings are arranged in a way that suggests a central courtyard or exercise area. The photograph is labeled "KODAK BROMIDE BASED WATERPROOF PAPER." 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/2085b.jpg 
 The photograph depicts a person standing on a surface covered with small rocks. The person is wearing a short-sleeved shirt and shorts. They are holding a stick in their right hand, which is raised above their head. The person's left hand is not visible in the photograph. The photograph is in black and white. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4782c.jpg 
 The photograph depicts a man seated on a boat. The boat is equipped with a sail, which is currently furled. The man is wearing a long, loose garment, and a turban or headscarf. He is holding onto the boat's railing with one hand, and there is a paddle resting on his lap. The background of the photograph shows a body of water, and the boat appears to be stationary. The photograph is in black and white, and the lighting suggests that it was taken during the day. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4783v.jpg 
 The photograph depicts a sculpture of an elephant, which is positioned in the foreground. The sculpture is made of a material that appears to be stone or a similar substance, and it is highly detailed, capturing the texture and form of the elephant. The sculpture is placed on a surface that is patterned with a grid of squares, which may be a part of the base or plinth on which the sculpture rests. In the background, there is a vase with a distinctive design, featuring a pattern of curved lines and shapes. The vase is positioned slightly to the left of the center of the image and is partially obscured by the elephant sculpture. The overall composition of the photograph is carefully arranged, with the objects placed in a way that creates a sense of balance and harmony. The lighting in the photograph is soft and even, highlighting the details of the sculpture and the vase without creating harsh shadows or reflections. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4912gg.jpg 
 The photograph depicts a dam, with water cascading over its crest. The dam is constructed of concrete and features a curved design, which is a common characteristic of such structures. The water flowing over the dam creates a misty effect, indicating the force and volume of the water. The photograph is in black and white, which adds a timeless quality to the image. The dam's structure and the water's movement are the main focal points of the photograph. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/304h.jpg 
 The photograph depicts a man operating a printing press. The man is positioned to the right of the image, with his back facing the camera. He is wearing a white shirt and dark trousers. The printing press is a large, industrial machine with various levers, buttons, and dials. The machine is predominantly black, with some silver and metallic parts. The background of the image is a plain, light-colored wall. The photograph is in black and white, and the lighting is even, with no harsh shadows or bright highlights. The overall mood of the image is one of industry and hard work. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/2078i.jpg 
 The photograph depicts a white object with a rounded top and a cylindrical base. The object is positioned against a plain background, which appears to be a gradient of light to dark gray. The lighting in the photograph is even, with no harsh shadows or highlights, suggesting the use of diffused light. The photograph is in black and white, which emphasizes the contrast between the object and the background. The object's surface appears smooth and slightly reflective, indicating that it may be made of a glossy material such as plastic or ceramic. The photograph is sharp and in focus, with clear details visible on the object's surface. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/2077q.jpg 
 The photograph depicts a person standing on a snowy mountain. The person is wearing a hat and a long coat, and they are holding a walking stick. The mountain is covered in snow, and there are other mountains in the background. The sky is clear, and the sun is shining. The photograph is in black and white, and it has a vintage look to it. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/3192h.jpg 
 The photograph depicts a scene viewed through an archway. The archway is constructed of stone and features a circular top. Beyond the archway, a tall, slender tower is visible in the distance. The tower is also made of stone and has a pointed top. The sky is partly cloudy, with patches of blue visible between the clouds. The ground in the foreground is covered with grass, and a path leads from the archway to the tower. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/904x.jpg 
 The photograph depicts a winter scene, with snow blanketing the ground and trees. The trees are leafless, indicating that it is winter. The snow appears to be fresh, as there are no visible footprints or tire tracks. The sky is overcast, suggesting that it may continue to snow. The photograph is in black and white, giving it a timeless quality. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4792b.jpg 
 The photograph depicts a table with a white tablecloth, on which a black plate is placed. The plate is adorned with intricate patterns, and a black knife and fork are positioned on either side of the plate. The table is surrounded by chairs with a similar design, featuring a combination of straight and curved lines. The overall composition of the photograph is symmetrical, with the plate and utensils centered on the table. The black and white color scheme creates a stark contrast, highlighting the details of the objects. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/5301p.jpg 
 The photograph depicts a woman with a vase on her head. The vase is adorned with a floral pattern, and the woman's hair is styled in an updo, with a few strands falling loose. She is wearing a necklace, and her expression is neutral. The background is blurred, with soft lighting that creates a dreamy atmosphere. The overall composition of the photograph is balanced, with the woman's head and the vase positioned in the center of the frame. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/5207j.jpg 
 The photograph depicts a black-and-white image of a cat, which is the primary subject. The cat is positioned in the center of the image, with its head slightly tilted to the right. The cat's eyes are open and appear to be looking directly at the camera. The background of the image is blurred, with indistinct shapes and shadows that suggest an indoor setting. The photograph is printed on a high-quality paper that enhances the contrast and sharpness of the image. The overall composition of the photograph is balanced, with the cat's position and gaze drawing the viewer's attention to the center of the image. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/1007p.jpg 
 The photograph depicts a young child standing on a surface. The child is dressed in a short-sleeved shirt and shorts, with socks and shoes. The child's hair is curly and appears to be blonde. The background of the photograph is plain and unobtrusive, likely a studio setting. The photograph is in black and white, and the lighting is even, with no harsh shadows or bright highlights. The child's expression is one of curiosity or interest, as they look down at their feet. The photograph is sharp and clear, with good contrast between the light and dark areas. Overall, the photograph captures a candid moment of a young child exploring their surroundings. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/17l.jpg 
 The photograph depicts a woman standing in a relaxed pose, with her right hand on her hip and her left hand holding a hose. She is wearing a short, sleeveless dress that appears to be white. The background of the photograph is a simple, unobtrusive setting that does not distract from the subject. The lighting in the photograph is even and natural, highlighting the woman's features and the details of her dress. The photograph is well-composed, with the woman centered in the frame and the hose leading the viewer's eye to her. Overall, the photograph is a well-executed portrait of a woman in a casual, everyday setting. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/286t.jpg 
 The photograph depicts a woman standing outdoors, dressed in a white dress. She is positioned in front of a white picket fence, which is partially open, revealing a brick wall behind it. The background features a lush, green garden with various plants and trees. The woman is holding onto the fence with one hand, and her other hand is resting on her hip. She is looking directly at the camera, with a slight smile on her face. The photograph is in black and white, and the lighting appears to be natural, with soft shadows. The overall mood of the photograph is calm and serene. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/287g.jpg 
 The photograph depicts two individuals, one male and one female, both dressed in white pants and black tops. The woman is seated on a windowsill, with her legs crossed and her head resting on her hand. The man is standing next to her, leaning against the wall with his arms crossed. The background features a large window with a grid pattern, and the overall tone of the image is black and white. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/3189j.jpg 
 The photograph depicts a dirt road that stretches into the distance, flanked by grassy fields. The road curves slightly as it approaches the horizon, where it meets a line of hills or mountains. The sky is clear, with no visible clouds. There are no vehicles or people on the road, and no buildings or other structures are visible. The overall impression is of a rural, isolated area. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4911d.jpg 
 The photograph depicts a rustic scene, likely from a rural setting. In the foreground, there is a thatched-roof hut, suggesting traditional construction methods. The thatched roof is a key feature, indicating a style of architecture that utilizes natural materials for insulation and weather protection. The hut is surrounded by a few trees, which provide a sense of seclusion and connection to nature. The trees are relatively tall and have a sparse arrangement, allowing sunlight to filter through and illuminate the scene. The ground appears to be a dirt path or open area, with no visible vegetation, indicating a well-trodden or cleared space. The overall atmosphere of the photograph is serene and pastoral, capturing a moment of simplicity and tranquility in a rural environment. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4785a.jpg 
 The photograph depicts a woman with a serene expression, her head slightly tilted to the side. She is adorned with a headdress that appears to be made of feathers, adding an element of elegance to her appearance. The background of the photograph is blurred, ensuring that the focus remains solely on the woman. The image is framed within a border, and the page it is placed on is slightly yellowed, suggesting that the book has been in use for some time. 




Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


https://samplebook.photos/img/4793a.jpg 
 The photograph depicts a man in a striped shirt, who appears to be in a state of distress. He is holding a bottle, which he is about to throw. The man is wearing a hat, and his facial expression suggests that he is experiencing a great deal of pain. 




In [17]:
import pickle

In [18]:
# Persist the collected responses to a pickle file in the working directory.
# NOTE(review): only unpickle this file from a trusted source; pickle.load can
# execute arbitrary code.
with open("responses_nvlm.pkl", "wb") as f:
    pickle.dump(responses_nvlm, f)