### Load modules and define helpers

In [1]:
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images
import os

def load_text(fpaths, by_lines=False, encoding="utf-8"):
    """Read a text file and return its contents.

    Args:
        fpaths: Path of the single file to read (passed straight to ``open``).
        by_lines: When true, return a list of lines with their line endings
            kept (as ``readlines`` does); otherwise return the whole file as
            one string.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of strings when ``by_lines`` is true, otherwise a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(fpaths, "r", encoding=encoding) as fp:
        return fp.readlines() if by_lines else fp.read()

def load_prompt(prompt):
    """Return the text of the prompt template named ``prompt``.

    The template is read from ``prompts/<prompt>.txt``; the path is relative,
    so it is resolved against the current working directory.
    """
    prompt_path = f"prompts/{prompt}.txt"
    return load_text(prompt_path)

  from .autonotebook import tqdm as notebook_tqdm


### Load model

In [2]:
model_name = "deepseek-vl-7b-chat"

# Load the model weights from the checkpoint directory for `model_name`.
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    f"/nfs/turbo/coe-stellayu/clhsieh/Minecraft/ckpt/deepseek-ai/{model_name}",
    trust_remote_code=True,
)
# Cast to bfloat16, move to the GPU, then switch to eval mode, one step at a time.
vl_gpt = vl_gpt.to(torch.bfloat16)
vl_gpt = vl_gpt.cuda()
vl_gpt = vl_gpt.eval()

# The chat processor is resolved from the "deepseek-ai/<model_name>" identifier
# rather than from the absolute checkpoint path used for the weights above.
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
    f"deepseek-ai/{model_name}"
)
tokenizer = vl_chat_processor.tokenizer

Loading checkpoint shards: 100%|██████████| 3/3 [00:15<00:00,  5.19s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


### Load data and define task

In [3]:
# Prompt templates do not depend on the image, so read them once instead of
# once per iteration.
# NOTE(review): building_description_system is not used by this loop; it is
# still loaded in case a later cell relies on it — confirm and drop if not.
building_description_system = load_prompt("building_description_system")
building_description_query = load_prompt("building_description_query")

# For every image: build a single-turn conversation, run the model, print the
# formatted prompt plus the answer, and save the answer to a text file.
for image_name in [
    'barn_house.webp', 'castle_wall.webp', 'greek_house.webp', 'mg_nest.png',
    'japanese_house.webp', 'incinerator.png',
    'easy-0.png', 'easy-1.png', 'easy-2.png', 'easy-3.png', 'easy-4.png', 'easy-5.png',
]:
    # Images whose name contains 'easy' are read from data/easy, all others from data/mid.
    folder = 'easy' if 'easy' in image_name else 'mid'
    img_path = f"/nfs/turbo/coe-stellayu/clhsieh/Minecraft/data/{folder}/{image_name}"
    image_name_without_ext = os.path.splitext(os.path.basename(img_path))[0]

    # One output file per image: output/<model_name>/<folder>/<image stem>.txt
    output_dir = f"output/{model_name}/{folder}"
    os.makedirs(output_dir, exist_ok=True)
    output_file_path = f"{output_dir}/{image_name_without_ext}.txt"

    # A user turn carrying the image and the query, followed by an empty
    # assistant turn.
    conversation = [
        {
            "role": "User",
            "content": f"<image_placeholder>{building_description_query}",
            "images": [img_path]
        },
        {
            "role": "Assistant",
            "content": ""
        }
    ]

    # load images and prepare for inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True
    ).to(vl_gpt.device)

    # Inference only: disable autograd so the image-encoder forward pass does
    # not record a graph that is never used.
    with torch.no_grad():
        # run image encoder to get the image embeddings
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response (greedy decoding: do_sample=False)
        # NOTE(review): max_new_tokens=4800 may exceed the model's context
        # window — confirm against the checkpoint's config.
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=4800,
            do_sample=False,
            use_cache=True
        )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    print(f"{prepare_inputs['sft_format'][0]}", answer)

    # Save the answer; explicit UTF-8 so non-ASCII characters in the generated
    # text are written the same way regardless of the platform locale.
    with open(output_file_path, "w", encoding="utf-8") as file:
        file.write(answer)

You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.

User: <image_placeholder>Please analyze the attached image of a Minecraft building. The image includes a random background—ignore the background and focus solely on the building. Assume the following:
	1.	The coordinate system is centered at (0, 0, 0) with y pointing upward.
	2.	A Minecraft character is 2 blocks tall—use this detail to infer scale and proportions.

Your output must be fully self-contained and include:

    1. Detailed Building Description
        •	Identify the exact types of blocks used in constructing the building. Use the official names from the following list of common building materials:
        •	stone
        •	cobblestone
        •	oak_planks
        •	spruce_planks
        •	birch_planks
        •	brick_block
        •	stone_bricks
        •	sandstone
        •	red_sandstone


### Process task