### Load module

In [1]:




# url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
# image = Image.open(requests.get(url, stream=True).raw)




In [2]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
import os

def load_text(fpaths, by_lines=False):
    with open(fpaths, "r") as fp:
        if by_lines:
            return fp.readlines()
        else:
            return fp.read()

def load_prompt(prompt):
    return load_text(f"prompts/{prompt}.txt")

  from .autonotebook import tqdm as notebook_tqdm


### Load model

In [3]:
model_name = "Llama-3.2-11B-Vision-Instruct"

model = MllamaForConditionalGeneration.from_pretrained(
    f"/nfs/turbo/coe-stellayu/clhsieh/Minecraft/ckpt/meta-llama/{model_name}",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(f"meta-llama/{model_name}")

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 5/5 [00:08<00:00,  1.69s/it]


### Load data and define task

In [4]:
for image_name in ['barn_house.webp', 'castle_wall.webp', 'greek_house.webp', 'mg_nest.png', 'japanese_house.webp', 'incinerator.png', 'easy-0.png', 'easy-1.png', 'easy-2.png', 'easy-3.png', 'easy-4.png', 'easy-5.png']: # 
# image_name = 'greek_house.webp' # barn_house.webp, castle_wall.webp, greek_house.webp, mg_nest.png, japanese_house.webp, incinerator.png
    # image_name = f'easy-{i}.png'
    folder = 'easy' if 'easy' in image_name else 'mid'
    img_path = f"/nfs/turbo/coe-stellayu/clhsieh/Minecraft/data/{folder}/{image_name}"
    image_name_without_ext = os.path.splitext(os.path.basename(img_path))[0]
    input_image = Image.open(img_path)

    output_dir = f"output/{model_name}/{folder}"
    os.makedirs(output_dir, exist_ok=True)
    output_file_path = f"{output_dir}/{image_name_without_ext}.txt"
    

    building_description_system = load_prompt("building_description_system")
    building_description_query = load_prompt("building_description_query")
    
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": building_description_system},
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": input_image,
                },
                {"type": "text", "text": building_description_query},
            ],
        },
    ]

    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        input_image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=4800)
    print(processor.decode(output[0]))

    
    # Save the output_text to the file
    with open(output_file_path, "w") as file:
        file.write(processor.decode(output[0]))

    

    # Display the image
    # display(Image(filename=img_path))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 14 Apr 2025

[{'type': 'text', 'text': 'You are a Vision-Language model designed to analyze Minecraft structures using an xyz coordinate system, where y points upward toward the sky. When given an image of a Minecraft building (with a random background), your task is to produce a self-contained, structured output that allows for complete reconstruction of the building without referencing the original image.\n\nKey Guidelines:\n\t1.\tCoordinate System Awareness:\n\n\t•\tUse an xyz coordinate system centered at (0, 0, 0), with y pointing upward.\n\t•\tExpress all spatial references, dimensions, and positions using this system.\n\n\t2.\tScale Awareness:\n\n\t•\tA typical Minecraft character is 2 blocks tall. Use this detail to estimate and infer the proportions and scale of the building components.\n\n\t3.\tDetailed Building Description:\n\n\t•\tIdentify all visible blocks/materi

### Process task