In [3]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch

# Load Qwen3-VL-8B-Instruct; dtype and device placement are both left on "auto".
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-8B-Instruct",
    dtype="auto",
    device_map="auto",
)

# Optional alternative load: enabling flash_attention_2 is recommended for better
# acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen3VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen3-VL-8B-Instruct",
#     dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")

# Single-turn user message: one demo image (referenced by URL) followed by a text instruction.
demo_image_part = {
    "type": "image",
    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
}
demo_text_part = {"type": "text", "text": "Describe this image."}
messages = [{"role": "user", "content": [demo_image_part, demo_text_part]}]

# Apply the chat template and tokenize in one call, then move the resulting
# batch onto the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate at most 128 new tokens.
generated_ids = model.generate(**inputs, max_new_tokens=128)

# For each sequence, slice off the first len(input_ids) positions (the prompt portion)
# so that only what follows the prompt is decoded.
generated_ids_trimmed = [
    sequence[len(prompt):]
    for prompt, sequence in zip(inputs.input_ids, generated_ids)
]

# Decode to text; prints a list with one string per sequence in the batch.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)


  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 110376.42it/s]
Loading weights: 100%|██████████| 750/750 [00:02<00:00, 350.72it/s, Materializing param=model.visual.pos_embed.weight]                                 
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


['This is a heartwarming and serene photograph capturing a moment of connection between a woman and her dog on a beach at sunset.\n\n**Key Elements:**\n\n*   **The Subjects:** A woman and a yellow Labrador Retriever are the central focus.\n*   **The Woman:** She is sitting cross-legged in the sand, wearing a plaid shirt and dark pants. She has long, dark hair and is smiling warmly, looking at the dog with affection. Her right hand is extended, and the dog is gently placing its paw on her hand, likely as part of a training exercise or a playful interaction. She is also holding a']


In [4]:
# Location of a PNG image on the local filesystem. Despite the `_url` suffix this is a
# plain file path, not a URL.
# NOTE(review): absolute, machine-specific path (/home/majd/...) — will not resolve on
# another machine; consider a path relative to a configurable data directory.
image_url = "/home/majd/Documents/Projects/paper-to-poster-finetuning/images/0cdf61037d7053ca59347ab230818335.png"

In [9]:
from PIL import Image

# Local filesystem path of the image to analyse.
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
image_path = "/home/majd/Documents/Projects/paper-to-poster-finetuning/images/0cdf61037d7053ca59347ab230818335.png"

# Image.open() is lazy: it identifies the file but does not decode the pixel data.
# Calling load() forces a full decode so that a truncated or corrupt file fails here
# rather than later during preprocessing, and the `with` block closes the underlying
# file handle once the pixels are in memory. `image` remains usable afterwards.
with Image.open(image_path) as image:
    image.load()
print(f"Image size: {image.size}, mode: {image.mode}")

Image size: (3456, 2304), mode: RGB


In [10]:
# Instruction text sent to the model as the text part of the user message.
# It has two parts: a role/guidelines preamble, then a JSON schema template the model is
# asked to fill in. The template is illustrative rather than valid JSON (it uses
# placeholders such as `number`, `boolean`, `"a|b"` alternatives and `[...]`).
# NOTE(review): the preamble says "Do NOT transcribe text content", yet the schema asks
# for each section's "title" text — confirm which behaviour is intended.
user_prompt = """You are a precise layout analysis system for academic posters. Your task is to extract structural and spatial information that enables exact LaTeX reconstruction.

Focus on:
- Grid structure and column layout
- Section positions and dimensions (as percentages)
- Section styling (colors, borders, headers)
- Figure types and arrangements within sections
- Visual hierarchy and reading flow

Do NOT transcribe text content - only capture structure.

Output valid JSON following the schema exactly.

Analyze this academic poster's layout structure. Output JSON following this schema:

{
  "poster": {
    "orientation": "landscape|portrait",
    "aspect_ratio": "e.g., 16:9, 4:3, 1.41:1",
    "background": "#hex"
  },
  "header": {
    "height_pct": number,
    "background": "#hex|gradient|none",
    "gradient_colors": ["#hex1", "#hex2"] or null,
    "title_alignment": "left|center",
    "logo_positions": ["left", "right", "none"],
    "has_author_affiliation_superscripts": boolean
  },
  "footer": {
    "present": boolean,
    "height_pct": number or null,
    "background": "#hex|gradient|none",
    "content": "contact|references|none"
  },
  "body": {
    "columns": number,
    "column_widths": ["equal"] or ["30%", "40%", "30%"],
    "gutter_pct": number
  },
  "sections": [
    {
      "id": number,
      "title": "Section Title Text",
      "column": number,
      "column_span": number,
      "row_in_column": number,
      "height_pct": number,
      "style": {
        "header_bg": "#hex|none",
        "header_text_color": "#hex",
        "body_bg": "#hex|transparent",
        "border": "#hex|none",
        "border_radius": "none|small|medium|large"
      },
      "content_type": "text|bullets|figure|table|equation|flowchart|mixed",
      "content_layout": {
        "arrangement": "vertical|horizontal|grid",
        "split": "description if mixed",
        "figure_count": number or null,
        "has_equations": boolean
      }
    }
  ],
  "figures": [...],
  "flowcharts": [...],
  "special_elements": [...],
  "color_scheme": {...},
  "reading_order": "columns-left-to-right|rows-top-to-bottom|numbered|arrows"
}"""

# Build the chat message from the PIL image opened earlier in the notebook (`image`)
# and the layout-analysis prompt (`user_prompt`).
# Content must always be a list of typed dicts for Qwen3-VL.
# (The former `image_url = "YOUR_POSTER_IMAGE_URL_HERE"` placeholder was removed: it was
# never read and only overwrote the earlier `image_url` value.)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": user_prompt},
        ],
    }
]

# Preparation for inference. The `processor` created next to `model` in the first cell
# is reused; it was loaded from the same "Qwen/Qwen3-VL-8B-Instruct" checkpoint, so
# loading it a second time here was redundant.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Sampling is enabled below, so seed torch's RNG first; otherwise every run of this cell
# can produce a different JSON document for the same poster.
torch.manual_seed(0)

# Inference: low-temperature sampling, with a budget of up to 4096 new tokens.
generated_ids = model.generate(
    **inputs,
    max_new_tokens=4096,
    temperature=0.1,
    do_sample=True,
)

# Slice off the first len(input_ids) positions of each output row (the prompt portion)
# so only what follows the prompt is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

# Only one conversation was submitted, so print the single decoded string.
print(output_text[0])

```json
{
  "poster": {
    "orientation": "landscape",
    "aspect_ratio": "16:9",
    "background": "#FFFFFF"
  },
  "header": {
    "height_pct": 10,
    "background": "#FFFFFF",
    "gradient_colors": null,
    "title_alignment": "center",
    "logo_positions": ["left", "right"],
    "has_author_affiliation_superscripts": true
  },
  "footer": {
    "present": true,
    "height_pct": 5,
    "background": "#FFFFFF",
    "content": "references"
  },
  "body": {
    "columns": 3,
    "column_widths": ["30%", "40%", "30%"],
    "gutter_pct": 5
  },
  "sections": [
    {
      "id": 1,
      "title": "Introduction",
      "column": 1,
      "column_span": 1,
      "row_in_column": 1,
      "height_pct": 20,
      "style": {
        "header_bg": "#0033A0",
        "header_text_color": "#FFFFFF",
        "body_bg": "#FFFFFF",
        "border": "#0033A0",
        "border_radius": "small"
      },
      "content_type": "text",
      "content_layout": {
        "arrangement": "vertical",
   