In [None]:
import torch
import requests
from io import BytesIO
from PIL import Image
from torchvision import transforms
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=torch.float32,  # Use float32 explicitly
    device_map="auto"
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

In [None]:
def load_and_transform_images(image_paths):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize to match model input
        transforms.ToTensor(),  # Convert image to tensor
    ])

    images = []
    for path in image_paths:
        image = Image.open(path).convert("RGB")  # Load image from local file
        image_tensor = transform(image)  # Convert to tensor
        image_pil = transforms.ToPILImage()(image_tensor)  # Convert back to PIL
        images.append(image_pil)
    
    return images


images = [
    "/root/workdir/NAVSIM/navsim/tutorial/back_view.jpg",
    "/root/workdir/NAVSIM/navsim/tutorial/front_view_2.jpg",
]

image_batch = load_and_transform_images(images)

messages = [
    [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": "Describe this image."},
        ],
    }]
    for img in image_batch
]

# Preparation for batch inference
texts = [
    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    for msg in messages
]

image_inputs, video_inputs = process_vision_info(messages)


inputs = processor(
    text=texts,
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda", dtype=torch.float32)

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_texts = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)



The image depicts a street scene in an urban environment. The street is lined with palm trees on the left side, suggesting a warm climate. There are buildings on both sides of the street, with one building on the right side having a large sign that reads "Wynn." The street appears to be busy with several cars, including a white car in the foreground and other vehicles further down the road. The sky is clear, indicating a sunny day. The overall atmosphere of the image is vibrant and lively, typical of a bustling city street.
The image depicts a street scene in an urban environment. The foreground features a black SUV parked on the left side of the street. The street is lined with buildings on both sides, primarily made of brick and concrete. The buildings have a mix of architectural styles, with some appearing older and others more modern. 

The street itself is relatively narrow, with a crosswalk visible in the middle of the image. The crosswalk is marked with white lines and a pedestr

In [28]:
for t in output_texts:
    print("#"*100)
    print(t)

####################################################################################################
The image depicts a street scene in an urban environment. The street is lined with palm trees on the left side, suggesting a warm climate. There are buildings on both sides of the street, with one building on the right side having a large sign that reads "Wynn." The street appears to be busy with several cars, including a white car in the foreground and other vehicles further down the road. The sky is clear, indicating a sunny day. The overall atmosphere of the image is vibrant and lively, typical of a bustling city street.
####################################################################################################
The image depicts a street scene in an urban environment. The foreground features a black SUV parked on the left side of the street. The street is lined with buildings on both sides, primarily made of brick and concrete. The buildings have a mix of architectural style

In [None]:
from qwen_vl_utils import process_vision_info
import torch
from torch.cuda.amp import autocast
from transformers import AutoProcessor, AutoModelForImageTextToText

gpu_id=0
device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
#device="cpu"

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.float16) # , torch_dtype=torch.float16
model.to(device)
model.eval()

# instruction = """You are an Autonomous Driving AI assistant. You receive a 1024*256 pixels image of the front view, from the driver's perspective.
# Your task is to analyze the images and provide insights based on the visual data. Focus on the followings:
# - traffic participants (vehicles, pedestrians etc.)
# - traffic lights with color
# As your result please use the following template:
# {{
#     traffic_participants: [
#         participant_1_name: {{
#             position: x1,y1 - x2,y2 
#         }},
#         participant_2_name: {{
#             position: x1,y1 - x2,y2 
#         }},
#         ...
#     ],
#     traffic_lights: [
#         light_1: {{
#             position: x1,y1 - x2,y2,
#             color:
#         }},
#         light_2: {{
#             position: x1,y1 - x2,y2,
#             color:
#         }}
#         ...
#     ]
# }}"""

instruction = """You receive an image from the driver's perspective. Your task is to describe the image in high details,
so the driver can make informed driving decisions. Only focus on objects related to driving, like vehicles, pedestrian traffic lights etc."""
#- any important objects that can affect to make inform driving decisions"""
messages = [
    { 
        "role": "system", 
        "content": "You are an Autonomous Driving AI assistant."
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/root/workdir/NAVSIM/navsim/tutorial/front_view_2.jpg",
                "resized_height": 256,
                "resized_width": 1024,
            },
            {"type": "text", "text": f"{instruction}"},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(device)

# Inference: Generation of the output
with autocast():
    generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|██████████| 5/5 [00:16<00:00,  3.39s/it]


['The image shows a city street intersection with several key elements related to driving:\n\n1. **Traffic Lights**: There are traffic lights at the intersection, with one visible in the center of the image. The light is currently red, indicating that vehicles must stop.\n\n2. **Pedestrian Crosswalk**: The intersection has a pedestrian crosswalk marked with white lines, indicating where pedestrians can cross the street safely.\n\n3. **Vehicles**: There is a black SUV parked on the left side of the image, and a few other vehicles are visible in the distance, including a white car and a black car.\n\n4. **Buildings**: The buildings on both sides of the street are multi-story and appear to be commercial or residential structures. The building on the left has large windows and a sign that is partially visible.\n\n5. **Street Signs**: There is a street sign visible in the center of the image, indicating the name of the street. The sign is partially obscured by the traffic light.\n\n6. **Ped

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import random
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
l = 10
example = [[random.uniform(0, 1) for _ in range(3)] for _ in range(10)]
# prompt = f"""Your task is to predict the optimal future waypoints of the vehicle, that you are driving, based on the description of current traffic conditions.
# Here is the description: {output_text}
# Your response should only be a list with exaclty {l} items, where each item is a 3 length long list, representing a single waypoint of float numbers (x, y, heading).
# Example format of response: {example}
# You have to return a {l}*3 flaot list even if you are not sure about the exact numbers. Please write the numbers, that you think alligns the best with the description."""

prompt = f"""Your task is to decide whether to slow down (1) or continue (0) based on traffic conditions.
Return only a single integer: 0 or 1"""
input_text = "Traffic is heavy, the road is wet."

messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": input_text}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(response)

RuntimeError: "triu_tril_cuda_template" not implemented for 'BFloat16'

In [15]:
example

[[0.4616067224992072, 0.573124888842575, 0.9154594068375469],
 [0.05076615814367358, 0.3129030140499487, 0.24719893777321644],
 [0.44775105109613245, 0.4262400593326674, 0.043178324234017684],
 [0.32509374898888066, 0.7139883520707607, 0.32776340405167925],
 [0.5171830763702864, 0.2185651650523609, 0.6094962197543433],
 [0.2815316519949732, 0.3850637283156816, 0.8781078240564743],
 [0.7015948343868881, 0.1798622772687748, 0.3996371806942772],
 [0.24650230825816344, 0.2760869516899871, 0.23709706586921275],
 [0.408464720478357, 0.8613100052209202, 0.4145548431967745],
 [0.42428768955149687, 0.24588191945836724, 0.4588197836103255]]

In [18]:
import numpy as np
import ast

data_list = ast.literal_eval(response)
poses = np.array(data_list)
poses

array([[-0.15,  0.5 ,  1.57],
       [ 0.2 ,  0.55,  0.52],
       [ 0.35,  0.45,  0.05],
       [ 0.1 ,  0.75,  2.35],
       [ 0.5 ,  0.2 ,  0.65],
       [ 0.25,  0.3 ,  1.45],
       [ 0.7 ,  0.15,  0.4 ],
       [ 0.2 ,  0.25,  0.25],
       [ 0.4 ,  0.85,  1.85],
       [ 0.4 ,  0.25,  0.45]])

  from .autonotebook import tqdm as notebook_tqdm
