In [1]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Device string used for the vision-language model in this cell; the checkpoint
# is placed on it through `device_map`.
DEVICE = 'cuda:3'

# Hub identifier shared by the model weights and the matching processor.
QWEN_CHECKPOINT = "Qwen/Qwen2.5-VL-7B-Instruct"

# Load the weights in half precision directly onto DEVICE.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    QWEN_CHECKPOINT,
    torch_dtype="float16",
    device_map=DEVICE,
)
processor = AutoProcessor.from_pretrained(QWEN_CHECKPOINT)

# Mirror vision-tower settings from the model config onto the processor.
# `num_additional_image_tokens` falls back to 0 when the config lacks it.
vision_config = model.config.vision_config
processor.patch_size = vision_config.patch_size
processor.num_additional_image_tokens = getattr(vision_config, "num_additional_image_tokens", 0)
# processor.vision_feature_select_strategy = model.config.vision_config.vision_feature_select_strategy

  from .autonotebook import tqdm as notebook_tqdm


Loading checkpoint shards: 100%|██████████| 5/5 [00:34<00:00,  6.86s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [2]:
import torch
from moviepy import ImageSequenceClip
from IPython.display import HTML
from base64 import b64encode
import tempfile
import os

def tensors_to_video_jupyter(tensors: list[torch.Tensor], fps: int = 30):
    """Encode a list of frame tensors as an MP4 and return an inline HTML player.

    Args:
        tensors: Non-empty list of frames. Each tensor has its leading axis
            squeezed (when that axis has size 1) and is cast to ``uint8``
            without rescaling -- assumes values are already in 0-255 and the
            layout is (1, H, W, C); TODO confirm against the caller.
        fps: Frame rate of the generated clip.

    Returns:
        ``IPython.display.HTML`` containing a ``<video>`` element whose source
        is the base64-encoded MP4.

    Raises:
        ValueError: If ``tensors`` is empty.
    """
    if len(tensors) == 0:
        raise ValueError("tensors must contain at least one frame")

    # detach() lets frames that still carry autograd history be converted.
    frames = [t.detach().squeeze(0).cpu().numpy().astype('uint8') for t in tensors]

    clip = ImageSequenceClip(frames, fps=fps)
    temp_filename = None
    try:
        # Reserve a unique path, then release the handle before encoding so the
        # encoder is the only writer of the file.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
            temp_filename = tmpfile.name
        clip.write_videofile(temp_filename, codec="libx264", audio=False)

        with open(temp_filename, "rb") as f:
            video_data = f.read()
    finally:
        # Always release the clip and the temp file, even if encoding fails.
        clip.close()
        if temp_filename is not None and os.path.exists(temp_filename):
            os.remove(temp_filename)

    encoded = b64encode(video_data).decode('utf-8')
    video_html = f'''
    <video width="640" height="480" controls>
        <source src="data:video/mp4;base64,{encoded}" type="video/mp4">
        Your browser does not support the video tag.
    </video>
    '''
    return HTML(video_html)

In [3]:
import torch.nn as nn
import numpy as np
from torch.distributions.normal import Normal

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a layer in place and return it.

    Args:
        layer: Module exposing a ``weight`` tensor and, optionally, a ``bias``.
        std: Gain passed to ``torch.nn.init.orthogonal_`` for the weight.
        bias_const: Constant written into every bias element.

    Returns:
        The same ``layer`` object, so the call can be nested inside
        ``nn.Sequential(...)``.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers created with bias=False expose `bias is None`; skip them instead
    # of crashing inside constant_.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class NatureCNN(nn.Module):
    """Feature extractor for dict observations with an ``"rgb"`` entry and an
    optional ``"state"`` entry.

    The image goes through three conv layers and a linear head; the state, when
    present, goes through a single linear layer. The per-key features are
    concatenated along dim 1.

    Args:
        sample_obs: Example observation dict used only to size the layers.
            ``sample_obs["rgb"]`` is indexed as channels-last (N, H, W, C);
            ``sample_obs["state"]`` (optional) as (N, state_dim).
        feature_size: Width of each branch's output. Defaults to 256.

    Attributes:
        out_features: Total width of the concatenated feature vector.
        extractors: ``nn.ModuleDict`` holding one sub-network per observation key.
    """

    def __init__(self, sample_obs, feature_size=256):
        super().__init__()

        extractors = {}

        self.out_features = 0
        in_channels = sample_obs["rgb"].shape[-1]

        # A NatureCNN-style conv stack is used here, but any architecture is permissible.
        cnn = nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=32,
                kernel_size=8,
                stride=4,
                padding=0,
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=32, out_channels=64, kernel_size=4, stride=2, padding=0
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=0
            ),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Run the sample image through the conv stack once to discover the
        # flattened size; only this probe needs to be gradient-free.
        with torch.no_grad():
            n_flatten = cnn(sample_obs["rgb"].float().permute(0, 3, 1, 2).cpu()).shape[1]
        fc = nn.Sequential(nn.Linear(n_flatten, feature_size), nn.ReLU())
        # Keep the (cnn, fc) nesting: parameter names in saved checkpoints depend on it.
        extractors["rgb"] = nn.Sequential(cnn, fc)
        self.out_features += feature_size

        if "state" in sample_obs:
            # State vectors are embedded with a single linear layer.
            state_size = sample_obs["state"].shape[-1]
            extractors["state"] = nn.Linear(state_size, feature_size)
            self.out_features += feature_size

        self.extractors = nn.ModuleDict(extractors)

    def forward(self, observations) -> torch.Tensor:
        """Encode every registered observation key and concatenate the results.

        The ``"rgb"`` entry is cast to float, permuted from channels-last to
        channels-first and divided by 255 before entering its extractor; other
        entries are passed through unchanged.
        """
        encoded_tensor_list = []
        for key, extractor in self.extractors.items():
            obs = observations[key]
            if key == "rgb":
                obs = obs.float().permute(0, 3, 1, 2)
                obs = obs / 255
            encoded_tensor_list.append(extractor(obs))
        return torch.cat(encoded_tensor_list, dim=1)

class PPO_rgb(nn.Module):
    """Actor-critic for dict observations: a shared ``NatureCNN`` encoder feeds
    a value head and a Gaussian policy head whose log-std is a learned,
    state-independent parameter.
    """

    def __init__(self, envs, sample_obs):
        super().__init__()
        self.feature_net = NatureCNN(sample_obs=sample_obs)
        latent_size = self.feature_net.out_features
        # Action width: product of the single-env action space shape.
        action_dim = np.prod(envs.unwrapped.single_action_space.shape)

        self.critic = nn.Sequential(
            layer_init(nn.Linear(latent_size, 512)),
            nn.ReLU(inplace=True),
            layer_init(nn.Linear(512, 1)),
        )
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(latent_size, 512)),
            nn.ReLU(inplace=True),
            layer_init(nn.Linear(512, action_dim), std=0.01 * np.sqrt(2)),
        )
        self.actor_logstd = nn.Parameter(torch.ones(1, action_dim) * -0.5)

    def _action_distribution(self, action_mean):
        """Build the Normal policy around ``action_mean`` using the learned log-std."""
        log_std = self.actor_logstd.expand_as(action_mean)
        return Normal(action_mean, torch.exp(log_std))

    def get_features(self, x):
        """Return the encoder output for observation dict ``x``."""
        return self.feature_net(x)

    def get_value(self, x):
        """Return the critic's estimate for observation dict ``x``."""
        features = self.feature_net(x)
        return self.critic(features)

    def get_action(self, x, deterministic=False):
        """Return the policy mean when ``deterministic``, otherwise a sample."""
        features = self.feature_net(x)
        action_mean = self.actor_mean(features)
        if deterministic:
            return action_mean
        return self._action_distribution(action_mean).sample()

    def get_action_and_value(self, x, action=None):
        """Return ``(action, log_prob, entropy, value)``.

        A fresh action is sampled when ``action`` is None; log-probability and
        entropy are summed over dim 1.
        """
        features = self.feature_net(x)
        dist = self._action_distribution(self.actor_mean(features))
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action).sum(1)
        entropy = dist.entropy().sum(1)
        return action, log_prob, entropy, self.critic(features)
    

class PPO(nn.Module):
    """Actor-critic over flat observations: two separate Tanh MLPs (value and
    policy mean) plus a learned, state-independent log-std for a Gaussian policy.
    """

    def __init__(self, envs):
        super().__init__()
        # Layer widths come from the flattened single-env spaces.
        obs_dim = np.array(envs.single_observation_space.shape).prod()
        action_dim = np.prod(envs.single_action_space.shape)

        self.critic = nn.Sequential(
            layer_init(nn.Linear(obs_dim, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, 1)),
        )
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(obs_dim, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, action_dim), std=0.01 * np.sqrt(2)),
        )
        self.actor_logstd = nn.Parameter(torch.ones(1, action_dim) * -0.5)

    def _action_distribution(self, action_mean):
        """Build the Normal policy around ``action_mean`` using the learned log-std."""
        log_std = self.actor_logstd.expand_as(action_mean)
        return Normal(action_mean, torch.exp(log_std))

    def get_value(self, x):
        """Return the critic's estimate for observation ``x``."""
        return self.critic(x)

    def get_action(self, x, deterministic=False):
        """Return the policy mean when ``deterministic``, otherwise a sample."""
        action_mean = self.actor_mean(x)
        if deterministic:
            return action_mean
        return self._action_distribution(action_mean).sample()

    def get_action_and_value(self, x, action=None):
        """Return ``(action, log_prob, entropy, value)``.

        A fresh action is sampled when ``action`` is None; log-probability and
        entropy are summed over dim 1.
        """
        dist = self._action_distribution(self.actor_mean(x))
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action).sum(1)
        entropy = dist.entropy().sum(1)
        return action, log_prob, entropy, self.critic(x)

In [None]:
from gymnasium import spaces
import gymnasium as gym
from qwen_vl_utils import process_vision_info
import mani_skill
from mani_skill.utils.wrappers.flatten import FlattenActionSpaceWrapper, FlattenRGBDObservationWrapper
from mani_skill.vector.wrappers.gymnasium import ManiSkillVectorEnv
from torchvision.transforms.functional import to_pil_image

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
sensor_configs = {"width": 512, "height": 512}
task = 'LiftPegUpright'
partial_reset = True
use_vlm = True   # set False to skip the Qwen queries and only roll out the policy
steps = 40       # upper bound on rollout length; the loop stops early when the episode ends

# Checkpoint loaded into the PPO_rgb agent below.
CKPT_PATH = '/home/user10_3/ManiSkill/examples/baselines/ppo/runs/LiftPegUpright__ppo_rgb__1__1751971812/ckpt_601.pt'
# Other checkpoints that were tried in this cell (switch `task` and the agent class accordingly):
#   /home/user10_3/ManiSkill/examples/baselines/ppo/runs/StackCube-v1__ppo_rgb__1__1752076784/ckpt_151.pt
#   /home/user10_3/ManiSkill/examples/baselines/ppo/runs/PegInsertionSide-v1__ppo__1__1752050019/ckpt_1226.pt
#   /home/user10_3/ManiSkill/examples/baselines/ppo/runs/PegInsertionSide-v1__ppo_rgb__1__1752039752/ckpt_601.pt
#   /home/user10_3/ManiSkill/examples/baselines/ppo/runs/LiftPegUpright__ppo_rgb__1__1751971812/final_ckpt.pt
#   /home/user10/ManiSkill/examples/baselines/ppo/runs/LiftPegUpright-v1__ppo_rgb__1__1751570673/final_ckpt.pt
#   /home/user10/ManiSkill/examples/baselines/ppo/runs/StackCube-v1__ppo_rgb__1__1751570406/final_ckpt.pt
#   /home/user10_3/ManiSkill/examples/baselines/ppo/runs/PickCube-v1__ppo_rgb__1__1752068048/ckpt_401.pt

# Task-aware prompts sent to the VLM, one generation per prompt.
prompts_qwen = [f'In the video, the robotic arm executes a task ({task}), was it successful?',
           f'From the video of the robotic arm failing during the task ({task}), provide high-level corrective commands to guide it to recover and finish the task. (Your answer should be a detailed textual suggestion of about two to three sentences.)',
           f'This is a video of a robotic arm performing a task ({task}), an error occurred during execution. Please provide low-level corrective commands to help the robot recover and complete the task successfully. (Your answer should describe which direction (relative to the robot arm) and how much the robot arm should move to recover from the failure.)',
           f'This is a video of a robotic arm performing a task ({task}), please explain in detail the reason for the task failure. (Your answer should be a detailed textual description of about two to three sentences.)'
           ]
# Task-agnostic variants of the same prompts (not used by the loop below).
prompts = ['In the video, the robotic arm executes a task, was it successful?',
           'From the video of the robotic arm failing during the task, provide high-level corrective commands to guide it to recover and finish the task.',
           'This is a video of a robotic arm performing a task, an error occurred during execution. Please provide low-level corrective commands to help the robot recover and complete the task successfully.',
           'This is a video of a robotic arm performing a task, please explain in detail the reason for the task failure.'
           ]

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env = gym.make(
    f"{task}",
    obs_mode="rgb",
    num_envs=1,  # if num_envs > 1, GPU simulation backend is used.
    sensor_configs=sensor_configs,
    render_mode='rgb_array',
)
env = FlattenRGBDObservationWrapper(env, rgb=True, depth=False, state=True)
if isinstance(env.action_space, gym.spaces.Dict):
    env = FlattenActionSpaceWrapper(env)
env = ManiSkillVectorEnv(env, 1, ignore_terminations=not partial_reset, record_metrics=True)

obs, _ = env.reset(seed=42)

# ---------------------------------------------------------------------------
# Policy
# ---------------------------------------------------------------------------
agent = PPO_rgb(env, sample_obs=obs)
agent = agent.to(DEVICE)
# map_location keeps loading independent of the device the checkpoint was saved from.
agent.load_state_dict(torch.load(CKPT_PATH, map_location=DEVICE))
agent.eval()

# ---------------------------------------------------------------------------
# Rollout: record one rendered frame per step
# ---------------------------------------------------------------------------
obs_list = []
img = None
# Inference only: no_grad avoids building an autograd graph on every step.
with torch.no_grad():
    for i in range(steps):
        img = env.render()
        obs_list.append(img.clone())
        obs = {k: v.to(DEVICE) for k, v in obs.items()}
        action, _, _, _ = agent.get_action_and_value(obs)
        obs, reward, terminated, truncated, info = env.step(action)

        if truncated.any() or terminated.any():
            break

# ---------------------------------------------------------------------------
# Convert obs_list (tensors) -> PIL images for the VLM
# ---------------------------------------------------------------------------
obs_list_pil = []
for img in obs_list:
    img_np = img[0].cpu()  # first env's frame, channels-last (H, W, 3)
    if img_np.is_floating_point():
        # Float frames above 1.0 are assumed to be on a 0-255 scale.
        if img_np.max() > 1.0:
            img_np = img_np / 255.0
    else:
        # Integer frames go to PIL as uint8 directly; scaling to float and back
        # would truncate and can lose one intensity level per pixel.
        img_np = img_np.to(torch.uint8)
    img_pil = to_pil_image(img_np.permute(2, 0, 1))  # -> (3, H, W)
    obs_list_pil.append(img_pil)

# ---------------------------------------------------------------------------
# Query the VLM once per prompt with the recorded frames as a video
# ---------------------------------------------------------------------------
outputs = []
if use_vlm:
    for prompt in prompts_qwen:
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": obs_list_pil,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Preparation for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
            **video_kwargs,
        )
        inputs = inputs.to(DEVICE)

        # NOTE: the whole generated sequence is decoded, so each stored output
        # starts with the echoed system/user prompt followed by the answer.
        generated_ids = model.generate(**inputs, max_new_tokens=2048)
        output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        print(f'{output}\n')
        outputs.append(output)
print("Episode Info", info)

  logger.warn(


system
You are a helpful assistant.
user
In the video, the robotic arm executes a task (LiftPegUpright), was it successful?
assistant
The robotic arm in the video appears to be attempting the "LiftPegUpright" task, which typically involves lifting a peg or object and placing it upright. Based on the sequence of images provided:

1. The robotic arm approaches the blue and red block.
2. The arm makes contact with the block.
3. The arm lifts the block slightly off the surface.
4. The arm then attempts to rotate the block upright.

However, the final image shows the block still lying flat on the surface, indicating that the task was not completed successfully. The block has not been lifted or rotated upright as intended by the "LiftPegUpright" task.

system
You are a helpful assistant.
user
From the video of the robotic arm failing during the task (LiftPegUpright), provide high-level corrective commands to guide it to recover and finish the task. (Your answer should be a detailed textual s

In [25]:
# Show the stored response for the third prompt (index 2: the low-level
# corrective-command prompt). Requires the VLM loop to have filled `outputs`.
print(outputs[2])

system
You are a helpful assistant.
user
This is a video of a robotic arm performing a task (LiftPegUpright), an error occurred during execution. Please provide low-level corrective commands to help the robot recover and complete the task successfully. (Your answer should describe which direction (relative to the robot arm) and how much the robot arm should move to recover from the failure.)")
assistant
The robotic arm appears to have encountered an issue where it failed to lift the peg properly, possibly due to the peg not being securely grasped or the arm's position not being optimal for lifting. Here are some low-level corrective commands to help the robot recover and complete the task successfully:

1. **Adjust the Position of the Gripper:**
   - **Move the Gripper Closer to the Peg:** If the gripper is too far away, move it closer to the peg by slightly extending the arm in the direction of the peg.
   - **Rotate the Gripper:** Ensure that the gripper is oriented correctly with re

In [20]:
# Replay the recorded rollout frames inline at 5 frames per second.
tensors_to_video_jupyter(obs_list, fps=5)

                                                                      
[A                                                                   

[A[A                                                                


frame_index:  42%|████▎     | 17/40 [00:15<00:21,  1.08it/s, now=None]
[A

                                                                      
[A                                                                   

[A[A                                                                


frame_index:  42%|████▎     | 17/40 [00:15<00:21,  1.08it/s, now=None]
[A

[A[A

MoviePy - Building video /tmp/tmpmc05oymk.mp4.
MoviePy - Writing video /tmp/tmpmc05oymk.mp4






[A[A[A




                                                                      
[A                                                                   

[A[A                                                                


frame_index:  42%|████▎     | 17/40 [00:15<00:21,  1.07it/s, now=None]
[A

                                                                      
[A                                                                   

[A[A                                                                


frame_index:  42%|████▎     | 17/40 [00:15<00:21,  1.07it/s, now=None]
[A

[A[A

MoviePy - Done !
MoviePy - video ready /tmp/tmpmc05oymk.mp4
