In [1]:
from datasets import load_dataset


from r1_gamer.sandbox.games import Game2048, GameBreakout, GameMatch
from r1_gamer.sandbox.MetaSandbox import SandboxMananger, GameSandbox, ExperienceQueue

import datasets
from datasets import Dataset, Features, Value, Image


# Settings shared by every sandbox built in this notebook.
NUM_ENVS = 50
EPISODE_MAX_STEPS = 1000
SCREEN_SIZE = (640, 840)


def _make_game_sandbox(game_spec):
    """Build a ``GameSandbox`` for one game using the shared notebook settings.

    Args:
        game_spec: A game object exposing ``gym_env_func``, ``game_prompt``,
            ``num_actions``, ``parse_action`` and ``get_text_observation``.

    Returns:
        The ``GameSandbox`` constructed from those attributes.
    """
    return GameSandbox(
        make_env_func=game_spec.gym_env_func,
        game_prompt=game_spec.game_prompt,
        num_envs=NUM_ENVS,
        num_actions=game_spec.num_actions,
        parse_action_func=game_spec.parse_action,
        text_observation_func=game_spec.get_text_observation,
        # Keep this at 1: if step_per_action is not 1, the reward disappears.
        step_per_action=1,
        episode_max_steps=EPISODE_MAX_STEPS,
        screen_size=SCREEN_SIZE,
    )


game = Game2048()
match_game = GameMatch()

sandbox_2048_1 = _make_game_sandbox(game)
# NOTE: despite the "2048" in its name, this sandbox wraps the GameMatch game.
# The name is retained so that later cells referring to it keep working.
sandbox_2048_2 = _make_game_sandbox(match_game)

META_SANDBOX = SandboxMananger(sandbox_list=[sandbox_2048_1, sandbox_2048_2])

def make_huggingface_dataset_from_image_prompt_batch(images, prompts, perception=None):
    """Pack a batch of images and prompts into a Hugging Face ``Dataset``.

    The resulting dataset has four columns:
      * ``problem``    - each prompt with the literal "<image>" prepended.
      * ``answer``     - the constant string "0" for every row.
      * ``perception`` - each perception text wrapped in <perception> tags,
                         or an empty string per row when ``perception`` is None.
      * ``images``     - a one-element list holding the row's image.

    Args:
        images: Iterable of images, one per row.
        prompts: Iterable of prompt strings, one per row.
        perception: Optional iterable of perception strings, one per row.

    Returns:
        A ``Dataset`` built from the columns above with an explicit schema.
    """
    schema = Features({
        'problem': Value(dtype='string', id=None),
        'answer': Value(dtype='string', id=None),
        'perception': Value(dtype='string', id=None),
        'images': datasets.Sequence(Image()),
    })

    problem_column = ["<image>" + text for text in prompts]
    answer_column = ["0" for _ in prompts]

    if perception is not None:
        perception_column = [
            "<perception>\n" + text + "\n</perception>" for text in perception
        ]
    else:
        # No perception supplied: keep the column present but empty.
        perception_column = ["" for _ in prompts]

    # Each row stores its image inside a list to match the Sequence(Image()) feature.
    image_column = [[picture] for picture in images]

    columns = {
        'problem': problem_column,
        'answer': answer_column,
        'perception': perception_column,
        'images': image_column,
    }
    return Dataset.from_dict(columns, features=schema)

Prompt Template: You are now playing the 2048 game. 2048 is a sliding tile puzzle game where you combine numbered tiles to create a tile with the value 2048.

Rule:
Only Tiles with the SAME number merge when they collide. After each move, a new tile (2 or 4) appears randomly on the board. The game ends when there are no more valid moves.

Available actions:
- (0): Up (slide all tiles upward)
- (1): Right (slide all tiles to the right)
- (2): Down (slide all tiles downward)
- (3): Left (slide all tiles to the left)
What action should you take to achieve the highest score and reach the 2048 tile?

First describe the board in <perception></perception>. Then output your thinking process (analyze the outcome of each action and choose the best one) in <think></think> and final action number in <answer></answer>.
Number of Envs: 50
Prompt Template: You are playing a 'Shisen-sho' puzzle game.
The goal is to match pairs of identical tiles by connecting them with a path th

In [2]:
# Text observations from the sandbox manager; passed later as the `perception`
# argument when the dataset is built.
observation = META_SANDBOX.get_text_observation_batch()

In [3]:
# Show the last observation; print() renders its embedded newlines as a grid.
print(observation[-1])

 Green triangle | Blue triangle | Blue square | Red triangle | Red square | Blue circle 
 Green square | Green triangle | Blue circle | Green circle | Blue triangle | Green circle 
 Green circle | Red triangle | Green square | Red triangle | Green square | Green triangle 
 Blue square | Green triangle | Blue triangle | Red circle | Green circle | Red square 
 Blue triangle | Green square | Blue circle | Red circle | Blue square | Blue circle 
 Red triangle | Red square | Red square | Red circle | Blue square | Red circle 


In [4]:
# Fetch the images and prompts that the dataset builder consumes below.
images, prompts = META_SANDBOX.get_image_prompt_batch()

In [5]:
# Build the train and test splits.
# NOTE(review): both splits are built from the same images/prompts/observation
# batch, so their contents are identical — confirm this is intended.
train_dataset = make_huggingface_dataset_from_image_prompt_batch(images, prompts,observation)
test_dataset = make_huggingface_dataset_from_image_prompt_batch(images, prompts,observation)

In [6]:
# Inspect the last row of the training split.
train_dataset[-1]

{'problem': "<image>You are playing a 'Shisen-sho' puzzle game.\nThe goal is to match pairs of identical tiles by connecting them with a path that has at most 2 turns and doesn't cross any other tiles.\nThe tiles are distinguished by their color and shape:\n- Red, Green, Blue, Yellow, Magenta, Cyan, etc.\n- Shapes include: circle, square, triangle, diamond, cross, star, etc.\nPlease analyze the game board and identify two matching tiles that can be connected according to these rules.\nReturn your answer as follows:\n1. First coordinate: (row1, col1)\n2. Second coordinate: (row2, col2)\nWhere row and col are 0-indexed numbers such as (0, 1), starting from the top-left of the board.\nFirst describe the board in <perception></perception>. Then output your thinking process in <think></think> and final action in <answer>(row1, col1) (row2, col2)</answer>.",
 'answer': '0',
 'perception': '<perception>\n Green triangle | Blue triangle | Blue square | Red triangle | Red square | Blue circle \

In [4]:
# Bundle both splits into a single DatasetDict keyed by split name.
dataset = datasets.DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

In [30]:
import os

# Upload both splits to the Hub repository.
# SECURITY: never hardcode an access token in a notebook. The token that was
# previously committed here must be treated as leaked and revoked.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# None is passed and the library falls back to the locally saved login token.
dataset.push_to_hub("leonardPKU/test_2048", token=os.environ.get("HF_TOKEN"))

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/leonardPKU/test_2048/commit/f757afcf2583ba0b9d9a635ee6cd322459c3d5c4', commit_message='Upload dataset', commit_description='', oid='f757afcf2583ba0b9d9a635ee6cd322459c3d5c4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/leonardPKU/test_2048', endpoint='https://huggingface.co', repo_type='dataset', repo_id='leonardPKU/test_2048'), pr_revision=None, pr_num=None)

In [7]:
from verl.utils.dataset import RLHFDataset, collate_fn

In [8]:
from transformers import AutoTokenizer,AutoProcessor

# Tokenizer and processor are loaded from the same checkpoint.
_QWEN_VL_CHECKPOINT = "Qwen/Qwen2.5-VL-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(_QWEN_VL_CHECKPOINT)
processor = AutoProcessor.from_pretrained(_QWEN_VL_CHECKPOINT)

# Wrap the in-memory training split; the *_key arguments name the columns
# created by make_huggingface_dataset_from_image_prompt_batch.
a = RLHFDataset(
    data_path=train_dataset,
    tokenizer=tokenizer,
    processor=processor,
    prompt_key="problem",
    answer_key="answer",
    image_key="images",
    perception_key="perception",
    system_prompt="You are a helpful assistant.",
    max_prompt_length=1024,
    truncation="error",
    min_pixels=262144,
    max_pixels=4194304,
    # Keyword kept exactly as spelled in the original call; presumably it
    # matches the RLHFDataset signature — TODO confirm.
    use_groundtrurh_perception=True,
)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
# Inspect the first processed sample.
a[0]

{'problem': '<image>You are now playing the 2048 game. 2048 is a sliding tile puzzle game where you combine numbered tiles to create a tile with the value 2048.\n\nRule:\nOnly Tiles with the SAME number merge when they collide. After each move, a new tile (2 or 4) appears randomly on the board. The game ends when there are no more valid moves.\n\nAvailable actions:\n- (0): Up (slide all tiles upward)\n- (1): Right (slide all tiles to the right)\n- (2): Down (slide all tiles downward)\n- (3): Left (slide all tiles to the left)\nWhat action should you take to achieve the highest score and reach the 2048 tile?\n\nFirst describe the board in <perception></perception>. Then output your thinking process (analyze the outcome of each action and choose the best one) in <think></think> and final action number in <answer></answer>.',
 'perception': '<perception>\n - | - | - | - \n - | 2 | - | - \n - | - | - | - \n - | - | 2 | - \n</perception>',
 'multi_modal_data': {'image': [<PIL.PngImagePlugin

In [11]:
# Decode the first sample's token ids back to text (special tokens skipped)
# to check the prompt that was actually encoded.
tokenizer.decode(a[0]["input_ids"],skip_special_tokens=True)

'system\nYou are a helpful assistant.\nuser\nYou are now playing the 2048 game. 2048 is a sliding tile puzzle game where you combine numbered tiles to create a tile with the value 2048.\n\nRule:\nOnly Tiles with the SAME number merge when they collide. After each move, a new tile (2 or 4) appears randomly on the board. The game ends when there are no more valid moves.\n\nAvailable actions:\n- (0): Up (slide all tiles upward)\n- (1): Right (slide all tiles to the right)\n- (2): Down (slide all tiles downward)\n- (3): Left (slide all tiles to the left)\nWhat action should you take to achieve the highest score and reach the 2048 tile?\n\nFirst describe the board in <perception></perception>. Then output your thinking process (analyze the outcome of each action and choose the best one) in <think></think> and final action number in <answer></answer>.\nassistant\n<perception>\n - | - | - | - \n - | 2 | - | - \n - | - | - | - \n - | - | 2 | - \n</perception>'

In [8]:
 # Collate the first ten samples into one batch directly, without a dataloader.
env_batch_dict = collate_fn([a[sample_idx] for sample_idx in range(10)])


In [9]:
# Display the collated batch.
env_batch_dict

{'input_ids': tensor([[151643, 151643, 151643,  ..., 151644,  77091,    198],
         [151643, 151643, 151643,  ..., 151644,  77091,    198],
         [151643, 151643, 151643,  ..., 151644,  77091,    198],
         ...,
         [151643, 151643, 151643,  ..., 151644,  77091,    198],
         [151643, 151643, 151643,  ..., 151644,  77091,    198],
         [151643, 151643, 151643,  ..., 151644,  77091,    198]]),
 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]),
 'position_ids': tensor([[[  0,   0,   0,  ..., 220, 221, 222],
          [  0,   0,   0,  ..., 220, 221, 222],
          [  0,   0,   0,  ..., 220, 221, 222]],
 
         [[  0,   0,   0,  ..., 220, 221, 222],
          [  0,   0,   0,  ..., 220, 221, 222],
          [  0,   0,   0,  ..., 220, 221, 222]],
 
         [[  0,   0,   0,  

In [23]:
# NOTE(review): the output recorded for this cell shows a geometry question,
# not the game prompts built above, so `a` was presumably bound to a different
# dataset when this cell last ran — re-run on a fresh kernel to confirm.
a[0]

{'problem': '<image>In the given diagram, if angle 1 has a measure of 35.0 degrees, what is the measure of angle 2?',
 'multi_modal_data': {'image': [<PIL.Image.Image image mode=RGB size=878x298>]},
 'multi_modal_inputs': {'pixel_values': tensor([[1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
          [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
          [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
          ...,
          [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
          [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
          [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459]]),
  'image_grid_thw': tensor([[ 1, 22, 62]])},
 'input_ids': tensor([151643, 151643, 151643,  ..., 151644,  77091,    198]),
 'attention_mask': tensor([0, 0, 0,  ..., 1, 1, 1]),
 'position_ids': tensor([[ 0,  0,  0,  ..., 78, 79, 80],
         [ 0,  0,  0,  ..., 78, 79, 80],
         [ 0,  0,  0,  ..., 78, 79, 80]]),
 'raw_prompt_ids': [151644,
  894