In [3]:
from datasets import Dataset, DatasetDict, load_dataset
from typing import Dict, Any
from vllm import LLM
from vllm.sampling_params import SamplingParams

In [8]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
dataset_name = "lordspline/arc-agi"
dataset = load_dataset("lordspline/arc-agi")

print(dataset['training'][0])

{'train': [{'input': [[0, 0, 5], [0, 5, 0], [5, 0, 0]], 'output': [[3, 3, 3], [4, 4, 4], [2, 2, 2]]}, {'input': [[0, 0, 5], [0, 0, 5], [0, 0, 5]], 'output': [[3, 3, 3], [3, 3, 3], [3, 3, 3]]}, {'input': [[5, 0, 0], [0, 5, 0], [5, 0, 0]], 'output': [[2, 2, 2], [4, 4, 4], [2, 2, 2]]}, {'input': [[0, 5, 0], [0, 0, 5], [0, 5, 0]], 'output': [[4, 4, 4], [3, 3, 3], [4, 4, 4]]}], 'test': [{'input': [[0, 0, 5], [5, 0, 0], [0, 5, 0]], 'output': [[3, 3, 3], [2, 2, 2], [4, 4, 4]]}]}


In [5]:
def format_grid(grid_input):
    # If input is already a list, use it directly
    if isinstance(grid_input, list):
        grid = grid_input
    else:
        # Convert string representation to list if needed
        import ast
        grid = ast.literal_eval(grid_input)
    
    # Format each row with spaces between numbers
    formatted_rows = []
    for row in grid:
        formatted_row = ' '.join(str(num) for num in row)
        formatted_rows.append(formatted_row)
    
    # Join rows with newlines
    return '\n'.join(formatted_rows)


def format_prompt(example):
    # Start with empty prompt
    prompt = "Find the common rule that maps an input grid to an output grid given the examples below.\n\n"
    
    # Add each training example as context
    for idx, train_example in enumerate(example['train']):
        prompt += f"Example {idx + 1}:\n\n"
        prompt += f"Input:\n{format_grid(train_example['input'])}\n"  # Assuming input is already in list format
        prompt += f"Output:\n{format_grid(train_example['output'])}\n\n"
    
    # Add the test question
    prompt += "\nBelow is a test input grid. Predict the corresponding output grid by applying the rule you found. Your final answer should just be the text output grid itself.\n\n"
    prompt += f"Input:\n{format_grid(example['test'][0]['input'])}\n"
    prompt += "Output:"
    
    return prompt

# Example usage:
# Assuming dataset is loaded
sample = dataset['training'][0]  # Get first example
formatted_prompt = format_prompt(sample)
print(formatted_prompt)

Find the common rule that maps an input grid to an output grid given the examples below.

Example 1:

Input:
0 0 5
0 5 0
5 0 0
Output:
3 3 3
4 4 4
2 2 2

Example 2:

Input:
0 0 5
0 0 5
0 0 5
Output:
3 3 3
3 3 3
3 3 3

Example 3:

Input:
5 0 0
0 5 0
5 0 0
Output:
2 2 2
4 4 4
2 2 2

Example 4:

Input:
0 5 0
0 0 5
0 5 0
Output:
4 4 4
3 3 3
4 4 4


Below is a test input grid. Predict the corresponding output grid by applying the rule you found. Your final answer should just be the text output grid itself.

Input:
0 0 5
5 0 0
0 5 0
Output:


# Self adapting dataset to clean up arg-agi

In [6]:
class SelfAdaptingDataset:
    def __init__(self, dataset):
        self.original_dataset = dataset
        self.formatted_dataset = self._transform_dataset()
    
    def _transform_single_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Transform a single example from train/test format to prompt/completion format."""
        prompt = f'<|user|>{format_prompt(example)}<|user|><|assistant|><think>\n'
        completion = format_grid(example['test'][0]['output'])
        
        return {
            'prompt': prompt,
            'completion': completion,
            'learned': '',  # Empty placeholder for learned field
            'mistake': '',  # Empty placeholder for mistake field
            'best_completion': '',  # Empty placeholder for best completion field
            'original_data': example  # Keep original data for reference if needed
        }
    
    def _transform_dataset(self) -> DatasetDict:
        """Transform the entire dataset."""
        
        def transform_split(split_dataset):
            transformed_data = {
                'prompt': [],
                'completion': [],
                'learned': [],
                'mistake': [],
                'best_completion': [],
                'original_data': []
            }
            
            for example in split_dataset:
                transformed = self._transform_single_example(example)
                for key in transformed_data:
                    transformed_data[key].append(transformed[key])
            
            return Dataset.from_dict(transformed_data)
        
        # Transform each split
        transformed_dataset = DatasetDict({
            'training': transform_split(self.original_dataset['training']),
            'evaluation': transform_split(self.original_dataset['evaluation'])
        })
        
        return transformed_dataset
    
    def get_dataset(self) -> DatasetDict:
        """Get the transformed dataset."""
        return self.formatted_dataset
    
    def update_example(self, index: int, split: str, 
                      learned: str = None, 
                      mistake: str = None, 
                      best_completion: str = None):
        """
        Update the learned/mistake/best_completion fields for a specific example.
        
        Args:
            index: Index of the example to update
            split: 'training' or 'evaluation'
            learned: What the model learned from this example
            mistake: What mistake was made
            best_completion: The best completion found so far
        """
        if learned is not None:
            self.formatted_dataset[split] = self.formatted_dataset[split].map(
                lambda x, i: {'learned': learned} if i == index else {'learned': x['learned']},
                with_indices=True
            )
        
        if mistake is not None:
            self.formatted_dataset[split] = self.formatted_dataset[split].map(
                lambda x, i: {'mistake': mistake} if i == index else {'mistake': x['mistake']},
                with_indices=True
            )
        
        if best_completion is not None:
            self.formatted_dataset[split] = self.formatted_dataset[split].map(
                lambda x, i: {'best_completion': best_completion} if i == index else {'best_completion': x['best_completion']},
                with_indices=True
            )

In [7]:
# Example usage:
dataset = load_dataset("lordspline/arc-agi")
adapted_dataset = SelfAdaptingDataset(dataset)
transformed_dataset = adapted_dataset.get_dataset()

# Print first example
print(transformed_dataset['training'][0])

{'prompt': 'Find the common rule that maps an input grid to an output grid given the examples below.\n\nExample 1:\n\nInput:\n0 0 5\n0 5 0\n5 0 0\nOutput:\n3 3 3\n4 4 4\n2 2 2\n\nExample 2:\n\nInput:\n0 0 5\n0 0 5\n0 0 5\nOutput:\n3 3 3\n3 3 3\n3 3 3\n\nExample 3:\n\nInput:\n5 0 0\n0 5 0\n5 0 0\nOutput:\n2 2 2\n4 4 4\n2 2 2\n\nExample 4:\n\nInput:\n0 5 0\n0 0 5\n0 5 0\nOutput:\n4 4 4\n3 3 3\n4 4 4\n\n\nBelow is a test input grid. Predict the corresponding output grid by applying the rule you found. Your final answer should just be the text output grid itself.\n\nInput:\n0 0 5\n5 0 0\n0 5 0\nOutput:', 'completion': '3 3 3\n2 2 2\n4 4 4', 'learned': '', 'mistake': '', 'best_completion': '', 'original_data': {'test': [{'input': [[0, 0, 5], [5, 0, 0], [0, 5, 0]], 'output': [[3, 3, 3], [2, 2, 2], [4, 4, 4]]}], 'train': [{'input': [[0, 0, 5], [0, 5, 0], [5, 0, 0]], 'output': [[3, 3, 3], [4, 4, 4], [2, 2, 2]]}, {'input': [[0, 0, 5], [0, 0, 5], [0, 0, 5]], 'output': [[3, 3, 3], [3, 3, 3], [3, 

In [20]:
def parse_grid(text: str) -> list[list[int]]:
    """
    Parses a 2D grid of integers from a given text string.
    Any line that can be split purely into integers is assumed to be part of the grid.
    Lines that contain text or cannot be parsed as integers are ignored.
    
    :param text: A string containing arbitrary text and lines with integers.
    :return: A list of lists of integers representing the parsed grid.
    """
    main_grid = []
    grid = []
    
    for line in text.splitlines():
        line = line.strip()
        
        # Skip empty lines
        if not line:
            continue
        
        # Attempt to parse the entire line as a row of integers
        tokens = line.split()
        try:
            print("Trying: ", tokens)
            row = [int(token) for token in tokens]
            grid.append(row)
            print("Added row!", row)
            print("GRID IS NOW: ", grid)
        except ValueError:
            if grid != []:
                main_grid.append(grid)
                grid = []
            continue

    if grid != []:
        main_grid.append(grid)
    
    return main_grid

In [21]:
string = f'''


Now, let's apply the identified rule to predict the output grid based on the input. The rule involves scaling certain rows based on their first element's value.

1. Check each row in the input to see if the first element is a number greater than 1.
2. If the first element is 5, scale the row by multiplying by 5.
3. Repeat this for rows where the first element is 3, scaling them by 3.
4. For rows starting with 4, scale them by 4.

Here's the step-by-step process:

Starting with the input grid:

1 0 0 0 0 0 0 0 0 2
1 0 0 0 0 3 0 0 3 4
...
5 0 0 0 0 4
...
7 7 7 7 7 7 7 7 7 7

Modify rows where the first element is 5, 3, or 4 by scaling:

- First row remains the same: 1 0 0 0 0 0 0 0 0 2
- Second row has a 1 (not scaled), so remains as originally
- Third row has a 1, so remains as originally
- Continue this process across all rows.

The modified output grid will mirror this rule, with rows scaled by 5, 3, or 4 based on their first elements.

Here is the predicted output grid:

1 0 0 0 0 0 0 0 0 2
1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 2
1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 2
'''

grid = [[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 8, 8, 8, 8, 0, 0, 8, 0, 0, 0], [0, 0, 5, 8, 0, 8, 0, 0, 8, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 8, 8, 8, 8, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 5, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0], [0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 8, 8, 8, 8, 0, 0, 8, 0, 0, 0], [0, 0, 5, 8, 0, 8, 0, 0, 8, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 8, 8, 8, 8, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 5, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0], [0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 8, 8, 8, 8, 0, 0, 8, 0, 0, 0], [0, 0, 5, 8, 0, 8, 0, 0, 8, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 8, 8, 8, 8, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 5, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0], [0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 8, 8, 8, 8, 0, 0, 8, 0, 0, 0], [0, 0, 5, 8, 0, 8, 0, 0, 8, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 8, 8, 8, 8, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 5, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], [0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0], [0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]]

print(len(grid))

print(parse_grid(string))

1
Trying:  ['Now,', "let's", 'apply', 'the', 'identified', 'rule', 'to', 'predict', 'the', 'output', 'grid', 'based', 'on', 'the', 'input.', 'The', 'rule', 'involves', 'scaling', 'certain', 'rows', 'based', 'on', 'their', 'first', "element's", 'value.']
Trying:  ['1.', 'Check', 'each', 'row', 'in', 'the', 'input', 'to', 'see', 'if', 'the', 'first', 'element', 'is', 'a', 'number', 'greater', 'than', '1.']
Trying:  ['2.', 'If', 'the', 'first', 'element', 'is', '5,', 'scale', 'the', 'row', 'by', 'multiplying', 'by', '5.']
Trying:  ['3.', 'Repeat', 'this', 'for', 'rows', 'where', 'the', 'first', 'element', 'is', '3,', 'scaling', 'them', 'by', '3.']
Trying:  ['4.', 'For', 'rows', 'starting', 'with', '4,', 'scale', 'them', 'by', '4.']
Trying:  ["Here's", 'the', 'step-by-step', 'process:']
Trying:  ['Starting', 'with', 'the', 'input', 'grid:']
Trying:  ['1', '0', '0', '0', '0', '0', '0', '0', '0', '2']
Added row! [1, 0, 0, 0, 0, 0, 0, 0, 0, 2]
GRID IS NOW:  [[1, 0, 0, 0, 0, 0, 0, 0, 0, 2]]
Tr

In [6]:
import re
import numpy as np
from typing import Optional, Union, List
from vllm.sampling_params import SamplingParams
import json
from datetime import datetime
from pathlib import Path
from filelock import FileLock



class ThinkingCompletionRewardFunction:
    def __init__(self, log_dir: str = "logs"):
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(exist_ok=True)
        self.log_file = self.log_dir / f"rewards_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
        self.file_lock = FileLock(str(self.log_file) + ".lock")
        

    def log_entry(self, prompt: str, thoughts: str, answer: str, expected: str, reward: float):
        """
        Thread-safe logging of reward computation entries to a JSONL file.
        
        Args:
            prompt: The input prompt
            thoughts: The model's thinking process
            answer: The model's answer
            expected: The expected completion
            reward: The computed reward
        """
        entry = {
            "timestamp": datetime.now().isoformat(),
            "prompt": prompt,
            "thoughts": thoughts,
            "answer": answer,
            "expected": expected,
            "reward": reward
        }

        # Use file lock to ensure thread-safe writing
        with self.file_lock:
            with open(self.log_file, "a") as f:
                json.dump(entry, f)
                f.write("\n")

    def parse_grid(self, text: str) -> list[list[int]]:
        """
        Parses a 2D grid of integers from a given text string.
        Any line that can be split purely into integers is assumed to be part of the grid.
        Lines that contain text or cannot be parsed as integers are ignored.
        
        :param text: A string containing arbitrary text and lines with integers.
        :return: A list of lists of integers representing the parsed grid.
        """
        main_grid = []
        grid = []
        
        for line in text.splitlines():
            line = line.strip()
            
            # Skip empty lines
            if not line:
                continue
            
            # Attempt to parse the entire line as a row of integers
            tokens = line.split()
            try:
                row = [int(token) for token in tokens]
                grid.append(row)
            except ValueError:
                if grid != []:
                    main_grid.append(grid)
                    grid = []
                continue
        
        return main_grid

    def _compute_reward(self, generated: str, expected: str) -> float:
        """
        Compute reward based on the following criteria:
        - Multiple grids detected: -0.25 penalty
        - Dimension match: +0.5 points
        - Full match: 2.0 points total
        - Partial match: Scaled between 0 and 2.0 based on correctness
        """
        # Extract arrays from both strings
        all_grids = self.parse_grid(generated)
        exp_array = expected
        
        # If no grids found or expected array is None, return 0
        if not all_grids or exp_array is None:
            return -0.25
        
        # Get the last grid and apply penalty for multiple grids
        gen_array = all_grids[-1]
        reward = 0.0 if len(all_grids) == 1 else -0.1
        
        try:
            gen_np = np.array(gen_array)
            exp_np = np.array(exp_array)
            
            # Check dimensions
            if gen_np.shape == exp_np.shape:
                reward += 0.5
                
                # Calculate element-wise match percentage
                total_elements = np.prod(gen_np.shape)
                matching_elements = np.sum(gen_np == exp_np)
                match_percentage = matching_elements / total_elements
                
                if match_percentage == 1.0:
                    # Perfect match (including the 0.5 from dimension match)
                    reward = 2.0
                else:
                    # Scale remaining 1.5 points based on match percentage
                    reward += 1.5 * match_percentage
                    
        except:
            return reward  # Return whatever reward we've accumulated so far
        
        return reward

    def calculate_reward(self, prompts: List[str], completions: List[str], **kwargs) -> List[float]:
        # Generate final completions using the thinking
        answers = kwargs.get("answers", [])
        expected_completions = kwargs.get("expected_completions", [])
        
        # Compute rewards
        rewards = []
        for prompt, thoughts, answer, expected in zip(prompts, completions, answers, expected_completions):
            reward = self._compute_reward(answer, expected)
            rewards.append(reward)

            # Log the entry
            # self.log_entry(
            #     prompt=prompt,
            #     thoughts=thoughts,
            #     answer=answer,
            #     expected=expected,
            #     reward=reward
            # )
        return rewards

INFO 02-17 20:19:09 __init__.py:190] Automatically detected platform cuda.


In [7]:
print(ThinkingCompletionRewardFunction()._compute_reward(string, grid[0][0]))

0.0
