In [1]:
import time, sys, os
import concurrent.futures
import numpy as np
import re
from datetime import datetime
from typing import Optional, List, Dict, Any, Tuple
from openai import OpenAI


class OnlineLM:
    """Online language model using API services.

    Requests go to DeepInfra's OpenAI-compatible endpoint through the
    ``openai`` client. The API key is read from the ``DEEPINFRA_TOKEN``
    environment variable.
    """

    def __init__(self, model_name: str, **kwargs):
        """Store the model identifier and sampling temperature, then build the client.

        Args:
            model_name: Model identifier sent with every request.
            **kwargs: Only ``temperature`` is read (default 0.7).

        Raises:
            ValueError: If ``DEEPINFRA_TOKEN`` is not set.
        """
        self.model = model_name  # Store model name as the model identifier
        self.temperature = kwargs.get("temperature", 0.7)  # Default temperature
        self._initialize()

    def _initialize(self):
        """Initialize the OpenAI client."""
        if "DEEPINFRA_TOKEN" not in os.environ:
            raise ValueError("DEEPINFRA_TOKEN environment variable is not set")

        self.openai = OpenAI(
            api_key=os.environ["DEEPINFRA_TOKEN"],
            base_url="https://api.deepinfra.com/v1/openai",
        )

    def _fetch_response(self, message_data):
        """Fetch a response from the API.

        Args:
            message_data: ``(messages, max_tokens)`` tuple for one request.

        Returns:
            The content of the first choice, or the string ``"Error: <reason>"``
            when the request fails. Failures are reported in-band on purpose so
            that one bad request does not abort a whole batch.
        """
        messages, max_tokens = message_data

        try:
            chat_completion = self.openai.chat.completions.create(
                model=self.model,
                messages=messages,
                max_tokens=max_tokens,
                # Negative temperatures are clamped to 0
                temperature=self.temperature if self.temperature > 0 else 0,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            # Always return a plain string: callers concatenate the result onto
            # the prompt and store it in a text column, so a tuple would break them.
            return f"Error: {str(e)}"

    def generate(self,
                input_messages: List[Dict[str, Any]],
                max_new_tokens: int = 100,
                repeat_input: bool = False) -> Tuple[List[Any], Any]:
        """Generate text using the API.

        Args:
            input_messages: One chat conversation (list of message dicts) per request.
            max_new_tokens: Token limit passed to every request.
            repeat_input: When True, each result is a copy of the conversation
                with the generation appended to the content of its last message;
                otherwise each result is the generated text alone.

        Returns:
            ``(results, metadata)`` where ``results`` follows the order of
            ``input_messages`` and ``metadata`` is ``{"model": ..., "online": True}``.
        """
        # Prepare batch of requests
        request_data = [(messages, max_new_tokens) for messages in input_messages]

        # Process in parallel; executor.map preserves the input order
        with concurrent.futures.ThreadPoolExecutor() as executor:
            api_results = list(executor.map(self._fetch_response, request_data))

        results = []
        for conversation, content in zip(input_messages, api_results):
            if repeat_input:
                # Copy the list and its last message so the caller's prompt is not mutated
                new_message = conversation.copy()
                new_message[-1] = new_message[-1].copy()
                # The API may return no content; treat that as an empty generation
                new_message[-1]['content'] += content or ""
                results.append(new_message)
            else:
                results.append(content)

        # For API compatibility, return both results and a metadata object
        metadata = {"model": self.model, "online": True}
        time.sleep(0.02)
        return results, metadata

In [21]:
# Example usage
# model = OnlineLM("deepseek-ai/DeepSeek-R1")
model = OnlineLM("deepseek-ai/DeepSeek-R1-Distill-Llama-70B")
from datasets import load_dataset

# First 1000 training rows of a pinned dataset revision
data = load_dataset(
    "MelinaLaimon/stream-of-search",
    revision="7fb82650ab67458027afff115964bcd04d3d2075",
    split='train').select(range(0, 1000))


def _first_user_message(messages):
    """Return the first message whose role is 'user'.

    Falls back to the first message when no user message exists, which is what
    the previous expression `messages["role"=="user"]` always returned: the
    comparison of two different string literals is False, i.e. index 0.
    Assumes `messages` is a list of {'role', 'content'} dicts — TODO confirm.
    """
    return next((m for m in messages if m["role"] == "user"), messages[0])


# The prompt sent to the model is the user turn only
data = data.map(lambda x: { # type: ignore
    'test_prompt': [
        # {'role': 'system', 'content': SYSTEM_PROMPT},
        _first_user_message(x["messages_sos"])
    ],
})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [23]:
# Display the dataset summary (feature names and row count)
data

Dataset({
    features: ['nums', 'target', 'solution', 'rating', 'search_type', 'heuristic', 'messages_optimal', 'messages_sos_react', 'messages_sos', 'messages_deepseek', 'test_prompt'],
    num_rows: 1000
})

In [None]:
# sys.path entries are used verbatim and "~" is not expanded by the import
# system, so expand it explicitly or the `src` package below is not found.
sys.path.append(os.path.expanduser("~/projects/sos/stream-of-search")) # TODO You may need to change this
from src.result_parsers.countdown_trajectories import evaluate_countdown_trajectory

# One completion per prompt, in prompt order
results, metadata = model.generate(data['test_prompt'], max_new_tokens=4096)

# Store the generations in data['completion'], then score every row
data = data.add_column("completion", results)
data = data.add_column("parsed_results", list(map(evaluate_countdown_trajectory, data)))
# NOTE: `results` is rebound here from the list of completions to the solved fraction
results = sum([d['solved'] for d in data['parsed_results']]) / len(data['parsed_results'])


In [66]:
# Read each column once instead of re-reading it from the dataset on every
# iteration; the printed output is unchanged.
completions = data['completion']
parsed_results = data['parsed_results']
for i in range(10):
    print(f"Q {i}\n\n")
    print(completions[i])
    print(parsed_results[i])

Q 0


<think>
Okay, so I have this problem where I need to combine the numbers 53, 57, 13, and 48 using only addition, subtraction, multiplication, or division. The goal is to get 64, and I have to use each number exactly once. Hmm, let me think about how to approach this.

First, I'll list out the numbers: 53, 57, 13, 48. I need to use each one once, so I can't repeat any. I'm a bit overwhelmed because there are so many possible combinations. Maybe I should start by looking for pairs that can be combined to get something useful.

Let me consider the largest numbers first. 57 is the biggest, followed by 53. If I subtract 53 from 57, that gives me 4. That's a small number, but maybe useful later. Alternatively, if I subtract 57 from 53, I get -4, which could also be useful, but I'm not sure yet.

Looking at the smaller numbers, 13 and 48. If I subtract 13 from 48, I get 35. Adding them gives 61, which is close to 64. Wait, 61 plus 3 would be 64, but I don't have a 3 yet. Alternatively, 

In [67]:
# Fraction of rows whose parsed result is marked as solved
sum([d['solved'] for d in data['parsed_results']]) / len(data['parsed_results'])

0.6

In [68]:
# Display the per-row evaluation dicts
data['parsed_results']

[{'initial_numbers': [53, 57, 13, 48],
  'remarks': 'Could not find SOLUTION declaration.',
  'solved': False,
  'target': 64},
 {'initial_numbers': [9, 62, 67, 5],
  'remarks': "Trajectory is valid with operations: ['67-62=5', '9-5=4', '5*4=20']",
  'solved': True,
  'target': 20},
 {'initial_numbers': [87, 80, 47, 2],
  'remarks': "Trajectory is valid with operations: ['47*2=94', '94-80=14', '87-14=73']",
  'solved': True,
  'target': 73},
 {'initial_numbers': [10, 30, 42, 5],
  'remarks': "Trajectory is valid with operations: ['30+5=35', '42-10=32', '35+32=67']",
  'solved': True,
  'target': 67},
 {'initial_numbers': [66, 71, 74, 49],
  'remarks': 'Could not find SOLUTION declaration.',
  'solved': False,
  'target': 77},
 {'initial_numbers': [68, 72, 26, 44],
  'remarks': "Trajectory is valid with operations: ['72-68=4', '44+26=70', '70+4=74']",
  'solved': True,
  'target': 74},
 {'initial_numbers': [58, 82, 92, 13],
  'remarks': "Trajectory is valid with operations: ['92-58=34',

In [16]:
import glob

# Strip the "stream-of-search_" prefix from result file names, e.g.
# ../results/qwen-x/stream-of-search_run.json -> ../results/qwen-x/run.json
prefix = "stream-of-search_"
for file in glob.glob("../results/qwen*/stream-of-search_*.json"):
    directory, filename = os.path.split(file)
    # Edit the file name only. Replacing on the full path would also rewrite a
    # directory name containing the prefix and any later occurrence in the name.
    # The glob pattern guarantees that `filename` starts with the prefix.
    os.rename(file, os.path.join(directory, filename[len(prefix):]))


In [17]:
from tqdm import tqdm
# from datasets import load_dataset
import re
import json, sys, os, glob
import numpy as np
from typing import List, Tuple
import math
import matplotlib.pyplot as plt
import re
import re
import ast
import tiktoken

def validate_operations(initial_numbers, operations, target):
    """
    Validates if the operations use all initial numbers exactly once and reach the target.

    Args:
        initial_numbers: List of ints given by the puzzle (not mutated).
        operations: List of strings such as '9-5=4', each holding exactly one
            binary integer operation (+, -, *, /). Operands and results may be
            negative, e.g. '3-7=-4'.
        target: Int that the last operation must produce.

    Returns:
        bool: True only if every step is arithmetically correct, every operand
        is a still-unused initial number or a still-unused earlier result, all
        initial numbers are consumed, no intermediate result is left over, and
        the last result equals ``target``. An empty ``operations`` list is
        invalid.
    """
    if not operations:
        return False  # Nothing was computed, so the target cannot have been reached

    # Anchored patterns: a leading '-' belongs to the number, not to the operator
    expr_pattern = re.compile(r'^\s*(-?\d+)\s*([+\-*/])\s*(-?\d+)\s*$')
    result_pattern = re.compile(r'^-?\d+$')

    # Work on a copy so the caller's list is left untouched
    available_numbers = list(initial_numbers)
    # Results of earlier steps that have not been used as an operand yet
    pending_results = []
    result_val = None

    def take(value):
        """Consume one occurrence of value, preferring the initial numbers."""
        for pool in (available_numbers, pending_results):
            if value in pool:
                pool.remove(value)
                return True
        return False

    for operation in operations:
        # Parse the operation (format like '9-5=4')
        parts = operation.split('=')
        if len(parts) != 2:
            return False

        expr_match = expr_pattern.match(parts[0])
        result_str = parts[1].strip()
        if not expr_match or not result_pattern.match(result_str):
            return False  # Malformed expression or non-integer result

        left_val = int(expr_match.group(1))
        operator = expr_match.group(2)
        right_val = int(expr_match.group(3))
        result_val = int(result_str)

        # Each operand must come from the initial numbers or an unused earlier result
        if not take(left_val) or not take(right_val):
            return False

        # Calculate the result
        if operator == '+':
            calculated_result = left_val + right_val
        elif operator == '-':
            calculated_result = left_val - right_val
        elif operator == '*':
            calculated_result = left_val * right_val
        else:
            if right_val == 0:
                return False  # Division by zero
            calculated_result = left_val / right_val

        # Verify the result matches what's stated (division yields a float)
        if abs(calculated_result - result_val) > 1e-10:
            return False

        # Store intermediate result for future operations
        pending_results.append(result_val)

    # All initial numbers used, exactly the final result left, and it hits the target
    return (
        not available_numbers
        and pending_results == [result_val]
        and result_val == target
    )

# def evaluate_countdown_trajectory(ds_entry):
#     """
#     Evaluates a countdown puzzle solver trajectory to determine if it correctly found a solution.
    
#     Args:
#         ds_entry (dict): Dictionary containing problem data and model response
    
#     Returns:
#         dict: A dictionary containing evaluation results
#     """
#     import re
#     import ast
    
#     # Extract the target and initial numbers from the entry
#     target = ds_entry.get('target')
#     initial_numbers = ds_entry.get('nums', [])
    
#     # Extract just the assistant's response
#     completion = ds_entry.get('completion', '')
#     parts = completion.split("\nassistant\n")
#     assistant_response = parts[-1] if len(parts) > 1 else completion
    
#     # Initialize default values
#     solved = False
#     operations = []
#     final_value = None
    
#     # First check for the explicit SOLUTION: YES/NO format, even without line breaks
#     solution_match = re.search(r"SOLUTION:\s*(YES|NO)", assistant_response, re.IGNORECASE)
#     if solution_match:
#         solved = solution_match.group(1).upper() == "YES"
        
#         # Extract operations list - handle both with and without line breaks
#         operations_match = re.search(r"OPERATIONS:\s*(\[.*?\])", assistant_response, re.DOTALL)
#         if operations_match:
#             ops_str = operations_match.group(1)
#             try:
#                 # Clean up the string before parsing
#                 cleaned_ops = ops_str.replace("'", '"').replace('\n', '').strip()
#                 operations = ast.literal_eval(cleaned_ops)
#             except:
#                 # Try extracting individual operations using regex
#                 op_matches = re.findall(r"['\"]([^'\"]*?=[^'\"]*?)['\"]", ops_str)
#                 if op_matches:
#                     operations = op_matches
        
#         # Extract the result value
#         result_match = re.search(r"RESULT:\s*(\d+\.?\d*)", assistant_response)
#         if result_match:
#             try:
#                 final_value = float(result_match.group(1))
#                 if final_value.is_integer():
#                     final_value = int(final_value)
#             except:
#                 final_value = None
#     else:
#         # Check for operations pattern even without SOLUTION header
#         op_matches = re.findall(r"(\d+\s*[\+\-\*/]\s*\d+\s*=\s*\d+)", assistant_response)
#         if op_matches:
#             operations = op_matches
            
#             # Check for a clear confirmation message
#             if re.search(r"that works|success!|solved|got it", assistant_response, re.IGNORECASE):
#                 solved = True
                
#                 # Try to extract the result value
#                 result_match = re.search(r"(\d+)\s*$", assistant_response)
#                 if result_match:
#                     try:
#                         final_value = int(result_match.group(1))
#                     except:
#                         final_value = None
#                 else:
#                     # If no explicit result, check if target appears near the end
#                     if str(target) in assistant_response[-100:]:
#                         final_value = target
            
#         # Check for explicit failure messages
#         if re.search(r"not possible|cannot|impossible|no solution", assistant_response, re.IGNORECASE):
#             solved = False
    
#     # If the operations are valid and the final operation results in the target, mark as solved
#     if operations and not final_value:
#         for op in operations:
#             result_match = re.search(r"=\s*(\d+)$", op)
#             if result_match and int(result_match.group(1)) == target:
#                 final_value = target
#                 solved = True
    
#     # Validate operations if they exist
#     if solved and operations:
#         valid = validate_operations(initial_numbers, operations, target)
#         solved = valid and (final_value == target)
    
    
#     return {
#         'solved': solved,
#         'target': target,
#         'initial_numbers': initial_numbers,
#         'operations': operations,
#         'final_value': final_value
#     }

def evaluate_countdown_trajectory_claude(ds_entry) -> Tuple[bool, str]:
    """
    Check whether a completion contains a valid Countdown solution.

    Args:
        ds_entry: Mapping with the keys
            'completion' - text that should include lines of the form
                SOLUTION: YES/NO
                OPERATIONS: [list of strings like 'A+B=C', ...]
                RESULT: final_value
            'target' - the desired final result.
            'nums' - list of initial numbers.

    The three parts are extracted with regexes (the first occurrence of each is
    used). The operations are then simulated by consuming initial numbers and
    intermediate results from a pool, verifying that:
      - each operation string has one binary operator,
      - each operation is valid (left op right equals the given result),
      - the operations use all initial numbers exactly once, so that a single
        value equal to the target remains.
    Operands and results may be negative (e.g. '3-7=-4'). RESULT must be
    present and numeric but is not compared with the target; the value
    produced by the operations is.

    Returns:
        (is_valid, message): is_valid is True only if the trajectory is
        correct; message explains the verdict.
    """
    trajectory = ds_entry['completion']
    target = ds_entry['target']
    nums = ds_entry['nums']

    # Extract SOLUTION (YES or NO)
    sol_match = re.search(r"SOLUTION:\s*(YES|NO)", trajectory, re.IGNORECASE)
    if not sol_match:
        return False, "Could not find SOLUTION declaration."
    if sol_match.group(1).upper() != "YES":
        return False, "The trajectory indicates no valid solution."

    # Extract OPERATIONS list
    ops_match = re.search(r"OPERATIONS:\s*(\[[^\]]*\])", trajectory, re.DOTALL)
    if not ops_match:
        return False, "Could not find OPERATIONS list."
    try:
        operations = ast.literal_eval(ops_match.group(1))
        if not isinstance(operations, list):
            return False, "OPERATIONS is not a list."
    except Exception as e:
        return False, f"Failed to parse OPERATIONS list: {e}"

    # RESULT must exist and be numeric; its value is only checked for parseability
    res_match = re.search(r"RESULT:\s*([-+.\d]+)", trajectory)
    if not res_match:
        return False, "Could not find RESULT."
    try:
        float(res_match.group(1))
    except Exception as e:
        return False, f"Failed to parse RESULT: {e}"

    # Simulation: available numbers (as floats).
    available = [float(n) for n in nums]
    # Each operation must be of the form "operand1 operator operand2 = result";
    # an optional leading '-' lets negative intermediate values take part.
    op_pattern = re.compile(
        r"^\s*(-?[\d.]+)\s*([\+\-\*/])\s*(-?[\d.]+)\s*=\s*(-?[\d.]+)\s*$"
    )

    def consume(value: float, pool: List[float]) -> bool:
        """Remove the first pool entry equal to value (within tolerance)."""
        for i, num in enumerate(pool):
            if math.isclose(num, value, rel_tol=1e-5):
                del pool[i]
                return True
        return False

    for op_str in operations:
        # literal_eval can yield non-string items (e.g. [1, 2]); reject them
        # instead of letting the regex raise TypeError.
        if not isinstance(op_str, str):
            return False, f"Operation {op_str!r} is not a string."
        m = op_pattern.match(op_str)
        if not m:
            return False, f"Operation '{op_str}' does not match required pattern."
        op1_str, operator, op2_str, given_result_str = m.groups()
        try:
            op1 = float(op1_str)
            op2 = float(op2_str)
            op_result = float(given_result_str)
        except Exception as e:
            # e.g. '1.2.3' passes the character class but is not a number
            return False, f"Error converting numbers in op '{op_str}': {e}"

        # Check that the operation is valid:
        if operator == '+':
            computed = op1 + op2
        elif operator == '-':
            computed = op1 - op2
        elif operator == '*':
            computed = op1 * op2
        else:  # '/', the only operator left that the pattern admits
            # Avoid division by zero
            if math.isclose(op2, 0.0):
                return False, f"Division by zero in op '{op_str}'."
            computed = op1 / op2

        if not math.isclose(computed, op_result, rel_tol=1e-5):
            return False, f"In op '{op_str}', computed {computed} which does not match given {op_result}."

        # Consume both operands from the pool, then make the result available
        if not consume(op1, available):
            return False, f"Operand {op1} in op '{op_str}' not available from initial/intermediate numbers."
        if not consume(op2, available):
            return False, f"Operand {op2} in op '{op_str}' not available from initial/intermediate numbers."
        available.append(op_result)

    # At the end, exactly one number should remain; it should equal the target.
    if len(available) != 1:
        return False, f"After all operations, expected one value but got {len(available)} values: {available}"
    if not math.isclose(available[0], float(target), rel_tol=1e-5):
        return False, f"Final value {available[0]} does not equal the target {target}."

    return True, f"Trajectory is valid with operations: {operations}"
    
#     return {
#         'solved': solved,
#         'target': target,
#         'initial_numbers': initial_numbers,
#         'operations': operations,
#         'final_value': final_value
#     }

def evaluate_countdown_trajectory(ds_entry):
    """Score one dataset entry and report the token length of its completion.

    Args:
        ds_entry: Mapping with the keys 'completion', 'target' and 'nums'.

    Returns:
        dict with the keys 'solved' and 'remarks' (the verdict and message from
        evaluate_countdown_trajectory_claude), 'target', 'initial_numbers' and
        'completion_length' (number of cl100k_base tokens in the completion).
    """
    verdict, explanation = evaluate_countdown_trajectory_claude(ds_entry)

    # Token count of the raw completion text
    encoder = tiktoken.get_encoding("cl100k_base")
    n_tokens = len(encoder.encode(ds_entry['completion']))

    report = {'solved': verdict}
    report['target'] = ds_entry['target']
    report['initial_numbers'] = ds_entry['nums']
    report['remarks'] = explanation
    report['completion_length'] = n_tokens
    return report
        
def evaluate_countdown_trajectories(results_all_trials):
    """
    Score every trajectory of every trial and build the flattened result list
    expected by visualization tools and wandb logging.

    Args:
        results_all_trials: Sequence of trials; each trial is a sequence of
            entry dicts, indexed by question. Every entry gains a
            'parsed_results' key in place.

    Returns:
        list: First a summary dict ('trial_success_rates', 'best_of_n', 'mean'),
        then one entry per question of the first trial: the first solved entry
        across trials, or the first trial's entry when none solved it.
    """
    # Question index -> was it solved in at least one trial?
    solved_anywhere = dict.fromkeys(range(len(results_all_trials[0])), False)
    trial_success_rates = []

    # Score each trial and record its success rate
    for trial in results_all_trials:
        solved_count = 0
        for question_idx in range(len(trial)):
            trial[question_idx]['parsed_results'] = evaluate_countdown_trajectory(trial[question_idx])
            if trial[question_idx]['parsed_results']['solved']:
                solved_count += 1
                solved_anywhere[question_idx] = True
        trial_success_rates.append(solved_count / len(trial))

    # Aggregate statistics
    n_questions = len(solved_anywhere)
    best_of_n_successes = sum(solved_anywhere.values())
    best_of_n_rate = best_of_n_successes / n_questions
    mean_of_n_trials = sum(trial_success_rates) / len(trial_success_rates)

    # Print summary statistics
    print(f"Success rate for each trial: {trial_success_rates}")
    print(f"\nSummary:")
    print(f"  Best-of-{len(trial_success_rates)} success rate: {best_of_n_rate:.4f} ({best_of_n_successes}/{n_questions})")
    print(f"  Mean success rate across trials: {mean_of_n_trials:.4f}\n")

    # The metrics summary comes first (index 1 once hyperparams are prepended)
    summary = {
        'trial_success_rates': trial_success_rates,
        'best_of_n': best_of_n_rate,
        'mean': mean_of_n_trials
    }
    final_results = [summary]

    # Per question: the first solved entry across trials, else the first trial's entry
    for question_idx in range(len(results_all_trials[0])):
        candidates = [trial[question_idx] for trial in results_all_trials]
        first_solved = next(
            (entry for entry in candidates if entry['parsed_results']['solved']),
            None,
        )
        best_result = candidates[0] if first_solved is None else first_solved
        if best_result:
            final_results.append(best_result)

    return final_results


In [99]:
from datasets import load_dataset
# Load the dataset: the first 80% of the train split.
# NOTE(review): unlike the pinned load used for the example run, no `revision`
# is passed here, so the rows depend on the current state of the hub repo.
print("Loading dataset from Hugging Face...")
data = load_dataset("MelinaLaimon/stream-of-search", split="train[0%:80%]")

import json


Loading dataset from Hugging Face...


In [100]:
# load res/output.jsonl
import sys, os, json

# One JSON record per line. Blank lines (e.g. a trailing newline) are skipped
# because json.loads raises on them. JSON text is UTF-8, so the encoding is
# stated explicitly instead of depending on the platform default.
with open("../res/output.jsonl", "r", encoding="utf-8") as f:
    lines = [json.loads(raw_line) for raw_line in f if raw_line.strip()]
    
"""
{'id': '74b966fb-9632-472e-8e76-07b5932e7ffd',
 'original_idx': 14,
 'messages': [{'content': "Combine these initial numbers [12, 91, 40, 13] using only arithmetic operations (+, -, *, /) to reach the target value 45. All initial numbers must be used exactly once.\nConclude with the final result in EXACTLY this format:\n```\nSOLUTION: YES/NO\nOPERATIONS: list of string of operations performed, each string involving only 1 operation. For example, ['A+B=C','C+D=E'] is allowed, ['A+B+D=E'] is not allowed\nRESULT: final_value\n```\n\nNote that the solution does exist. Verify your solutions before your present your final results and backtrack to correct mistakes from before your mistakes if you have to.",
   'role': 'user'},
  {'role': 'assistant',
   'content': "```\nSOLUTION: YES\nOPERATIONS: ['91/13=7', '40-7=33', '33+12=45']\nRESULT: 45\n```",
   'reasoning_content': "Okay, let's see. I need to combine the numbers 12, 91, 40, and 13 using each exactly once, and reach 45. The allowed operations are addition, subtraction, multiplication, and division. Hmm. Let's start by thinking about possible combinations.\n\nFirst, since 45 is a relatively small number, maybe some subtraction or division is needed here. Let's list the numbers again: 12, 91, 40, 13. Let's think about possible pairs.\n\nWhat if I try to subtract two larger numbers to get a smaller one. For example, 91 minus 40 is 51. Then maybe combine with 13 and 12. But 51 plus 13 is 64, minus 12 is 52. Not 45. Hmm. Alternatively, 91 minus 13 is 78. Then 40 minus 12 is 28. 78 minus 28 is 50. Still not there.\n\nWait, maybe division is necessary. Let's think. For example, 40 divided by something. 40 / 10 is 4, but we don't have 10. Alternatively, 91 divided by something. Let's see. 91 divided by 13 is 7. Oh, that's interesting. Because 13 is one of the numbers. So 91 /13 =7. Then we have 12, 40, and 7 left. How can we combine those to get 45?\n\nSo 40 + 7 =47, then 47 - 12=35. No. Not 45. Alternatively, 40 -7=33. 33 +12=45. Oh! Wait, that would work. Let's check:\n\nStep 1: 91 /13 =7. Then we have 12,40,7 left. Then 40 -7=33. Then 33 +12=45. So all numbers are used exactly once: 91,13,40,12. Let's verify:\n\nFirst operation: 91 /13 =7.\n\nSecond operation: 40 -7=33. But wait, 7 is a result from the first operation, not the original numbers. So that uses 40 and 7 (which is the result of 91/13). Then third operation is 33 +12=45. So yes, all original numbers are used: 12,91,40,13. Each used exactly once. Let me check each step:\n\nOriginal numbers: 12,91,40,13.\n\nFirst step: 91 /13=7. So 91 and 13 are used. Remaining numbers:12,40, and the result 7.\n\nSecond step: 40 -7=33. Now, 40 is used. Remaining numbers:12 and 33 (from previous step).\n\nThird step: 33 +12=45. So 12 is used. All original numbers are accounted for: 91,13,40,12. 
Yes.\n\nSo the operations would be:\n\n['91/13=7', '40-7=33', '33+12=45'].\n\nThe final result is 45, which matches the target. So that works."}]}
"""

# Key every completion by its "original_idx" so it can be matched to the dataset
# row it was generated for. The completion text is
# line['messages'][1]['reasoning_content'] + "\n" + line['messages'][1]['content'].
# Assumes original_idx is the row position in `data` — TODO confirm.
completion_by_idx = {}
for line in lines:
    assistant_message = line['messages'][1]
    completion_by_idx[line['original_idx']] = (
        assistant_message['reasoning_content'] + "\n" + assistant_message['content']
    )

# Rows without a completion are dropped from `data` itself. Dropping them only
# from the text list would shift every later completion onto the wrong row.
kept_indices = sorted(completion_by_idx)
text_only = [completion_by_idx[idx] for idx in kept_indices]

# Single statement, so `data` is rebound only if both steps succeed
data = data.select(kept_indices).add_column("completion", text_only)

# NOTE(review): the completion is stored as a single message with role 'user';
# confirm that this is the role downstream training expects.
data = data.map(lambda x: {
    'messages_deepseek': [{'role': 'user', 'content': x['completion']}]
})
data = data.select_columns(["messages_deepseek", "nums", "target", "solution", "completion"])
# Score every row; iterating a dataset yields one dict per row
data = data.add_column("parsed_results", list(map(evaluate_countdown_trajectory, data)))

In [101]:
# Fraction of rows whose completion was judged solved
results = sum([d['parsed_results']['solved'] for d in data]) / len(data)
print(results)

# Rejection sampling: keep only the solved rows, then expose the DeepSeek
# messages under the column name "messages"
data_rejection_sampled = data.filter(lambda x: x['parsed_results']['solved'])
data_rejection_sampled = data_rejection_sampled.rename_column("messages_deepseek", "messages")

0.7895


Filter:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [102]:
print("Loading dataset from Hugging Face...")
# First 1000 rows of the test split
dataset_test = load_dataset("MelinaLaimon/stream-of-search", split="test").select(range(1000))

# Split name -> Dataset; filled in step by step across cells
dataset_total = {}

# Prepare test dataset: keep four columns and rename messages_sos to messages
dataset_test = dataset_test.select_columns(["messages_sos", "nums", "target", "solution"])
dataset_test = dataset_test.rename_column("messages_sos", "messages")
dataset_total["test"] = dataset_test

Loading dataset from Hugging Face...


In [None]:
from datasets import concatenate_datasets

# Columns shared by both training sources
train_columns = ["messages", "nums", "target", "solution"]

# 5000 stream-of-search rows, with messages_sos exposed as "messages"
data_sos = load_dataset("MelinaLaimon/stream-of-search", split="train").select(range(5000))
data_sos = data_sos.select_columns(["messages_sos", "nums", "target", "solution"])
data_sos = data_sos.rename_column("messages_sos", "messages")

# Reduce the rejection-sampled rows to the shared columns before concatenating,
# so the combined split has one well-defined schema and does not depend on a
# later clean-up cell to drop the extra 'completion'/'parsed_results' columns.
data_deepseek = data_rejection_sampled.select(range(5000)).select_columns(train_columns)

dataset_total["train"] = concatenate_datasets([data_deepseek, data_sos])
dataset_total["train"] = dataset_total["train"].shuffle(seed=42)


In [104]:
# Keep only the four columns that the train split should have
dataset_total["train"] = dataset_total["train"].select_columns(["messages", "nums", "target", "solution"])

In [109]:
# Inspect a single row of the train split
dataset_total["train"][8000]

{'messages': [{'content': "Okay, let's see. I need to combine the numbers 33, 88, 8, and 65 using each exactly once with arithmetic operations to reach 43. Hmm. All numbers must be used, so maybe I can pair them up first and then combine the results. Let me think of different combinations.\n\nFirst, let's list the numbers: 33, 88, 8, 65. The target is 43. So maybe I can find pairs that get me closer. Let's see. For example, 88 and 8. 88 divided by 8 is 11. That's a nice number. Then I have 33 and 65 left. 33 plus 65 is 98. Then 98 minus 11 would be 87. Not 43. Alternatively, maybe 65 minus 33 is 32. 32 plus 11 is 43. Oh, wait! Wait a minute. Let's check that again.\n\nIf I do 88 / 8 = 11. Then take 65 - 33 = 32. Then add 32 + 11 = 43. That uses all four numbers. Let me verify each step. First operation: 88 divided by 8 equals 11. Then 65 minus 33 equals 32. Then 32 plus 11 equals 43. Yes, that uses all four numbers once each. So the operations would be:\n\n88 / 8 = 11 (using 88 and 8)\

In [71]:
from datasets import concatenate_datasets
# Concatenate the DeepSeek rows with the stream-of-search rows.
# NOTE(review): data_concat is not referenced by any other visible cell.
data_concat = concatenate_datasets([data, data_sos])

In [93]:
# Display the current value of dataset_total
dataset_total

Dataset({
    features: ['messages', 'nums', 'target', 'solution'],
    num_rows: 10000
})

In [92]:
from datasets import DatasetDict
# NOTE(review): DatasetDict(...) is built here from dataset_total, which other
# cells fill as a dict of split name -> Dataset. The recorded ValueError suggests
# that dataset_total was bound to a single Dataset when this cell ran — confirm
# and re-run after rebuilding the dict.
new_ds = DatasetDict(dataset_total)
new_ds.push_to_hub("yeok/stream-of-search-dataset_deepseek_and_sos")

ValueError: dictionary update sequence element #0 has length 4; 2 is required

In [42]:
# Display the test dataset summary
dataset_test

Dataset({
    features: ['messages', 'nums', 'target', 'solution'],
    num_rows: 1000
})

In [29]:
# Alternative rejection sampling: keep rows judged solved
# data_rejection_sampled = data.filter(lambda x: x['parsed_results']['solved'])

def _has_deepseek_text(example):
    """Return True when the row carries a non-empty DeepSeek response.

    'messages_deepseek' is displayed elsewhere in this notebook as a list of
    message dicts. Comparing that list to "" is always True, so the previous
    filter kept every row. A plain string is still accepted for compatibility.
    """
    messages = example['messages_deepseek']
    if isinstance(messages, str):
        return messages != ""
    return any(message.get('content') for message in (messages or []))


data_rejection_sampled = data.filter(_has_deepseek_text)






Filter:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [23]:
# Keep only the DeepSeek messages and the puzzle columns
data = data.select_columns(["messages_deepseek", "nums", "target", "solution"])

In [30]:
# Preview only: the result is displayed but not assigned, so dataset_test is unchanged
dataset_test.select_columns(["messages_sos", "nums", "target", "solution"])

Dataset({
    features: ['messages_sos', 'nums', 'target', 'solution'],
    num_rows: 1000
})

In [32]:
# Inspect the messages of the fifth row from the end
data_rejection_sampled['messages_deepseek'][-5]

[{'content': "Okay, so I need to combine the numbers 72, 45, 15, and 93 using each exactly once with arithmetic operations to reach 35. Hmm. Let me think. All numbers must be used, and each operation combines two numbers into one, so with four numbers, I need three operations. Let's see.\n\nFirst, maybe I can try different groupings. Let's consider the operations step by step. Let's start by pairing some numbers. For example, if I subtract two numbers to get a smaller value. Like 72 minus 45 would be 27. Then I have 27, 15, 93 left. Hmm, not sure how to proceed from there.\n\nAlternatively, 93 minus something. Let's see. 93 minus 15 is 78. Then we have 72, 45, 78. But then 78 minus 72 is 6, and 6 minus 45 is -39. Not helpful.\n\nWait, maybe division is necessary here. Let's see. For instance, 45 divided by 15 is 3. Then we have 72, 3, 93. Then 72 divided by 3 is 24. Then 24 plus 93 is 117. No, that's too big. Not helpful.\n\nWhat about 72 divided by something? Let's think. If I take 72

In [None]:
print("Loading dataset from Hugging Face...")
# First 1000 rows of the test split
dataset_test = load_dataset("MelinaLaimon/stream-of-search", split="test").select(range(1000))

# Split name -> Dataset
dataset_total = {}

# Prepare test dataset: keep four columns and rename messages_sos to messages
dataset_test = dataset_test.select_columns(["messages_sos", "nums", "target", "solution"])
dataset_test = dataset_test.rename_column("messages_sos", "messages")
dataset_total["test"] = dataset_test

# Prepare DeepSeek dataset: same four columns, with messages_deepseek renamed to messages
dataset_total["train_deepseek"] = data_rejection_sampled.select_columns(["messages_deepseek", "nums", "target", "solution"])
dataset_total["train_deepseek"] = dataset_total["train_deepseek"].rename_column("messages_deepseek", "messages")

# Create a DatasetDict with the splits "test" and "train_deepseek"
from datasets import DatasetDict
new_ds = DatasetDict(dataset_total)


Loading dataset from Hugging Face...


In [36]:
# Upload every split of new_ds to the Hugging Face Hub
new_ds.push_to_hub("yeok/stream-of-search-dataset_deepseek_rejection_sampled")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



CommitInfo(commit_url='https://huggingface.co/datasets/yeok/stream-of-search-dataset_deepseek_rejection_sampled/commit/16305988035a5c4a524a11fd977dfc8dc659cb40', commit_message='Upload dataset', commit_description='', oid='16305988035a5c4a524a11fd977dfc8dc659cb40', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/yeok/stream-of-search-dataset_deepseek_rejection_sampled', endpoint='https://huggingface.co', repo_type='dataset', repo_id='yeok/stream-of-search-dataset_deepseek_rejection_sampled'), pr_revision=None, pr_num=None)

In [32]:
# only keep columns 'messages_deepseek', 'nums', 'target' 'solution
# map messages_deepseek to messages
data = data.map(lambda x: {
    'completion': x['messages_deepseek'][0]['content'],
    # 'nums': x['nums'],
    # 'target': x['target'],
    # 'solution': x['solution']
})

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# NOTE(review): this import rebinds evaluate_countdown_trajectory to the project
# module's version, shadowing any function of the same name defined in this notebook.
from src.result_parsers.countdown_trajectories import evaluate_countdown_trajectory

# Score every row; iterating the dataset yields one dict per row
data = data.add_column("parsed_results", list(map(evaluate_countdown_trajectory, data)))

# Fraction of rows marked solved
results = sum([d['solved'] for d in data['parsed_results']]) / len(data['parsed_results'])

In [1]:
from datasets import load_dataset

# Load every split of the uploaded dataset (no `split` argument is given)
dataset = load_dataset("yeok/stream-of-search-dataset_deepseek_rejection_sampled")

In [None]:
# rename split train_deepseek into train
ds = dict(dataset)
ds['train'] = ds['train_deepseek'] 
del ds['train_deepseek']

new_ds.push_to_hub("yeok/stream-of-search-dataset_deepseek_rejection_sampled_5k")

In [None]:
# NOTE(review): the previous cell issues the same push; this one relies on
# `new_ds` already being defined in the kernel.
new_ds.push_to_hub("yeok/stream-of-search-dataset_deepseek_rejection_sampled_5k")

{'test': Dataset({
     features: ['messages', 'nums', 'target', 'solution'],
     num_rows: 1000
 }),
 'train': Dataset({
     features: ['messages', 'nums', 'target', 'solution'],
     num_rows: 6316
 })}