In [26]:
from datasets import load_dataset
from huggingface_hub import login
import tiktoken
from src.result_parsers.countdown_trajectories import evaluate_countdown_trajectory_claude

# Load the dataset: a split-keyed mapping (indexed below as dataset["train"], dataset["test"]).
# NOTE: "MelinaLaimon/stream-of-search" is the same repo that the upload cells
# further down in this notebook push to, so on a fresh setup those cells must
# have been run before this one can succeed.
dataset = load_dataset("MelinaLaimon/stream-of-search")

def countdown_trajectory_is_correct(nums, target, trajectory):
    """Tell whether ``trajectory`` is judged as solving the countdown task.

    Packs the arguments into the record layout passed to
    ``evaluate_countdown_trajectory_claude`` (keys ``nums``, ``target`` and
    ``completion``) and returns only the first element of the
    ``(solved, remarks)`` pair it yields; the remarks are discarded.
    """
    record = {"nums": nums, "target": target, "completion": trajectory}
    solved, _remarks = evaluate_countdown_trajectory_claude(record)
    return solved

# Message columns that get an `is_correct_*` and a `token_length_*` statistic.
# The "messages_" prefix is stripped to build the names of the new columns
# (see add_is_correct_and_length_columns).
message_columns = [
    'messages_sos_react',
    'messages_sos',
    'messages_optimal',
    'messages_deepseek_r1_distill_llama_70b',
    'messages_deepseek'
]

# Tokenizer used by get_token_length for every token-length statistic.
encoding_name = "cl100k_base"
encoding = tiktoken.get_encoding(encoding_name)

def get_token_length(text):
    """Return the number of tokens in ``text`` under the module-level ``encoding``.

    ``None`` (no trajectory available) counts as zero tokens.
    """
    return 0 if text is None else len(encoding.encode(text))

def add_is_correct_and_length_columns(example):
    """Compute per-column correctness and token-length statistics for one row.

    For every column in ``message_columns`` the trajectory is taken from the
    ``"content"`` of the second message (index 1). When the column is absent,
    is not a list, has fewer than two messages, or that message has no
    ``"content"`` key, the trajectory is treated as missing: it is marked
    incorrect and its token length is 0.

    Returns a dict with ``is_correct_<name>`` and ``token_length_<name>``
    entries, where ``<name>`` is the column name without its ``messages_``
    prefix.
    """
    stats = {}
    for column in message_columns:
        messages = example[column] if column in example else None
        has_reply = (
            isinstance(messages, list)
            and len(messages) > 1
            and "content" in messages[1]
        )
        reply = messages[1]["content"] if has_reply else None

        if reply is None:
            # Nothing to evaluate, so the row cannot count as solved.
            solved = False
        else:
            solved = countdown_trajectory_is_correct(
                example["nums"], example["target"], reply
            )

        suffix = column.replace("messages_","")
        stats[f"is_correct_{suffix}"] = solved
        stats[f"token_length_{suffix}"] = get_token_length(reply)
    return stats

# Annotate the 'train' split; a missing split is recorded as None and reported.
if "train" not in dataset:
    train_data_with_correctness_and_length = None
    print("Warning: 'train' split not found in the dataset.")
else:
    train_data = dataset["train"]
    train_data_with_correctness_and_length = train_data.map(add_is_correct_and_length_columns)

# Annotate the 'test' split the same way (it may be absent).
if "test" not in dataset:
    test_data_with_correctness_and_length = None
    print("Warning: 'test' split not found in the dataset.")
else:
    test_data = dataset["test"]
    test_data_with_correctness_and_length = test_data.map(add_is_correct_and_length_columns)

# --- Pushing to Hugging Face Hub ---

# 1. Log in to your Hugging Face account.
# You can do this interactively in your notebook or terminal:
# from huggingface_hub import notebook_login
# notebook_login()
#
# Or using your token directly (less secure for shared environments):
# login(token="YOUR_HUGGINGFACE_TOKEN")

# 2. Define the repository ID (your username/the name of the dataset repository)
repo_id = "MelinaLaimon/stream-of-search-with-stats"  # Choose a new name

# 3. Push the modified datasets to the Hub.
# A split that was not processed is marked with None, so compare against None
# explicitly. Relying on truthiness would treat an empty (but present) split as
# "not processed", because a Dataset's truth value follows its length.
try:
    if train_data_with_correctness_and_length is not None:
        train_data_with_correctness_and_length.push_to_hub(repo_id, split="train")
        print(f"Processed 'train' split pushed to: https://huggingface.co/datasets/{repo_id}/tree/main/train")
    if test_data_with_correctness_and_length is not None:
        test_data_with_correctness_and_length.push_to_hub(repo_id, split="test")
        print(f"Processed 'test' split pushed to: https://huggingface.co/datasets/{repo_id}/tree/main/test")
    if train_data_with_correctness_and_length is None and test_data_with_correctness_and_length is None:
        print("No splits were processed.")

except Exception as e:
    # Best effort: report the failure instead of aborting the rest of the notebook.
    print(f"An error occurred while pushing to the Hub: {e}")
    print("Make sure you are logged in with the correct permissions and the repository exists.")

Using the latest cached version of the dataset since MelinaLaimon/stream-of-search couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/melinajingtinglaimon/.cache/huggingface/datasets/MelinaLaimon___stream-of-search/default/0.0.0/2f4d36ac39cfd69e331bf3e07fb690edad0b9d5a (last modified on Thu Apr  3 11:24:09 2025).
Map: 100%|██████████| 10000/10000 [00:58<00:00, 170.46 examples/s]
Map: 100%|██████████| 2000/2000 [00:11<00:00, 172.99 examples/s]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:01<00:00,  9.13ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.19s/it]


Processed 'train' split pushed to: https://huggingface.co/datasets/MelinaLaimon/stream-of-search-with-stats/tree/main/train


Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  7.98ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.87s/it]


Processed 'test' split pushed to: https://huggingface.co/datasets/MelinaLaimon/stream-of-search-with-stats/tree/main/test


In [None]:
# Keep the `num_samples` shortest sos_react trajectories that were marked correct.
num_samples = 5000
dataset_with_stats = load_dataset("MelinaLaimon/stream-of-search-with-stats", split="train")
dataset_with_stats = dataset_with_stats.filter(lambda x: x["is_correct_sos_react"])
dataset_with_stats = dataset_with_stats.sort("token_length_sos_react")
dataset_with_stats = dataset_with_stats.select(range(num_samples))

In [40]:
from datasets import load_dataset, Dataset

# Settings for the export: destination repo, subset size and the columns to keep.
new_repo_id = "MelinaLaimon/stream-of-search-react-correct-5k"  # Your desired repository ID
num_samples = 5000
columns_to_push = [
    "nums", "target", "search_type", "heuristic",
    "messages_sos_react", "token_length_sos_react",
    # Add other columns you want to keep
]

# 1. Load the annotated train split
dataset_with_stats = load_dataset("MelinaLaimon/stream-of-search-with-stats", split="train")

# 2. Keep correct sos_react rows, order them by token length (ascending),
#    take the first `num_samples` and drop every column not listed above.
dataset_filtered = dataset_with_stats.filter(lambda x: x["is_correct_sos_react"])
dataset_sorted = dataset_filtered.sort("token_length_sos_react")
dataset_final = dataset_sorted.select(range(num_samples))
dataset_final = dataset_final.select_columns(columns_to_push)

# 3. Push to the Hub
dataset_final.push_to_hub(new_repo_id)

print(f"Dataset successfully pushed to {new_repo_id}")

Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 27.23ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]


Dataset successfully pushed to MelinaLaimon/stream-of-search-react-correct-5k


In [None]:
import json
import os
from datasets import Dataset
# 1. Read the search-only records and the records carrying LLM teacher completions.
folder_path = "data/sos_10k_b4_merged"
sos_only_file = "train1_b4_t100_n10000.json"
llm_teacher_file = "train1_b4_t100_n10000_w_deepseek_31mar.json"
# JSON is read as UTF-8 explicitly instead of relying on the platform default.
with open(os.path.join(folder_path, sos_only_file), encoding="utf-8") as f:
    sos_data = json.load(f)
with open(os.path.join(folder_path, llm_teacher_file), encoding="utf-8") as f:
    llm_data = json.load(f)

llm_fields = ["messages_deepseek_r1_distill_llama_70b", "messages_deepseek"]

# 2. Copy the LLM fields onto the search-only records, pairing rows by position.
# Every row is merged (no hard-coded row count), and a teacher file that is too
# short is reported up front rather than failing part-way through the loop.
# NOTE(review): rows are assumed to be in the same order in both files — confirm.
if len(llm_data) < len(sos_data):
    raise ValueError(
        f"{llm_teacher_file} has {len(llm_data)} rows but {sos_only_file} "
        f"has {len(sos_data)}; cannot merge LLM fields by position."
    )
for sos_row, llm_row in zip(sos_data, llm_data):
    for field in llm_fields:
        sos_row[field] = llm_row[field]

sos_train = Dataset.from_list(sos_data)

# 3. Push to the Hub
sos_repo_id = "MelinaLaimon/stream-of-search"  # Your desired repository ID
sos_train.push_to_hub(sos_repo_id, split="train")

In [67]:

# Upload each test file as its own split of the same repo as the train data.
# Relies on `folder_path`, `llm_fields` and `sos_repo_id` from the previous cell.
for test_file in ["test_target1_b4_t100_n10000.json","test1_b4_t100_n10000.json"]:
    with open(os.path.join(folder_path, test_file), encoding="utf-8") as f:
        test_data = json.load(f)
    test_dataset = Dataset.from_list(test_data)
    # The test files carry no LLM completions; add single-message placeholder
    # columns so the split has the same columns as train. The placeholder
    # list is sized from the data because add_column needs one value per row.
    for field in llm_fields:
        placeholder = [[{"content":"","role":""}] for _ in range(len(test_dataset))]
        test_dataset = test_dataset.add_column(field, placeholder)
    # Stripping the shared file-name tail leaves the split name:
    # "test_target1_b4_t100_n10000.json" -> "test_target", "test1_b4_t100_n10000.json" -> "test".
    test_dataset.push_to_hub(sos_repo_id, split=test_file.replace("1_b4_t100_n10000.json",""))

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 16.63ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.81it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 17.23ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.44s/it]


In [76]:
from datasets import DownloadMode
# Force a fresh download so the local cache cannot hide the splits that were
# just pushed to "MelinaLaimon/stream-of-search".
dataset = load_dataset("MelinaLaimon/stream-of-search", download_mode=DownloadMode.FORCE_REDOWNLOAD)

Generating train split: 100%|██████████| 10000/10000 [00:01<00:00, 5883.76 examples/s]
Generating test_target split: 100%|██████████| 1000/1000 [00:00<00:00, 13759.67 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 17289.33 examples/s]


In [77]:
# Inspect a single train record (row 401) of the re-downloaded dataset.
dataset["train"][401]

{'nums': [56, 10, 80, 35],
 'target': 40,
 'solution': ['80*35=2800', '2800/56=50', '50-10=40'],
 'rating': 0.7022569444444444,
 'search_type': 'dfs',
 'heuristic': 'sum_heuristic',
 'messages_optimal': [{'content': "Combine these initial numbers [56, 10, 80, 35] using only arithmetic operations (+, -, *, /) to reach the target value 40. All initial numbers must be used exactly once.\nConclude with the final result in EXACTLY this format:\n```\nSOLUTION: YES/NO\nOPERATIONS: list of string of operations performed, each string involving only 1 operation. For example, ['A+B=C','C+D=E'] is allowed, ['A+B+D=E'] is not allowed\nRESULT: final_value\n```\n",
   'role': 'user'},
  {'content': "Current State: 40:[56, 10, 80, 35], Operations: []\nExploring Operation: 80*35=2800, Resulting Numbers: [56, 10, 2800]\nGenerated Node #2: [56, 10, 2800] from Operation: 80*35=2800\nCurrent State: 40:[56, 10, 2800], Operations: ['80*35=2800']\nExploring Operation: 2800/56=50, Resulting Numbers: [10, 50]\n