In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import json
import os
import re
from datasets import load_from_disk

from config import storage_dir

# Setting arguments and paths

In [10]:
# Main args
# Alternative model that can be swapped in here: "Qwen3-30B-A3B"
model_name = "meta-llama/Llama-3.3-70B-Instruct"

# Per-model storage lives under <storage_dir>/lm_sys/<last component of model_name>
model_storage_dir = os.path.join(
    storage_dir,
    "lm_sys",
    model_name.split("/")[-1],
)
# Path of the 'lm_sys_responses' dataset inside the model's storage directory
response_paths = os.path.join(model_storage_dir, "lm_sys_responses")

# Processing

In [11]:
# Step 1: Collect shard directories named exactly "lm_sys_{start_num}_{end_num}".
# Each entry is a tuple (dir_name, start_num, end_num, size), with size = end_num - start_num.
pattern = r"lm_sys_(\d+)_(\d+)"
directories = []

for dir_name in os.listdir(model_storage_dir):
    # fullmatch (rather than match) so that names with trailing text, e.g.
    # "lm_sys_0_1000.tmp" or "lm_sys_0_1000_old", are not mistaken for shards.
    match = re.fullmatch(pattern, dir_name)
    if match is None:
        continue
    # os.listdir also returns plain files; only real directories can be loaded later.
    if not os.path.isdir(os.path.join(model_storage_dir, dir_name)):
        continue
    start_num, end_num = map(int, match.groups())
    size = end_num - start_num
    directories.append((dir_name, start_num, end_num, size))

In [12]:
# Step 2: Order the shard entries in place by ascending start_num (tuple position 1)
directories.sort(key=lambda entry: entry[1])

In [13]:
from datasets import concatenate_datasets

In [14]:
# Step 3: Load every shard, keep its first `size` rows, and merge into one dataset.
# Shards are concatenated in the order of `directories`.
if not directories:
    # Fail early with an actionable message instead of an opaque error from
    # concatenate_datasets on an empty list.
    raise ValueError(
        f"No 'lm_sys_<start>_<end>' shard directories found in {model_storage_dir}"
    )

datasets_to_merge = []

for dir_name, start_num, end_num, size in directories:
    dataset_path = os.path.join(model_storage_dir, dir_name)
    dataset = load_from_disk(dataset_path)

    # A shard shorter than its name implies would otherwise surface as an
    # index error inside select(); report which shard is incomplete.
    if len(dataset) < size:
        raise ValueError(
            f"Shard {dir_name} has {len(dataset)} rows but its name implies "
            f"{size} (start={start_num}, end={end_num})"
        )

    # Keep only rows 0..size-1; any extra rows beyond the range encoded in the
    # directory name are dropped.
    dataset = dataset.select(range(size))

    # Add the dataset to the list for merging
    datasets_to_merge.append(dataset)

# Merge all datasets into one
merged_dataset = concatenate_datasets(datasets_to_merge)

In [15]:
# Display the resolved model storage directory
model_storage_dir

'/n/holylfs06/LABS/krajan_lab/Lab/cfang/encoded_reasoning/lm_sys/Llama-3.3-70B-Instruct'

In [16]:
# Save the merged dataset to the responses path defined in the arguments cell
# (response_paths == <model_storage_dir>/lm_sys_responses), rather than
# rebuilding the same path from a duplicated string literal.
merged_dataset.save_to_disk(response_paths)

Saving the dataset (0/1 shards):   0%|          | 0/45000 [00:00<?, ? examples/s]

In [17]:
# Display the merged dataset's summary (features and row count)
merged_dataset

Dataset({
    features: ['conversation'],
    num_rows: 45000
})