In [None]:
import json
from datasets import Dataset, DatasetDict
import random

# Build a 90/10 train/test HuggingFace DatasetDict from a JSONL file and save
# it to disk. Each non-blank input line is parsed as one JSON value; only dict
# records that have a 'content' key are kept, and only that field is retained.
input_path = "prompts_transformed_filtered_ver2.jsonl"
# Single source of truth for the destination so the save call and the status
# message can never disagree.
output_path = "tbricksnext2linesver5/tbricks"

# Load the FIM-formatted data (blank lines are skipped)
with open(input_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Shuffle data for random split; the fixed seed keeps the split reproducible
random.seed(42)
random.shuffle(data)

# Ensure all are dicts with 'content'. This is done BEFORE computing the split
# index so the 90/10 ratio applies to the records that are actually saved,
# not to a raw list that may contain entries dropped afterwards.
records = [
    {"content": d["content"]}
    for d in data
    if isinstance(d, dict) and "content" in d
]

# Split 90% train, 10% test
split_idx = int(0.9 * len(records))
train_data = records[:split_idx]
test_data = records[split_idx:]

# Create HuggingFace datasets
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
})

# Save to disk in HuggingFace format under subset 'tbricks'
dataset_dict.save_to_disk(output_path)
print(
    f"Saved HuggingFace dataset to {output_path} "
    f"with {len(train_data)} train and {len(test_data)} test samples."
)
