# Retrieve PRM800K Dataset

## Setup

In [1]:
# Setup and Imports
from pathlib import Path

import pandas as pd
import textgrad as tg
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load Datasets

In [2]:
# Dataset Loading Function
def load_prm800k_safely():
    """Load the PRM800K dataset ("tasksource/PRM800K") in streaming mode.

    Returns:
        tuple: ``(dataset, "streaming")``, where ``dataset`` is the object
        returned by ``load_dataset(..., streaming=True)`` and the string
        records how the data was loaded.

    Raises:
        Exception: Any error raised by ``load_dataset`` is reported on stdout
        and then re-raised unchanged.
    """
    print("Attempting to load PRM800K dataset...")

    try:
        # Loading with streaming (safer for large datasets)
        dataset = load_dataset("tasksource/PRM800K", streaming=True)
    except Exception as e:
        print(f"[X] Streaming failed: {e}")
        # Propagate the failure. Falling through here would return None, and
        # callers that unpack two values would then fail with a TypeError
        # that hides the real cause.
        raise

    print("[V] Successfully loaded PRM800K in streaming mode")
    return dataset, "streaming"

In [3]:
# Load PRM800K; `load_method` records how it was loaded ("streaming" on success).
dataset_stream, load_method = load_prm800k_safely()

Attempting to load PRM800K dataset...
[V] Successfully loaded PRM800K in streaming mode


In [4]:
# Display the loaded dataset object to inspect its splits.
dataset_stream

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        num_shards: 2
    })
    test: IterableDataset({
        features: Unknown,
        num_shards: 2
    })
})

In [5]:
# Materialize the first three training examples from the stream, then
# tabulate them so the record layout can be inspected.
samples = list(dataset_stream["train"].take(3))

df_samples = pd.DataFrame(samples)
df_samples

Unnamed: 0,labeler,timestamp,generation,is_quality_control_question,is_initial_screening_question,question,label
0,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-13T18:55:54.496450,,False,False,{'problem': 'How many seconds are in 7.8 minut...,{'steps': [{'completions': [{'text': '7.8 minu...
1,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-17T16:56:51.323252,,False,False,{'problem': 'How many positive two-digit integ...,"{'steps': [{'completions': [{'text': ""Let's ca..."
2,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-02T18:33:27.255302,,False,False,{'problem': 'The fifth and eighth terms of a g...,{'steps': [{'completions': [{'text': 'So we ha...


In [6]:
# Save the preview rows held in `df_samples` (not the full dataset).
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
output_path = Path('dataset/prm800k.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_samples.to_csv(output_path, index=False)