# EDA PRM800K Dataset

In [1]:
# Setup and Imports
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as scp
# import plotly.express as px
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import polyfit

## Load Datasets

In [2]:
# Read the PRM800K training split from the local CSV export.
# NOTE(review): the rendered output shows an `Unnamed: 0` column, which suggests
# the CSV was written together with its index; consider `index_col=0` once it is
# confirmed that no later cell relies on that column.
prm800k_train = pd.read_csv(filepath_or_buffer="dataset/prm800k-train.csv")
# A bare trailing expression lets the notebook render the (truncated) frame.
prm800k_train

Unnamed: 0,labeler,timestamp,generation,is_quality_control_question,is_initial_screening_question,question,label
0,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-13T18:55:54.496450,,False,False,{'problem': 'How many seconds are in 7.8 minut...,{'steps': [{'completions': [{'text': '7.8 minu...
1,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-17T16:56:51.323252,,False,False,{'problem': 'How many positive two-digit integ...,"{'steps': [{'completions': [{'text': ""Let's ca..."
2,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-02T18:33:27.255302,,False,False,{'problem': 'The fifth and eighth terms of a g...,{'steps': [{'completions': [{'text': 'So we ha...
3,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-17T16:56:53.345085,,False,False,{'problem': 'Find the value of $x$ that satisf...,"{'steps': [{'completions': [{'text': ""Let's fi..."
4,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-17T16:59:27.072495,,False,False,{'problem': 'What is the least three-digit who...,{'steps': [{'completions': [{'text': 'So we wa...
...,...,...,...,...,...,...,...
98726,340d89bc-f5b7-45e9-b272-909ba68ee363,2023-02-04T04:04:19.560459,9.0,False,False,{'problem': 'Find the largest possible value o...,{'steps': [{'completions': [{'text': 'This is ...
98727,340d89bc-f5b7-45e9-b272-909ba68ee363,2023-02-04T03:29:16.886449,9.0,False,False,"{'problem': 'Alicia has $n$ candies, where $n$...",{'steps': [{'completions': [{'text': 'I need t...
98728,2b794369-78c8-4e28-b3ba-eb0dd4a9ab2c,2023-02-03T23:46:18.766691,9.0,False,False,{'problem': 'Find the remainder when $$33818^2...,{'steps': [{'completions': [{'text': 'This pro...
98729,2b794369-78c8-4e28-b3ba-eb0dd4a9ab2c,2023-02-03T23:43:33.397886,9.0,False,False,{'problem': 'On the Cartesian plane in which e...,{'steps': [{'completions': [{'text': 'I need t...


In [3]:
# Read the PRM800K test split from the local CSV export.
# NOTE(review): the rendered output shows an `Unnamed: 0` column, which suggests
# the CSV was written together with its index; consider `index_col=0` once it is
# confirmed that no later cell relies on that column.
prm800k_test = pd.read_csv(filepath_or_buffer="dataset/prm800k-test.csv")
# A bare trailing expression lets the notebook render the (truncated) frame.
prm800k_test

Unnamed: 0,labeler,timestamp,generation,is_quality_control_question,is_initial_screening_question,question,label
0,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-30T23:23:32.118856,,False,False,{'problem': 'Three pencils and a jumbo eraser ...,"{'steps': [{'completions': [{'text': ""Let's ca..."
1,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-28T21:11:38.359626,,False,False,"{'problem': 'Steve says to Jon, ""I am thinking...","{'steps': [{'completions': [{'text': ""Hey, you..."
2,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-28T21:14:58.004130,,False,False,{'problem': 'Compute $58_9 - 18_9.$ Express yo...,{'steps': [{'completions': [{'text': 'I think ...
3,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-30T23:38:08.623029,,False,False,{'problem': 'What is $\\sqrt{53}$ in simplest ...,"{'steps': [{'completions': [{'text': ""Let's fi..."
4,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-31T01:42:39.680550,,False,False,{'problem': 'What is the sum of all of the mul...,{'steps': [{'completions': [{'text': 'I think ...
...,...,...,...,...,...,...,...
2863,423ec391-7def-48bd-821a-5e231333d6ce,2022-11-10T21:59:40.659723,6.0,False,False,{'problem': 'What is the integer value of $x$ ...,{'steps': [{'completions': [{'text': 'An arith...
2864,30f0b980-2587-4ec3-957a-0ca2bf958c40,2022-11-10T21:59:39.648690,6.0,False,False,{'problem': 'The binary number $10101001110_{2...,{'steps': [{'completions': [{'text': 'To conve...
2865,909003eb-d16e-49c8-82ce-7664180c66b8,2022-11-10T21:59:14.778080,6.0,False,False,{'problem': 'The binary number $10101001110_{2...,{'steps': [{'completions': [{'text': 'I know t...
2866,a2e09d3a-2da6-4094-89ca-e92125f6aefd,2022-11-10T21:58:33.614002,6.0,False,False,{'problem': 'Simplify $\\frac{(10r^3)(4r^6)}{8...,{'steps': [{'completions': [{'text': 'To simpl...


## PRM800K Fields

| Column                          | Data Type       | Description                                                                                                                                                              |
| ------------------------------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `labeler`                       | `str` (UUID)    | Unique identifier of the annotator or system that labeled the reasoning steps.                                                                                           |
| `timestamp`                     | `datetime`      | Timestamp indicating when the reasoning or labeling was performed. Useful for batch or temporal analysis.                                                                |
| `generation`                    | `float` or `null` | (Optional) Data-collection generation (round) number, e.g. `6.0` or `9.0` in the previews above; it is not a model ID. It is `null`/`NaN` for the earliest (July 2022) rows shown above. |
| `is_quality_control_question`   | `bool`          | `True` if the question is used for quality control purposes, e.g., testing annotator reliability.                                                                        |
| `is_initial_screening_question` | `bool`          | `True` if the question was used for screening labelers before they start actual annotation tasks.                                                                        |
| `question`                      | `dict`          | Contains the main question data, typically with keys:<br>• `problem`: the math problem in LaTeX or plain text<br>• `ground_truth_answer`: the correct answer as a string |
| `label`                         | `dict`          | Holds reasoning steps and associated labels. This includes a list of reasoning `steps`, each with several completions and possibly human feedback.                       |

The `label['steps']` field contains a list of reasoning steps. Each step follows this structure:
```
{
  "completions": [
    {
      "text": "Some reasoning text...",
      "rating": 1,
      "flagged": false
    },
    ...
  ],
  "human_completion": {
    "text": "Manual correction or insight",
    "source": "human",
    "corrected_rating": null,
    "flagged": false
  },
  "chosen_completion": 2
}
```
Fields Explained
- completions: Multiple reasoning candidates for a given step.
- text: The reasoning text generated by a model or human.
- rating: Quality label of the reasoning:
```
1: correct
0: neutral (uninformative or redundant)
-1: incorrect
```
- flagged: Boolean indicating problematic or inappropriate completions.
- chosen_completion: Index of the preferred or accepted completion for the step.
- human_completion (optional): Manual input by a human evaluator to correct or guide the reasoning.

## Feature Engineering for PRM800K

Because the deeply nested JSON in the `label` column (especially the `steps` field) is hard to inspect directly, I have added several new columns to make exploration and analysis of PRM800K easier.

| **Column**                         | **Description**                                                                                                                                     |
|-----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|
| **`total_steps`**                 | Total number of reasoning steps per question.                                                                                                       |
| **`total_steps_have_neg_1`**      | Total number of steps that include at least one completion rated **-1** (indicating incorrect or misleading reasoning).                           |
| **`steps_neg_1`**                 | List of step indices (e.g., `[1, 2, 3, 4]`) where at least one completion has a rating of **-1**.                                                  |
| **`total_neg_1_sequence_from_last_step`** | Number of consecutive steps (counting backward from the last step) that contain at least one **-1** rating. Useful for analyzing late degradation. |
| **`total_steps_have_zero`**       | Total number of steps that include at least one completion rated **0** (neutral or uninformative reasoning).                                       |
| **`steps_zero`**                  | List of step indices where at least one completion has a rating of **0**.                                                                          |
| **`total_steps_have_pos_1_or_human`**      | Total number of steps that include at least one completion rated **1** (good or helpful reasoning) or a human completion.                                              |
| **`steps_pos_1_or_human`**                 | List of step indices where at least one completion has a rating of **1** or a human completion is present.                                                             |
| **`total_steps_have_combination`**| Total number of steps that contain **all three types of ratings**: **-1**, **0**, and **1/human**. Helps identify steps with diverse or conflicting judgment. |
| **`finish_reason`**               | Original `finish_reason` field from the dataset, indicating how the reasoning process was concluded (e.g., `"solution"`, `"give_up"`).             |