# EDA PRM800K Dataset

In [None]:
# Setup and Imports
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as scp
# import plotly.express as px
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import polyfit

## Load Datasets

In [None]:
# Load datasets/prm800k-test.csv into a DataFrame and show it as the cell output.
prm800k_test = pd.read_csv("datasets/prm800k-test.csv")
prm800k_test

## PRM800K Fields

| Column                          | Data Type       | Description                                                                                                                                                              |
| ------------------------------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `labeler`                       | `str` (UUID)    | Unique identifier of the annotator or system that labeled the reasoning steps.                                                                                           |
| `timestamp`                     | `datetime`      | Timestamp indicating when the reasoning or labeling was performed. Useful for batch or temporal analysis.                                                                |
| `generation`                    | `str` or `null` | (Optional) May contain the ID of the generative model that produced the reasoning steps. Can be `null` for human-generated responses.                                    |
| `is_quality_control_question`   | `bool`          | `True` if the question is used for quality control purposes, e.g., testing annotator reliability.                                                                        |
| `is_initial_screening_question` | `bool`          | `True` if the question was used for screening labelers before they start actual annotation tasks.                                                                        |
| `question`                      | `dict`          | Contains the main question data, typically with keys:<br>• `problem`: the math problem in LaTeX or plain text<br>• `ground_truth_answer`: the correct answer as a string |
| `label`                         | `dict`          | Holds reasoning steps and associated labels. This includes a list of reasoning `steps`, each with several completions and possibly human feedback.                       |

The `label['steps']` field contains a list of reasoning steps. Each step has the following structure:
```
{
  "completions": [
    {
      "text": "Some reasoning text...",
      "rating": 1,
      "flagged": false
    },
    ...
  ],
  "human_completion": {
    "text": "Manual correction or insight",
    "source": "human",
    "corrected_rating": null,
    "flagged": false
  },
  "chosen_completion": 2
}
```
Fields Explained
- completions: Multiple reasoning candidates for a given step.
- text: The reasoning text generated by a model or human.
- rating: Quality label of the reasoning:
```
1: correct
0: redundant
-1: incorrect
```
- flagged: Boolean indicating problematic or inappropriate completions.
- chosen_completion: Index of the preferred or accepted completion for the step.
- human_completion (optional): Manual input by a human evaluator to correct or guide the reasoning.

## Feature Engineering for PRM800K

Because the deeply nested JSON in the `label` column (especially the `steps` field) is hard to inspect directly, I have added several new columns to make exploration and analysis of PRM800K easier.

| **Column**                         | **Description**                                                                                                                                     |
|-----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|
| **`total_steps`**                 | Total number of reasoning steps per question.                                                                                                       |
| **`total_steps_have_neg_1`**      | Total number of steps that include at least one completion rated **-1** (indicating incorrect or misleading reasoning).                           |
| **`steps_neg_1`**                 | List of step indices (e.g., `[1, 2, 3, 4]`) where at least one completion has a rating of **-1**.                                                  |
| **`total_neg_1_sequence_from_last_step`** | Number of consecutive steps (counting backward from the last step) that contain at least one **-1** rating. Useful for analyzing late degradation. |
| **`total_steps_have_zero`**       | Total number of steps that include at least one completion rated **0** (neutral or uninformative reasoning).                                       |
| **`steps_zero`**                  | List of step indices where at least one completion has a rating of **0**.                                                                          |
| **`total_steps_have_pos_1_or_human`**      | Total number of steps that include at least one completion rated **1** (good or helpful reasoning) or human completion.                                                |
| **`steps_pos_1_or_human`**                 | List of step indices where at least one completion has a rating of **1** or human completion.                                                                          |
| **`total_steps_have_combination`**| Total number of steps that contain **all three types of ratings**: **-1**, **0**, and **1/human**. Helps identify steps with diverse or conflicting judgment. |
| **`finish_reason`**               | Original `finish_reason` field from the dataset, indicating how the reasoning process was concluded (e.g., `"solution"`, `"give_up"`).             |

### String to Dict (Convert JSON)

In [None]:
import json
import ast

def string_to_dict(input_string):
    """Parse a serialized dictionary (JSON or Python-literal text).

    Parsers are tried from strictest to most permissive so that well-formed
    input is never rewritten before it is parsed:

    1. ``json.loads`` on the text as-is.
    2. ``ast.literal_eval`` on the text as-is (Python ``repr`` style).
    3. ``json.loads`` after a lossy normalization (single -> double quotes,
       ``True``/``False``/``None`` -> ``true``/``false``/``null``). This
       rewrite also touches those words inside string values, which is why
       it is only a fallback.
    4. ``eval`` for brace-delimited text, kept as a last resort.

    Args:
        input_string: Text to parse. A value that is already a ``dict`` is
            returned unchanged.

    Returns:
        The parsed object (a ``dict`` for dictionary-shaped input).

    Raises:
        ValueError: If none of the parsers accepts the input.
    """
    if isinstance(input_string, dict):
        return input_string

    input_string = input_string.strip()

    # 1. Strict JSON, no rewriting. json.JSONDecodeError subclasses ValueError.
    try:
        return json.loads(input_string)
    except ValueError:
        pass

    # 2. Python literal syntax (single quotes, True/False/None), no rewriting.
    try:
        return ast.literal_eval(input_string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # 3. Lossy normalization for text that mixes both notations.
    try:
        json_string = input_string.replace("'", '"')
        json_string = json_string.replace('True', 'true').replace('False', 'false').replace('None', 'null')
        return json.loads(json_string)
    except ValueError:
        pass

    # 4. NOTE(review): eval() executes arbitrary code embedded in the input.
    # Kept only for backward compatibility; remove it if the data source is
    # not fully trusted.
    if input_string.startswith('{') and input_string.endswith('}'):
        try:
            return eval(input_string)
        except Exception:
            pass

    raise ValueError(f"Cannot parse string as dictionary: {input_string[:100]}...")

### Check Rating Completion

In [None]:
def get_step_info(step):
    """Count the completion ratings of a single reasoning step.

    Args:
        step: Mapping with a ``completions`` list whose items each carry a
            ``rating`` key, and optionally a ``human_completion`` entry.

    Returns:
        dict with the keys ``"-1"``, ``"0"`` and ``"1"`` (number of
        completions with that rating; any other rating, e.g. ``None``, is
        not counted) and ``"human_completion"`` (the step's value, or
        ``None`` when the key is absent).

    Raises:
        KeyError: If ``completions`` is missing or a completion has no
            ``rating`` key.
    """
    ratings = [completion['rating'] for completion in step['completions']]

    return {
        "-1": sum(rating == -1 for rating in ratings),
        "0": sum(rating == 0 for rating in ratings),
        "1": sum(rating == 1 for rating in ratings),
        # The field is optional, so a missing key is treated like an explicit null.
        "human_completion": step.get('human_completion'),
    }

### Rating Breakdown

In [None]:
# Input:
"""
[
  {"-1": 1, "0": 1, "1": 1, "human_completion": null},
  {"-1": 1, "0": 1, "1": 1, "human_completion": null},
  {"-1": 1, "0": 0, "1": 0, "human_completion": {
      "text": "No, it shouldn't. The leading term of the product is the product of the leading terms of the factors, and the degree is the sum of the degrees.",
      "rating": null,
      "source": "human",
      "flagged": false,
      "corrected_rating": null
  }},
  {"-1": 0, "0": 0, "1": 1, "human_completion": null},
  {"-1": 1, "0": 0, "1": 1, "human_completion": null},
  {"-1": 0, "0": 0, "1": 1, "human_completion": null},
  {"-1": 1, "0": 1, "1": 1, "human_completion": null}
]
"""

def rating_breakdown(rating_infos):
    """Summarize per-step rating counts into step-level features.

    Args:
        rating_infos: Sequence of dicts with the keys ``"-1"``, ``"0"``,
            ``"1"`` (counts) and ``"human_completion"``, one per step.

    Returns:
        dict with, in this order: the number and the 1-based positions of
        steps having at least one -1 rating, the length of the unbroken run
        of such steps ending at the last step, the number and positions of
        steps having a 0 rating, the number and positions of steps having a
        1 rating or a non-null human completion, and the number of steps
        where all three kinds occur together.
    """
    neg_positions, zero_positions, pos_positions = [], [], []
    mixed_step_count = 0

    for position, info in enumerate(rating_infos, start=1):
        has_neg = info["-1"] > 0
        has_zero = info["0"] > 0
        has_pos = info["1"] > 0 or info["human_completion"] is not None

        if has_neg:
            neg_positions.append(position)
        if has_zero:
            zero_positions.append(position)
        if has_pos:
            pos_positions.append(position)
        if has_neg and has_zero and has_pos:
            mixed_step_count += 1

    # Walk the -1 positions from the back; the run lasts for as long as they
    # match the countdown that starts at the final step.
    trailing_neg_run = 0
    expected_position = len(rating_infos)
    for position in reversed(neg_positions):
        if position != expected_position:
            break
        trailing_neg_run += 1
        expected_position -= 1

    return {
        "total_steps_have_neg_1": len(neg_positions),
        "steps_neg_1": neg_positions,
        "total_neg_1_sequence_from_last_step": trailing_neg_run,
        "total_steps_have_zero": len(zero_positions),
        "steps_zero": zero_positions,
        "total_steps_have_pos_1_or_human": len(pos_positions),
        "steps_pos_1_or_human": pos_positions,
        "total_steps_have_combination": mixed_step_count,
    }

### Process PRM800K Data

In [None]:
def feature_engineering(prm800k_train):
    """Append step-level rating features to every row of a PRM800K frame.

    Each row's ``label`` string is parsed, every entry of its ``steps`` list
    is summarized, and the summaries are folded into new columns next to the
    original ones.

    Args:
        prm800k_train: DataFrame with a ``label`` column holding the
            serialized label dictionary.

    Returns:
        A new DataFrame containing the original columns plus
        ``total_steps``, the ``rating_breakdown`` columns and
        ``finish_reason``. Rows that raise while being processed are
        reported with ``print`` and left out.
    """
    enriched_rows = []

    for index, row in prm800k_train.iterrows():
        try:
            label = string_to_dict(row['label'])
            finish_reason = label['finish_reason']

            # One rating summary per reasoning step, in step order.
            rating_infos = [get_step_info(step) for step in label['steps']]
            breakdown = rating_breakdown(rating_infos)

            enriched_rows.append({
                **row.to_dict(),
                "total_steps": len(rating_infos),
                **breakdown,
                "finish_reason": finish_reason,
            })
        except Exception as e:
            print(f"Error processing row {index}: {e}")

    return pd.DataFrame(enriched_rows)

### Run Feature Engineering

In [None]:
# Build the engineered features for the loaded frame, order rows so those with
# the most all-three-ratings steps come first, and write the result to CSV
# without the index column.
df_feature_engineered = feature_engineering(prm800k_test)
df_feature_engineered = df_feature_engineered.sort_values('total_steps_have_combination', ascending=False)
df_feature_engineered.to_csv('datasets/prm800k-test-feature-engineered.csv', index=False)

In [None]:
# Show the feature-engineered frame as the cell output.
df_feature_engineered

## EDA Functions

### 1. Get Total Steps

In [None]:
def get_total_steps_sum(df):
    """Return the sum of the ``total_steps`` column of ``df``."""
    step_counts = df['total_steps']
    return step_counts.sum()

### 2. Get Finish Reason Counts

In [None]:
def get_finish_reason_counts(df):
    """Return a ``{finish_reason: row count}`` dict for ``df``."""
    reason_frequencies = df['finish_reason'].value_counts()
    return reason_frequencies.to_dict()

### 3. Filter Only "solution" Rows (Get Valid Data)

In [None]:
def filter_solution_finish_reason(df):
    """Return the rows of ``df`` whose ``finish_reason`` equals ``'solution'``."""
    reached_solution = df['finish_reason'] == 'solution'
    return df[reached_solution]

### 4. Filter Rows Where Positive Exist on All Steps (Get Valid Data)

In [None]:
def filter_all_steps_exist_pos_or_human(df):
    """Return the rows where ``total_steps`` equals ``total_steps_have_pos_1_or_human``."""
    every_step_covered = df['total_steps'] == df['total_steps_have_pos_1_or_human']
    return df[every_step_covered]

### 5. Total Steps Distribution

In [None]:
import matplotlib.pyplot as plt

def plot_total_steps_distribution(df):
    """Show a bar chart of how many rows have each ``total_steps`` value."""
    rows_per_step_count = df['total_steps'].value_counts().sort_index()
    observed_step_counts = rows_per_step_count.index

    plt.figure(figsize=(10, 6))
    plt.bar(
        observed_step_counts,
        rows_per_step_count.values,
        color="#4C72B0",
        edgecolor="black",
    )

    # One tick per observed value, plus horizontal guide lines.
    plt.xticks(observed_step_counts)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.xlabel("Total Steps", fontsize=12)
    plt.ylabel("Number of Rows", fontsize=12)
    plt.title("Distribution of total_steps", fontsize=14)

    plt.tight_layout()
    plt.show()

### 6. Negative Steps Position Distribution

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import ast

def plot_negative_steps_tertile_distribution(df):
    """Plot how often -1-rated steps fall into each third of their solution.

    Every step index listed in a row's ``steps_neg_1`` is assigned to a
    tertile by its relative position ``step / total_steps``: T1 covers
    ``(0, 1/3]``, T2 covers ``(1/3, 2/3]`` and T3 covers ``(2/3, 1]``.
    The three totals are shown as a bar chart.

    Args:
        df: DataFrame with a ``steps_neg_1`` column (a list of step indices,
            or the string form of such a list) and a ``total_steps`` column.
            The last step is expected to sit at 100%, i.e. indices are
            1-based.

    Rows whose ``steps_neg_1`` is not a list, or whose ``total_steps`` is 0,
    are skipped; a row that raises is reported with ``print`` and skipped.
    """
    tertile_counter = Counter({"T1": 0, "T2": 0, "T3": 0})

    for _, row in df.iterrows():
        try:
            steps = row['steps_neg_1']
            total_steps = row['total_steps']

            # Lists read back from CSV arrive as their string representation.
            if isinstance(steps, str):
                steps = ast.literal_eval(steps)
            if not isinstance(steps, list) or total_steps == 0:
                continue

            for step in steps:
                # Compare against exact thirds with integer arithmetic. The
                # former "<= 33" / "<= 66" percent cut-offs pushed a step lying
                # exactly on a boundary (e.g. step 1 of 3 = 33.3%) into the
                # next tertile.
                if 3 * step <= total_steps:
                    tertile_counter["T1"] += 1
                elif 3 * step <= 2 * total_steps:
                    tertile_counter["T2"] += 1
                else:
                    tertile_counter["T3"] += 1

        except Exception as e:
            print(f"Error processing row: {e}")
            continue

    # Prepare data for plotting
    tertile_labels = ["T1 (0–33%)", "T2 (34–66%)", "T3 (67–100%)"]
    counts = [tertile_counter["T1"], tertile_counter["T2"], tertile_counter["T3"]]

    # Plot
    plt.figure(figsize=(8, 5))
    plt.bar(tertile_labels, counts, color=["#1f77b4", "#ff7f0e", "#2ca02c"], edgecolor="black")
    plt.title("Tertile Distribution of Negative (-1) Steps", fontsize=14)
    plt.xlabel("Tertile Step Position", fontsize=12)
    plt.ylabel("Number of Times Marked -1", fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


### 7. Zero Steps Position Distribution

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import ast

def plot_zero_steps_tertile_distribution(df):
    """Plot how often 0-rated steps fall into each third of their solution.

    Every step index listed in a row's ``steps_zero`` is assigned to a
    tertile by its relative position ``step / total_steps``: T1 covers
    ``(0, 1/3]``, T2 covers ``(1/3, 2/3]`` and T3 covers ``(2/3, 1]``.
    The three totals are shown as a bar chart.

    Args:
        df: DataFrame with a ``steps_zero`` column (a list of step indices,
            or the string form of such a list) and a ``total_steps`` column.
            The last step is expected to sit at 100%, i.e. indices are
            1-based.

    Rows whose ``steps_zero`` is not a list, or whose ``total_steps`` is 0,
    are skipped; a row that raises is reported with ``print`` and skipped.
    """
    tertile_counter = Counter({"T1": 0, "T2": 0, "T3": 0})

    for _, row in df.iterrows():
        try:
            steps = row['steps_zero']
            total_steps = row['total_steps']

            # Lists read back from CSV arrive as their string representation.
            if isinstance(steps, str):
                steps = ast.literal_eval(steps)
            if not isinstance(steps, list) or total_steps == 0:
                continue

            for step in steps:
                # Compare against exact thirds with integer arithmetic. The
                # former "<= 33" / "<= 66" percent cut-offs pushed a step lying
                # exactly on a boundary (e.g. step 1 of 3 = 33.3%) into the
                # next tertile.
                if 3 * step <= total_steps:
                    tertile_counter["T1"] += 1
                elif 3 * step <= 2 * total_steps:
                    tertile_counter["T2"] += 1
                else:
                    tertile_counter["T3"] += 1

        except Exception as e:
            print(f"Error processing row: {e}")
            continue

    # Prepare data for plotting
    tertile_labels = ["T1 (0–33%)", "T2 (34–66%)", "T3 (67–100%)"]
    counts = [tertile_counter["T1"], tertile_counter["T2"], tertile_counter["T3"]]

    # Plot
    plt.figure(figsize=(8, 5))
    plt.bar(tertile_labels, counts, color=["#1f77b4", "#ff7f0e", "#2ca02c"], edgecolor="black")
    plt.title("Tertile Distribution of Zero (0) Steps", fontsize=14)
    plt.xlabel("Tertile Step Position", fontsize=12)
    plt.ylabel("Number of Times Marked 0", fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


## EDA PRM800K Test Feature Engineered

In [None]:
# Load the feature-engineered CSV from disk.
# NOTE(review): the chained assignment also binds the same DataFrame to
# `prm800k_train`, although the file name says this is the test data. It looks
# like a leftover; confirm nothing relies on that name before removing it.
df_prm800k_test_feature_engineered = prm800k_train = pd.read_csv("datasets/prm800k-test-feature-engineered.csv")

### 1. Get Total Steps

In [None]:
# Sum of total_steps over all loaded rows, before any filtering.
get_total_steps_sum(df_prm800k_test_feature_engineered)

### 2. Get Finish Reason Counts

In [None]:
# Number of rows per finish_reason value, before any filtering.
get_finish_reason_counts(df_prm800k_test_feature_engineered)

### 3. Filter Only "solution" Rows (Get Valid Data)

In [None]:
# Keep only the rows whose finish_reason is 'solution' (rebinds the working
# frame), then display the result.
df_prm800k_test_feature_engineered = filter_solution_finish_reason(df_prm800k_test_feature_engineered)
df_prm800k_test_feature_engineered

In [None]:
# Sum of total_steps after the 'solution' filter.
get_total_steps_sum(df_prm800k_test_feature_engineered)

### 4. Filter Rows Where Positive Exist on All Steps (Get Valid Data)

In [None]:
# Keep only the rows where total_steps equals total_steps_have_pos_1_or_human
# (rebinds the working frame), then display the result.
df_prm800k_test_feature_engineered = filter_all_steps_exist_pos_or_human(df_prm800k_test_feature_engineered)
df_prm800k_test_feature_engineered

In [None]:
# Sum of total_steps after both filters have been applied.
get_total_steps_sum(df_prm800k_test_feature_engineered)

### 5. Total Steps Distribution

In [None]:
# Bar chart of total_steps over the filtered rows.
plot_total_steps_distribution(df_prm800k_test_feature_engineered)

### 6. Negative Steps Position Distribution

In [None]:
# Tertile positions of the steps listed in steps_neg_1, over the filtered rows.
plot_negative_steps_tertile_distribution(df_prm800k_test_feature_engineered)

### 7. Zero Steps Position Distribution

In [None]:
# Tertile positions of the steps listed in steps_zero, over the filtered rows.
plot_zero_steps_tertile_distribution(df_prm800k_test_feature_engineered)

## Save Valid Data

In [None]:
# Write the filtered frame to CSV without the index column.
df_prm800k_test_feature_engineered.to_csv('datasets/prm800k-test-valid-data.csv', index=False)