# EDA Sample Dataset

In [None]:
# Setup and Imports
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as scp
# import plotly.express as px
import matplotlib.pyplot as plt
from numpy.polynomial.polynomial import polyfit

## Load Datasets

In [None]:
# Read the sample CSV into a DataFrame; the cells below take it as input.
sample = pd.read_csv("datasets/sample/prm800k-03-algo3-clean.csv")
# Bare expression as the last line of the cell so the notebook displays the DataFrame.
sample

## Sample Fields

| Column                | Description                                |
| --------------------- | ------------------------------------------ |
| `labeler`             | Unique ID of the annotator                 |
| `timestamp`           | Annotation timestamp                       |
| `problem`             | Problem statement (usually mathematical)   |
| `ground_truth_answer` | Correct answer in LaTeX or vector notation |
| `total_steps`         | Total number of reasoning steps            |
| `steps`               | List of reasoning steps with ratings       |
| `neg_1`               | Count of steps with rating -1              |
| `zero`                | Count of steps with rating 0               |
| `pos_1`               | Count of steps with rating +1              |

The `steps` field contains a list of reasoning steps. Each step follows this structure:
```
[
  {
    "text": "Some reasoning text...",
    "rating": 1,
    "flagged": false
  },
  ...
]
```
Fields Explained
- text: The reasoning text generated by a model or human.
- rating: Quality label of the reasoning:
```
1: correct
0: redundant
-1: incorrect
```

## EDA

### 1. Total Step Count per Rating

In [None]:
def get_total_steps_sum_each_rating(df):
    """Print a table of step counts and percentage shares for each rating.

    Args:
        df: DataFrame with numeric ``total_steps``, ``neg_1``, ``zero`` and
            ``pos_1`` columns; each column is summed over all rows.

    Returns:
        None. The table is written to standard output.
    """
    grand_total = int(df['total_steps'].sum())

    # (display label, source column) pairs, in the order they are reported.
    rating_columns = (('-1', 'neg_1'), ('0', 'zero'), ('+1', 'pos_1'))
    # Sum every column before printing anything so a missing column fails
    # before any partial output is produced.
    rating_totals = [
        (label, int(df[column].sum())) for label, column in rating_columns
    ]

    def share(count):
        # Percentage of all steps, rounded to 2 places; 0.0 when there are
        # no steps at all, which avoids dividing by zero.
        if not grand_total:
            return 0.0
        return round((count / grand_total) * 100, 2)

    rule = "=" * 40
    report = [f"{'Rating':<12} {'Total':>10} {'Percentage':>15}", rule]
    report.extend(
        f"{label:<12} {count:>10} {share(count):>14.2f}%"
        for label, count in rating_totals
    )
    report.append(rule)
    report.append(f"{'Total Steps':<12} {grand_total:>10}")

    for line in report:
        print(line)

In [None]:
# Print the per-rating totals table for the loaded sample.
get_total_steps_sum_each_rating(sample)

### 2. Total Steps Distribution

In [None]:
import matplotlib.pyplot as plt

def plot_total_steps_distribution(df):
    """Show a bar chart of how many rows have each ``total_steps`` value.

    Args:
        df: DataFrame with a ``total_steps`` column.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Rows per distinct total_steps value, ordered by the step count itself.
    rows_per_step_count = df['total_steps'].value_counts().sort_index()
    step_counts = rows_per_step_count.index

    plt.figure(figsize=(10, 6))
    plt.bar(
        step_counts,
        rows_per_step_count.values,
        color="#4C72B0",
        edgecolor="black",
    )

    # One tick per observed step count; dashed horizontal guides only.
    plt.xticks(step_counts)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.title("Distribution of total_steps", fontsize=14)
    plt.xlabel("Total Steps", fontsize=12)
    plt.ylabel("Number of Rows", fontsize=12)

    plt.tight_layout()
    plt.show()

In [None]:
# Draw the total_steps bar chart for the loaded sample.
plot_total_steps_distribution(sample)

### 3. Negative Steps Position Distribution

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import ast

def _count_negative_step_tertiles(df):
    """Tally steps rated -1 by the third of the solution they occur in.

    A step at 1-based position ``p`` in a solution of ``n`` steps is counted
    in T1 when ``p / n <= 1/3``, in T2 when ``p / n <= 2/3`` and in T3
    otherwise.

    Args:
        df: DataFrame whose ``steps`` column holds the string form of a list
            of step dicts (each with a ``rating`` key) and whose
            ``total_steps`` column holds the number of steps in that row.

    Returns:
        Counter with the keys ``"T1"``, ``"T2"`` and ``"T3"``.
    """
    tertile_counter = Counter({"T1": 0, "T2": 0, "T3": 0})

    for _, row in df.iterrows():
        try:
            steps_list = ast.literal_eval(row['steps'])
            # 1-based positions of the steps rated -1.
            positions = [
                position
                for position, step in enumerate(steps_list, start=1)
                if step['rating'] == -1
            ]
            total_steps = row['total_steps']
        except (ValueError, SyntaxError, TypeError, KeyError) as e:
            # Best effort: report the malformed row and keep going.
            print(f"Error processing row: {e}")
            continue

        if total_steps == 0:
            continue

        for position in positions:
            # Compare against exact thirds using multiplication only. Fixed
            # 33/66 percent cutoffs would push a step at exactly one third
            # (e.g. step 1 of 3 = 33.33%) into the next tertile.
            if 3 * position <= total_steps:
                tertile_counter["T1"] += 1
            elif 3 * position <= 2 * total_steps:
                tertile_counter["T2"] += 1
            else:
                tertile_counter["T3"] += 1

    return tertile_counter


def plot_negative_steps_tertile_distribution(df):
    """Show a bar chart of where steps rated -1 occur within their solutions.

    Each -1 step is placed in the first, middle or last third of its
    solution according to its position relative to ``total_steps``. Rows
    whose ``steps`` value cannot be parsed are reported on standard output
    and skipped.

    Args:
        df: DataFrame with a ``steps`` column (string form of a list of step
            dicts with a ``rating`` key) and a ``total_steps`` column.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    tertile_counter = _count_negative_step_tertiles(df)

    # Prepare data for plotting
    tertile_labels = ["T1 (0–33%)", "T2 (34–66%)", "T3 (67–100%)"]
    counts = [tertile_counter["T1"], tertile_counter["T2"], tertile_counter["T3"]]

    # Plot
    plt.figure(figsize=(8, 5))
    plt.bar(tertile_labels, counts, color=["#1f77b4", "#ff7f0e", "#2ca02c"], edgecolor="black")
    plt.title("Tertile Distribution of Negative (-1) Steps", fontsize=14)
    plt.xlabel("Tertile Step Position", fontsize=12)
    plt.ylabel("Number of Times Marked -1", fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


In [None]:
# Draw the tertile chart of -1-rated steps for the loaded sample.
plot_negative_steps_tertile_distribution(sample)

### 4. Zero Steps Position Distribution

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import ast

def _count_zero_step_tertiles(df):
    """Tally steps rated 0 by the third of the solution they occur in.

    A step at 1-based position ``p`` in a solution of ``n`` steps is counted
    in T1 when ``p / n <= 1/3``, in T2 when ``p / n <= 2/3`` and in T3
    otherwise.

    Args:
        df: DataFrame whose ``steps`` column holds the string form of a list
            of step dicts (each with a ``rating`` key) and whose
            ``total_steps`` column holds the number of steps in that row.

    Returns:
        Counter with the keys ``"T1"``, ``"T2"`` and ``"T3"``.
    """
    tertile_counter = Counter({"T1": 0, "T2": 0, "T3": 0})

    for _, row in df.iterrows():
        try:
            steps_list = ast.literal_eval(row['steps'])
            # 1-based positions of the steps rated 0.
            positions = [
                position
                for position, step in enumerate(steps_list, start=1)
                if step['rating'] == 0
            ]
            total_steps = row['total_steps']
        except (ValueError, SyntaxError, TypeError, KeyError) as e:
            # Best effort: report the malformed row and keep going.
            print(f"Error processing row: {e}")
            continue

        if total_steps == 0:
            continue

        for position in positions:
            # Compare against exact thirds using multiplication only. Fixed
            # 33/66 percent cutoffs would push a step at exactly one third
            # (e.g. step 1 of 3 = 33.33%) into the next tertile.
            if 3 * position <= total_steps:
                tertile_counter["T1"] += 1
            elif 3 * position <= 2 * total_steps:
                tertile_counter["T2"] += 1
            else:
                tertile_counter["T3"] += 1

    return tertile_counter


def plot_zero_steps_tertile_distribution(df):
    """Show a bar chart of where steps rated 0 occur within their solutions.

    Each 0-rated step is placed in the first, middle or last third of its
    solution according to its position relative to ``total_steps``. Rows
    whose ``steps`` value cannot be parsed are reported on standard output
    and skipped.

    Args:
        df: DataFrame with a ``steps`` column (string form of a list of step
            dicts with a ``rating`` key) and a ``total_steps`` column.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    tertile_counter = _count_zero_step_tertiles(df)

    # Prepare data for plotting
    tertile_labels = ["T1 (0–33%)", "T2 (34–66%)", "T3 (67–100%)"]
    counts = [tertile_counter["T1"], tertile_counter["T2"], tertile_counter["T3"]]

    # Plot
    plt.figure(figsize=(8, 5))
    plt.bar(tertile_labels, counts, color=["#1f77b4", "#ff7f0e", "#2ca02c"], edgecolor="black")
    plt.title("Tertile Distribution of Zero (0) Steps", fontsize=14)
    plt.xlabel("Tertile Step Position", fontsize=12)
    plt.ylabel("Number of Times Marked 0", fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


In [None]:
# Draw the tertile chart of 0-rated steps for the loaded sample.
plot_zero_steps_tertile_distribution(sample)

### 5. Positive Steps Position Distribution

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import ast

def _count_positive_step_tertiles(df):
    """Tally steps rated 1 by the third of the solution they occur in.

    A step at 1-based position ``p`` in a solution of ``n`` steps is counted
    in T1 when ``p / n <= 1/3``, in T2 when ``p / n <= 2/3`` and in T3
    otherwise.

    Args:
        df: DataFrame whose ``steps`` column holds the string form of a list
            of step dicts (each with a ``rating`` key) and whose
            ``total_steps`` column holds the number of steps in that row.

    Returns:
        Counter with the keys ``"T1"``, ``"T2"`` and ``"T3"``.
    """
    tertile_counter = Counter({"T1": 0, "T2": 0, "T3": 0})

    for _, row in df.iterrows():
        try:
            steps_list = ast.literal_eval(row['steps'])
            # 1-based positions of the steps rated 1.
            positions = [
                position
                for position, step in enumerate(steps_list, start=1)
                if step['rating'] == 1
            ]
            total_steps = row['total_steps']
        except (ValueError, SyntaxError, TypeError, KeyError) as e:
            # Best effort: report the malformed row and keep going.
            print(f"Error processing row: {e}")
            continue

        if total_steps == 0:
            continue

        for position in positions:
            # Compare against exact thirds using multiplication only. Fixed
            # 33/66 percent cutoffs would push a step at exactly one third
            # (e.g. step 1 of 3 = 33.33%) into the next tertile.
            if 3 * position <= total_steps:
                tertile_counter["T1"] += 1
            elif 3 * position <= 2 * total_steps:
                tertile_counter["T2"] += 1
            else:
                tertile_counter["T3"] += 1

    return tertile_counter


def plot_positive_steps_tertile_distribution(df):
    """Show a bar chart of where steps rated 1 occur within their solutions.

    Each 1-rated step is placed in the first, middle or last third of its
    solution according to its position relative to ``total_steps``. Rows
    whose ``steps`` value cannot be parsed are reported on standard output
    and skipped.

    Args:
        df: DataFrame with a ``steps`` column (string form of a list of step
            dicts with a ``rating`` key) and a ``total_steps`` column.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    tertile_counter = _count_positive_step_tertiles(df)

    # Prepare data for plotting
    tertile_labels = ["T1 (0–33%)", "T2 (34–66%)", "T3 (67–100%)"]
    counts = [tertile_counter["T1"], tertile_counter["T2"], tertile_counter["T3"]]

    # Plot
    plt.figure(figsize=(8, 5))
    plt.bar(tertile_labels, counts, color=["#1f77b4", "#ff7f0e", "#2ca02c"], edgecolor="black")
    plt.title("Tertile Distribution of Positive (1) Steps", fontsize=14)
    plt.xlabel("Tertile Step Position", fontsize=12)
    plt.ylabel("Number of Times Marked 1", fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


In [None]:
# Draw the tertile chart of 1-rated steps for the loaded sample.
plot_positive_steps_tertile_distribution(sample)