# Build Sample Dataset

## Setup

In [None]:
# Setup and Imports
import json
import pandas as pd

## Load Datasets

In [None]:
# Load the PRM800K CSV. Later cells read its `question`, `label`, `labeler`,
# `timestamp` and `total_steps` columns.
prm800k = pd.read_csv("datasets/prm800k-test-valid-data.csv")
# Display the raw frame for a quick visual check.
prm800k

## Build Sample Dataset

### Functions

In [None]:
import json
import ast

def string_to_dict(input_string):
    """Parse a serialized dictionary (JSON or Python-literal text) into an object.

    Parsers are tried from the most faithful to the most permissive:

    1. ``json.loads`` on the text as-is (valid JSON).
    2. ``ast.literal_eval`` on the text as-is (Python ``repr`` output such as
       ``{'a': True, 'b': None}``).
    3. ``json.loads`` after a textual rewrite of single quotes and of
       ``True``/``False``/``None``. The rewrite also alters those tokens when
       they occur inside string values, so it is only used as a fallback.
    4. ``eval`` for brace-delimited text that nothing else could read.

    Args:
        input_string: Text to parse. A value that is already a ``dict`` is
            returned unchanged.

    Returns:
        The parsed object (normally a ``dict``).

    Raises:
        ValueError: If none of the strategies can parse the text.
    """
    if isinstance(input_string, dict):
        return input_string

    input_string = input_string.strip()

    # 1. Strict JSON: no rewriting, so string contents are preserved exactly.
    try:
        return json.loads(input_string)
    except (json.JSONDecodeError, ValueError):
        pass

    # 2. Python literal syntax: handles single quotes, True/False/None and
    #    apostrophes inside double-quoted strings without touching the text.
    try:
        return ast.literal_eval(input_string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # 3. Lossy fallback: blindly rewrite quotes and constants, then parse as JSON.
    #    This can change string values (e.g. "None of ..." -> "null of ..."),
    #    which is why it runs only after the faithful parsers have failed.
    try:
        json_string = input_string.replace("'", '"')
        json_string = json_string.replace('True', 'true').replace('False', 'false').replace('None', 'null')
        return json.loads(json_string)
    except (json.JSONDecodeError, ValueError):
        pass

    # 4. Last resort.
    # NOTE(security): eval executes arbitrary code, so this is only safe when
    # the input comes from a trusted source. Kept for backward compatibility.
    if input_string.startswith('{') and input_string.endswith('}'):
        try:
            return eval(input_string)
        except Exception:
            # Fall through to the explicit ValueError below.
            pass

    raise ValueError(f"Cannot parse string as dictionary: {input_string[:100]}...")

In [None]:
def get_ground_truth_steps(steps):
    """Collect one reference text per step.

    For each step the human-written completion is used when present;
    otherwise the first completion rated ``1`` is used. A step that has
    neither contributes nothing, so the result may be shorter than ``steps``.

    Args:
        steps: A list of step dicts (keys read here: ``human_completion``,
            ``completions``), or the Python-literal string form of such a list.

    Returns:
        list: The selected ``text`` values, in step order.
    """
    steps_list = ast.literal_eval(steps) if isinstance(steps, str) else steps

    ground_truth_steps = []
    for step in steps_list:
        human_completion = step['human_completion']
        if human_completion is not None:
            ground_truth_steps.append(human_completion['text'])
            continue

        # A null `completions` value is treated as "no candidates" rather than
        # raising TypeError when iterated.
        for completion in step['completions'] or []:
            if completion['rating'] == 1:
                ground_truth_steps.append(completion['text'])
                break  # only the first +1 completion is taken
    return ground_truth_steps

In [None]:
def process_data(df, function):
    """Turn every parseable row of ``df`` into one flat sample record.

    For each row the ``question`` and ``label`` cells are parsed with
    ``string_to_dict``; the label's ``steps`` are handed to ``function``,
    whose returned dict is merged into the record. Rows that raise any
    exception are reported with ``print`` and left out.

    Args:
        df: Frame providing ``question``, ``label``, ``labeler``,
            ``timestamp`` and ``total_steps`` columns.
        function: Callable taking the step list and returning a dict of
            extra columns.

    Returns:
        pandas.DataFrame: One row per successfully processed input row, with
        ``id`` numbered from 1 over the successful rows only.
    """
    records = []
    for index, row in df.iterrows():
        try:
            parsed_question = string_to_dict(row['question'])
            parsed_label = string_to_dict(row['label'])
            step_list = parsed_label['steps']
            extra_columns = function(step_list)

            records.append({
                # Failed rows never reach this point, so ids stay gap-free.
                "id": len(records) + 1,
                "labeler": row["labeler"],
                "timestamp": row["timestamp"],
                "problem": parsed_question["problem"],
                "ground_truth_answer": parsed_question["ground_truth_answer"],
                "total_steps": row["total_steps"],
                "ground_truth_steps": get_ground_truth_steps(step_list),
                **extra_columns,
            })
        except Exception as e:
            print(f"Error processing row {index}: {e}")

    return pd.DataFrame(records)

In [None]:
def get_steps_info(steps):
    """Count how many steps carry each of the ratings -1, 0 and 1.

    Args:
        steps: Iterable of dicts, each with a ``rating`` key.

    Returns:
        dict: Counts under the keys ``neg_1``, ``zero`` and ``pos_1``. Steps
        whose rating equals none of the three values are not counted.
    """
    tally = {-1: 0, 0: 0, 1: 0}

    for entry in steps:
        rating = entry["rating"]
        # Check the buckets in the fixed order -1, 0, 1 and stop at the first hit.
        for bucket in tally:
            if rating == bucket:
                tally[bucket] += 1
                break

    return {"neg_1": tally[-1], "zero": tally[0], "pos_1": tally[1]}

In [None]:
def get_total_steps_sum_each_rating(df):
    """Sum the per-sample rating counts over the whole frame.

    Args:
        df: Frame with ``neg_1``, ``zero`` and ``pos_1`` columns.

    Returns:
        dict: Column sums under ``total_neg_1``, ``total_zero`` and
        ``total_pos_1``.
    """
    source_columns = {
        "total_neg_1": "neg_1",
        "total_zero": "zero",
        "total_pos_1": "pos_1",
    }
    return {total_key: df[column].sum() for total_key, column in source_columns.items()}

In [None]:
def drop_rows_where_pos1_equals_total_steps(df):
    """Return a copy of ``df`` without the rows where ``pos_1 == total_steps``.

    Args:
        df: Frame with ``pos_1`` and ``total_steps`` columns.

    Returns:
        pandas.DataFrame: Independent copy holding only the rows whose
        ``pos_1`` differs from ``total_steps``; the original index is kept.
    """
    keep_mask = df["pos_1"].ne(df["total_steps"])
    kept_rows = df.loc[keep_mask]
    return kept_rows.copy()

### [ALGO 1] Take the First Completion (Index 0) of Each Step

In [None]:
def merge_completions_to_idx_0(steps):
    """Reduce every step to its first completion and tally the ratings.

    Args:
        steps: List of step dicts, each with a non-empty ``completions`` list.

    Returns:
        dict: ``steps`` (the first completion of every step, in order) plus
        the ``neg_1``/``zero``/``pos_1`` counts from ``get_steps_info``.
    """
    # ALGO: the completion at index 0 represents the whole step.
    first_completions = [entry['completions'][0] for entry in steps]

    return {
        "steps": first_completions,
        **get_steps_info(first_completions),
    }

In [None]:
# Build the ALGO 1 sample (first completion of every step) and write it to CSV.
df_option_1 = process_data(prm800k, merge_completions_to_idx_0)
df_option_1.to_csv("datasets/sample/prm800k-01-first-completion.csv", index=False)

In [None]:
# Inspect the ALGO 1 sample.
df_option_1

In [None]:
# Sum the neg_1 / zero / pos_1 columns over all ALGO 1 samples.
get_total_steps_sum_each_rating(df_option_1)

Result: Still not good. The total number of +1 steps is too high, and many samples contain only +1 steps with no -1 or 0 steps.

### [ALGO 2] Realistic Condition - Early Steps (First Third) Use +1, Then Random; Once a -1 Is Found, Prefer -1 Afterwards

In [None]:
import random

def algo_2(steps):
    """Pick one completion per step: forced +1 early, then random, then -1.

    Selection rules, applied in step order:

    * Early steps (1-based position <= ``len(steps) / 3``): use the first
      completion if it is rated 1, otherwise the first completion rated 1,
      otherwise a copy of the human completion with its rating set to 1.
    * Later steps, until a -1 has been picked: a uniformly random completion.
    * After a random pick rated -1: the first completion rated -1, falling
      back to the step's first completion when there is none.

    The input ``steps`` structure is never modified.

    Args:
        steps: List of step dicts with a non-empty ``completions`` list and a
            ``human_completion`` entry (dict or ``None``).

    Returns:
        dict: ``steps`` (the chosen completions) plus the
        ``neg_1``/``zero``/``pos_1`` counts from ``get_steps_info``.

    Raises:
        ValueError: If an early step has no completion rated 1 and no human
            completion to fall back on.
    """
    new_steps = []
    early_limit = len(steps) / 3
    error_found = False  # becomes True once a randomly picked step is rated -1

    for position, step in enumerate(steps, start=1):
        completions = step['completions']
        first_completion = completions[0]

        if position <= early_limit:
            # Early phase: the selected step must be rated +1.
            chosen = first_completion
            if chosen["rating"] != 1:
                chosen = next((c for c in completions if c["rating"] == 1), None)
                if chosen is None:
                    human_completion = step["human_completion"]
                    if human_completion is None:
                        raise ValueError(
                            f"Early step {position} has no +1 completion and no human completion"
                        )
                    # Copy before overriding the rating so the parsed input
                    # data is not modified as a side effect.
                    chosen = {**human_completion, "rating": 1}
            new_steps.append(chosen)
            continue

        if error_found:
            # Try to pick a -1 completion; if none exists, fall back to the
            # default (first completion).
            chosen = next((c for c in completions if c["rating"] == -1), first_completion)
            new_steps.append(chosen)
            continue

        chosen = random.choice(completions)
        new_steps.append(chosen)
        error_found = chosen['rating'] == -1

    steps_info = get_steps_info(new_steps)
    return {
        "steps": new_steps,
        **steps_info,
    }

In [None]:
# Seed the global RNG so the random completion picks made by algo_2, and
# therefore the exported sample, are reproducible on a fresh Restart & Run All.
random.seed(42)
# Build the ALGO 2 sample and write it to CSV.
df_option_2 = process_data(prm800k, algo_2)
df_option_2.to_csv("datasets/sample/prm800k-02-algo2.csv", index=False)

In [None]:
# Inspect the ALGO 2 sample.
df_option_2

In [None]:
# Sum the neg_1 / zero / pos_1 columns over all ALGO 2 samples.
get_total_steps_sum_each_rating(df_option_2)

Result: Still not good. The total number of +1 steps is too high, and many samples contain only +1 steps with no -1 or 0 steps.

In [None]:
# Drop the ALGO 2 samples in which every step is rated +1
# (rows where pos_1 == total_steps).
clean_df_option_2 = drop_rows_where_pos1_equals_total_steps(df_option_2)

In [None]:
# Inspect the filtered ALGO 2 sample.
clean_df_option_2

In [None]:
# Sum the neg_1 / zero / pos_1 columns over the filtered ALGO 2 samples.
get_total_steps_sum_each_rating(clean_df_option_2)

Result: Good. The rating distribution resembles natural PRM800K.

| Label | Count | Percentage |
| ----- | ----- | ---------- |
| -1    | 211   | \~17.3%    |
| 0     | 54    | \~4.4%     |
| +1    | 955   | \~78.3%    |

In [None]:
# Write the filtered ALGO 2 sample to CSV.
clean_df_option_2.to_csv("datasets/sample/prm800k-02-algo2-clean.csv", index=False)

### [ALGO 3] Realistic Condition - Random Rating From the First Step; Once a -1 Is Found, Prefer -1 Afterwards

In [None]:
import random

def algo_3(steps):
    """Pick one completion per step: random until a -1 appears, then -1.

    From the very first step a uniformly random completion is chosen. Once a
    random pick is rated -1, every following step uses its first completion
    rated -1, or its first completion when it has none.

    Args:
        steps: List of step dicts, each with a non-empty ``completions`` list.

    Returns:
        dict: ``steps`` (the chosen completions) plus the
        ``neg_1``/``zero``/``pos_1`` counts from ``get_steps_info``.
    """
    picked_steps = []
    seen_negative = False  # True once a randomly picked step was rated -1

    for step in steps:
        default_pick = step['completions'][0]

        if seen_negative:
            # Try to pick a -1 completion; if none exists, fall back to the
            # default (first completion).
            pick = next(
                (candidate for candidate in step['completions'] if candidate["rating"] == -1),
                default_pick,
            )
        else:
            pool_size = len(step['completions'])
            pick = step['completions'][random.randint(0, pool_size - 1)]
            seen_negative = pick['rating'] == -1

        picked_steps.append(pick)

    return {
        "steps": picked_steps,
        **get_steps_info(picked_steps),
    }

In [None]:
# Seed the global RNG so the random completion picks made by algo_3, and
# therefore the exported sample, are reproducible on a fresh Restart & Run All.
random.seed(42)
# Build the ALGO 3 sample and write it to CSV.
# NOTE(review): the "02" prefix in this file name does not match the
# "prm800k-03-algo3-clean.csv" export below; confirm which is intended before
# renaming, since other code may read this path.
df_option_3 = process_data(prm800k, algo_3)
df_option_3.to_csv("datasets/sample/prm800k-02-algo3.csv", index=False)

In [None]:
# Inspect the ALGO 3 sample.
df_option_3

In [None]:
# Drop the ALGO 3 samples in which every step is rated +1
# (rows where pos_1 == total_steps).
clean_df_option_3 = drop_rows_where_pos1_equals_total_steps(df_option_3)

In [None]:
# Inspect the filtered ALGO 3 sample.
clean_df_option_3

In [None]:
# Sum the neg_1 / zero / pos_1 columns over the filtered ALGO 3 samples.
get_total_steps_sum_each_rating(clean_df_option_3)

In [None]:
# Write the filtered ALGO 3 sample to CSV.
clean_df_option_3.to_csv("datasets/sample/prm800k-03-algo3-clean.csv", index=False)