In [1]:
import pandas as pd
from datasets import load_dataset


In [3]:
# Name of the evaluation split; it is interpolated into the Hub repository id.
test_name = "test_500"
dataset_repo_id = f"dbaeka/soen_691_{test_name}_meta_set"

# Only the 'test' split of the loaded dataset is used in this notebook.
final_dataset = load_dataset(dataset_repo_id)['test']

In [4]:
# Work on a pandas copy of the dataset for the rest of the notebook.
df_full = final_dataset.to_pandas()

# BERT score cut points: the 33rd and 66th percentiles of the observed scores
# split the items into three roughly equal-sized quality bins.
bert_scores = df_full["bert_score"]
quantiles = bert_scores.quantile([0.33, 0.66])
low_thresh = quantiles.loc[0.33]
high_thresh = quantiles.loc[0.66]

In [5]:
def assign_quality(score, low=None, high=None):
    """Bucket a BERT score into "low", "medium" or "high".

    Args:
        score: Numeric BERT score to classify.
        low: Inclusive upper bound of the "low" bucket. When omitted, the
            notebook-level ``low_thresh`` is used.
        high: Inclusive upper bound of the "medium" bucket. When omitted, the
            notebook-level ``high_thresh`` is used.

    Returns:
        "low" if ``score <= low``, "medium" if ``low < score <= high``,
        otherwise "high". A NaN score fails both comparisons and therefore
        ends up in "high".
    """
    # Resolve the fallbacks at call time so that re-computed notebook
    # thresholds are picked up without redefining the function.
    if low is None:
        low = low_thresh
    if high is None:
        high = high_thresh

    if score <= low:
        return "low"
    if score <= high:
        return "medium"
    return "high"

In [6]:
# Label every row with its BERT quality bin, then preview the result.
bert_quality_labels = df_full["bert_score"].map(assign_quality)
df_full["bert_quality"] = bert_quality_labels
df_full.head()

Unnamed: 0,hash,gold,pred,cot,lang,summary,bert_score,patch,bert_quality
0,033b0baff52bb483,"If there are no manifests, then entries should...",The change gracefully handles the case where a...,This code change is addressing the case where ...,java,Collect all data files in the manifest.,0.849598,"@@ -191,7 +191,10 @@ public class FindFiles {\...",medium
1,c1752542d5b833d3,Didn't went with backwards compatibility since...,The log message should include which collector...,Let me analyze the code change:\n\n1. The meth...,.cs,FailedToReachCollector - Method failed to reac...,0.830813,"@@ -26,11 +26,12 @@ namespace OpenTelemetry.Ex...",low
2,7dc3d2474c83cca7,Is this actually a lint related change?,The percent sign doesn't need to be escaped in...,"In this code diff, a regex pattern is being mo...",js,Creates all inputs of a block and removes all ...,0.85543,"@@ -185,7 +185,7 @@ Blockly.ScratchBlocks.Proc...",medium
3,973d5f8f3c42730f,I don't know if there are strict conventions a...,The message doesn't explain why sys.exit() is ...,I'm looking at a code review request for an ad...,py,Uses the dict constructor a list of transient ...,0.840675,"@@ -273,6 +273,11 @@ class RefactoringChecker(...",low
4,9ef6bf7111827712,what to do if some failed? just logging?,The error message should provide more context ...,The given code shows a change where a `CHECK_E...,cpp,Clean expired hosts. expired hosts to offline ...,0.851026,"@@ -146,7 +146,9 @@ void ActiveHostsMan::clean...",medium


In [7]:
# Number of items per language, most frequent first.
lang_counts = df_full["lang"].value_counts()
print("Language distribution in original dataset:", lang_counts, sep="\n")

Language distribution in original dataset:
lang
go      126
java     83
py       69
rb       49
js       47
cpp      41
.cs      40
php      24
c        21
Name: count, dtype: int64


In [8]:
import random

import numpy as np

# --- Study design ------------------------------------------------------------
NUM_EVALUATORS = 5
NUM_OVERLAP_ITEMS = 60               # items rated by ALL evaluators
NUM_UNIQUE_ITEMS_PER_EVALUATOR = 48  # items rated by only ONE evaluator

# Workload of a single evaluator: 60 shared + 48 own = 108 items.
SAMPLES_PER_EVALUATOR = NUM_UNIQUE_ITEMS_PER_EVALUATOR + NUM_OVERLAP_ITEMS

# Distinct items to draw from the dataset: 5 * 48 = 240 single-rater items
# plus the 60 shared ones = 300.
TOTAL_UNIQUE_ITEMS_FOR_UNIQUE_SETS = NUM_EVALUATORS * NUM_UNIQUE_ITEMS_PER_EVALUATOR
TOTAL_UNIQUE_ITEMS_NEEDED = TOTAL_UNIQUE_ITEMS_FOR_UNIQUE_SETS + NUM_OVERLAP_ITEMS

# Only items written in these languages take part in the study.
LANGUAGES_TO_INCLUDE = ["go", "java", "py", "js", "php"]

# --- Reproducibility ---------------------------------------------------------
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [9]:
# Restrict the data to the configured languages; an empty list skips the filter.
if LANGUAGES_TO_INCLUDE:
    original_length = len(df_full)
    # .copy() detaches the filtered rows from the unfiltered frame, so columns
    # added to df_full further down do not raise SettingWithCopyWarning.
    df_full = df_full[df_full["lang"].isin(LANGUAGES_TO_INCLUDE)].copy()
    print(f"Filtered from {original_length} to {len(df_full)} samples based on selected languages.")

Filtered from 500 to 349 samples based on selected languages.


In [10]:
print(f"\nSampling {TOTAL_UNIQUE_ITEMS_NEEDED} unique items...")

# Always stratify by language; additionally stratify by BERT quality bin when
# that column exists and takes more than one distinct value.
stratify_cols = ['lang']
if 'bert_quality' in df_full.columns and df_full["bert_quality"].nunique() > 1:
    stratify_cols.append('bert_quality')
    print(f"Stratifying sample by: {stratify_cols}")
else:
    print(f"Stratifying sample by: {stratify_cols} ('bert_quality' not used for stratification)")

# Stop with a real exception when the filtered data is too small: calling
# exit() inside a notebook affects the kernel instead of reporting the problem
# in the failing cell.
if len(df_full) < TOTAL_UNIQUE_ITEMS_NEEDED:
    raise ValueError(
        f"Not enough items in the filtered dataset ({len(df_full)}) to sample {TOTAL_UNIQUE_ITEMS_NEEDED} unique items."
    )


Sampling 300 unique items...
Stratifying sample by: ['lang', 'bert_quality']


In [11]:
# Perform stratified sampling
df_full['stratify_key'] = df_full[stratify_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

# Calculate how many to sample from each group proportionally
n_total = len(df_full)
n_sample = TOTAL_UNIQUE_ITEMS_NEEDED
sampled_indices = df_full.groupby('stratify_key', group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), max(1, int(np.ceil(len(x) * n_sample / n_total)))), random_state=RANDOM_SEED)
).index

# Adjust sample size if proportional sampling didn't yield exactly n_sample
current_sample_size = len(sampled_indices)
if current_sample_size < n_sample:
    print(f"Adjusting sample size: needed {n_sample}, got {current_sample_size} proportionally.")
    remaining_indices = df_full.index.difference(sampled_indices)
    additional_samples = np.random.choice(remaining_indices, n_sample - current_sample_size, replace=False)
    sampled_indices = sampled_indices.union(pd.Index(additional_samples))
elif current_sample_size > n_sample:
     print(f"Adjusting sample size: needed {n_sample}, got {current_sample_size} proportionally.")
     sampled_indices = np.random.choice(sampled_indices, n_sample, replace=False)


df_sampled = df_full.loc[sampled_indices].copy()
# Drop the temporary key
df_sampled = df_sampled.drop(columns=['stratify_key'])
df_full = df_full.drop(columns=['stratify_key']) # Drop from original too


print(f"Sampled {len(df_sampled)} unique items.")

Adjusting sample size: needed 300, got 306 proportionally.
Sampled 300 unique items.


  sampled_indices = df_full.groupby('stratify_key', group_keys=False).apply(


In [12]:
from sklearn.model_selection import train_test_split

print(f"\nSplitting sampled items into {NUM_OVERLAP_ITEMS} overlap and {TOTAL_UNIQUE_ITEMS_FOR_UNIQUE_SETS} unique items...")

# Divide the sampled items into the shared (overlap) set and the pool that is
# later distributed among individual evaluators. The split is stratified; if
# scikit-learn rejects the stratification, fall back to a plain random split.
try:
    # Stratify on the quality bin when it exists and varies, otherwise on language.
    quality_is_usable = (
        'bert_quality' in df_sampled.columns
        and df_sampled['bert_quality'].nunique() > 1
    )
    stratify_col_split = 'bert_quality' if quality_is_usable else 'lang'
    stratify_labels = df_sampled[[stratify_col_split]]

    # "train" side = overlap items, "test" side = unique pool.
    df_overlap, df_unique_pool = train_test_split(
        df_sampled,
        train_size=NUM_OVERLAP_ITEMS,
        test_size=TOTAL_UNIQUE_ITEMS_FOR_UNIQUE_SETS,
        stratify=stratify_labels,
        random_state=RANDOM_SEED,
    )
except ValueError as e:
    print(f"Warning: Could not stratify split ({e}). Performing random split.")
    df_overlap = df_sampled.sample(n=NUM_OVERLAP_ITEMS, random_state=RANDOM_SEED)
    # Everything that did not go into the overlap set forms the unique pool.
    df_unique_pool = df_sampled.drop(df_overlap.index)
else:
    print(f"Split complete: {len(df_overlap)} overlap items, {len(df_unique_pool)} unique pool items.")


Splitting sampled items into 60 overlap and 240 unique items...
Split complete: 60 overlap items, 240 unique pool items.


In [13]:
print("\nAssigning items to evaluators...")

# Every evaluator (keyed 0 .. NUM_EVALUATORS-1) starts with its own list
# holding all overlap items, in the same order.
overlap_items_list = df_overlap.to_dict('records')
evaluator_assignments = {i: list(overlap_items_list) for i in range(NUM_EVALUATORS)}
print(f"Assigned {len(df_overlap)} overlap items to all {NUM_EVALUATORS} evaluators.")

# Shuffle the single-rater pool before cutting it into consecutive chunks.
unique_items_list = df_unique_pool.to_dict('records')
random.shuffle(unique_items_list)

items_per_evaluator_unique = TOTAL_UNIQUE_ITEMS_FOR_UNIQUE_SETS // NUM_EVALUATORS
if TOTAL_UNIQUE_ITEMS_FOR_UNIQUE_SETS % NUM_EVALUATORS:
    print(f"Warning: Unique items ({TOTAL_UNIQUE_ITEMS_FOR_UNIQUE_SETS}) not perfectly divisible by evaluators ({NUM_EVALUATORS}). Distribution might be slightly uneven.")

last_evaluator = NUM_EVALUATORS - 1
start_idx = 0
for i in range(NUM_EVALUATORS):
    # The last evaluator takes whatever is left so that no item is dropped.
    end_idx = len(unique_items_list) if i == last_evaluator else start_idx + items_per_evaluator_unique

    unique_slice = unique_items_list[start_idx:end_idx]
    evaluator_assignments[i] += unique_slice
    print(f"Assigned {len(unique_slice)} unique items to evaluator {i+1}.")
    start_idx = end_idx


Assigning items to evaluators...
Assigned 60 overlap items to all 5 evaluators.
Assigned 48 unique items to evaluator 1.
Assigned 48 unique items to evaluator 2.
Assigned 48 unique items to evaluator 3.
Assigned 48 unique items to evaluator 4.
Assigned 48 unique items to evaluator 5.


In [18]:
print("\nVerifying final assignment counts...")

# Flatten the per-evaluator lists into one record per (item, evaluator) pair,
# checking each evaluator's workload along the way.
all_assigned_data = []
for evaluator_id_zero_based, assigned_items in evaluator_assignments.items():
    evaluator_id_one_based = evaluator_id_zero_based + 1
    num_assigned = len(assigned_items)
    print(f"Evaluator {evaluator_id_one_based} has {num_assigned} items assigned.")
    if num_assigned != SAMPLES_PER_EVALUATOR:
        print(f"  WARNING: Evaluator {evaluator_id_one_based} has {num_assigned} items, expected {SAMPLES_PER_EVALUATOR}.")

    # Overlap items are shared dict objects, so tag a copy rather than the
    # original; otherwise every evaluator would end up with the same id.
    all_assigned_data.extend(
        {**item_dict, 'evaluator_id': evaluator_id_one_based}
        for item_dict in assigned_items
    )

df_final_assignments = pd.DataFrame(all_assigned_data)

# Optional: Shuffle the final combined list so items aren't grouped by overlap/unique status
df_final_assignments = df_final_assignments.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print(f"\nTotal assignments generated: {len(df_final_assignments)}")

# Single source of truth for the output location, so the confirmation message
# always names the file that was actually written (including its extension).
assignments_csv_path = "combined_evaluator_assignments.csv"
# The row index is written as an unnamed first column (pandas default).
df_final_assignments.to_csv(assignments_csv_path)
print(f"Combined assignments saved to: {assignments_csv_path}")

print("\nSample of final assignment data:")
df_final_assignments.head()


Verifying final assignment counts...
Evaluator 1 has 108 items assigned.
Evaluator 2 has 108 items assigned.
Evaluator 3 has 108 items assigned.
Evaluator 4 has 108 items assigned.
Evaluator 5 has 108 items assigned.

Total assignments generated: 540
Combined assignments saved to: combined_evaluator_assignments

Sample of final assignment data:


Unnamed: 0,hash,gold,pred,cot,lang,summary,bert_score,patch,bert_quality,evaluator_id
0,c0782327891b2a05,All the code that reference to this function m...,The method name change better reflects its pur...,I need to provide a formal code review for the...,go,validateOptions validates the options for the ...,0.846068,"@@ -68,8 +68,8 @@ func (c *cstorSnapshotComman...",medium,1
1,f250d880ef09208a,"Ideally, we'd fold the schema into an expanded...",The content may exceed Elasticsearch's maximum...,Let me analyze the code change:\n\n1. The orig...,py,get the byte contents of a file.,0.864433,"@@ -87,7 +87,8 @@ def get_contents(bucket, key...",high,4
2,4dbb6143e70527de,@kevinansfield Would be cool if you can take a...,"Is 'member-subscription' a custom transform, a...",I need to give a formal code review for this d...,js,Default Model extend method.,0.841333,"@@ -4,5 +4,6 @@ import attr from 'ember-data/a...",low,5
3,72736b53cb9c9ae6,And again. The reason it's bad is that if some...,Avoid using Sun's proprietary API as they're n...,I'm reviewing a diff that shows an added impor...,js,Reads a single non - null from the System. T...,0.856422,"@@ -19,6 +19,8 @@ package org.openqa.grid.web;...",medium,3
4,e4f230a58d1c5806,It's very error-prone to have so large `try` c...,The variable `returned_results` was removed bu...,This code fixes an exception handling block. P...,py,Post process a DataFrame with a set of logical...,0.837403,"@@ -81,10 +81,10 @@ def file_list_to_folder(df...",low,2
