In [61]:
import pandas as pd
from datasets import load_dataset


In [62]:
test_name = "test_500"
final_dataset = load_dataset(f"dbaeka/soen_691_{test_name}_meta_set")['test']

In [63]:
df = final_dataset.to_pandas()

# Create BERT score bins
quantiles = df["bert_score"].quantile([0.33, 0.66])
low_thresh, high_thresh = quantiles[0.33], quantiles[0.66]

In [64]:
def assign_quality(score):
    if score <= low_thresh:
        return "low"
    elif score <= high_thresh:
        return "medium"
    else:
        return "high"

In [65]:
df["bert_quality"] = df["bert_score"].apply(assign_quality)
df.head()

Unnamed: 0,hash,gold,pred,cot,lang,summary,bert_score,patch,bert_quality
0,033b0baff52bb483,"If there are no manifests, then entries should...",The change gracefully handles the case where a...,This code change is addressing the case where ...,java,Collect all data files in the manifest.,0.849598,"@@ -191,7 +191,10 @@ public class FindFiles {\...",medium
1,c1752542d5b833d3,Didn't went with backwards compatibility since...,The log message should include which collector...,Let me analyze the code change:\n\n1. The meth...,.cs,FailedToReachCollector - Method failed to reac...,0.830813,"@@ -26,11 +26,12 @@ namespace OpenTelemetry.Ex...",low
2,7dc3d2474c83cca7,Is this actually a lint related change?,The percent sign doesn't need to be escaped in...,"In this code diff, a regex pattern is being mo...",js,Creates all inputs of a block and removes all ...,0.85543,"@@ -185,7 +185,7 @@ Blockly.ScratchBlocks.Proc...",medium
3,973d5f8f3c42730f,I don't know if there are strict conventions a...,The message doesn't explain why sys.exit() is ...,I'm looking at a code review request for an ad...,py,Uses the dict constructor a list of transient ...,0.840675,"@@ -273,6 +273,11 @@ class RefactoringChecker(...",low
4,9ef6bf7111827712,what to do if some failed? just logging?,The error message should provide more context ...,The given code shows a change where a `CHECK_E...,cpp,Clean expired hosts. expired hosts to offline ...,0.851026,"@@ -146,7 +146,9 @@ void ActiveHostsMan::clean...",medium


In [66]:
lang_counts = df["lang"].value_counts()
print("Language distribution in original dataset:")
print(lang_counts)

Language distribution in original dataset:
lang
go      126
java     83
py       69
rb       49
js       47
cpp      41
.cs      40
php      24
c        21
Name: count, dtype: int64


In [67]:
NUM_EVALUATORS = 5
SAMPLES_PER_EVALUATOR = 60
TOTAL_ASSIGNMENTS = NUM_EVALUATORS * SAMPLES_PER_EVALUATOR  # 300
UNIQUE_SAMPLES_NEEDED = TOTAL_ASSIGNMENTS // 2  # Each sample appears twice

RANDOM_SEED = 0

LANGUAGES_TO_INCLUDE = ["go", "java", "py", "js", "php"]

import numpy as np
import random

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [68]:
if LANGUAGES_TO_INCLUDE:
    original_length = len(df)
    df = df[df["lang"].isin(LANGUAGES_TO_INCLUDE)]
    print(f"Filtered from {original_length} to {len(df)} samples based on selected languages.")

    # Check if we have enough samples after filtering
    if len(df) < UNIQUE_SAMPLES_NEEDED:
        print(f"WARNING: Only {len(df)} samples available after language filtering. "
              f"Need {UNIQUE_SAMPLES_NEEDED} unique samples.")

Filtered from 500 to 349 samples based on selected languages.


In [69]:
lang_counts = df["lang"].value_counts()
print("Language distribution in filtered dataset:")
print(lang_counts)

Language distribution in filtered dataset:
lang
go      126
java     83
py       69
js       47
php      24
Name: count, dtype: int64


In [70]:
# Calculate samples needed per combination of language and quality
quality_categories = ["low", "medium", "high"]
languages = lang_counts.index.tolist()

In [71]:
# Create stratified sampling targets
total_samples = len(df)
strata_targets = {}

# Equal distribution across languages
if len(languages) > 0:
    samples_per_language = UNIQUE_SAMPLES_NEEDED // len(languages)
    remainder = UNIQUE_SAMPLES_NEEDED % len(languages)

    # Distribute samples across languages and quality levels
    for i, lang in enumerate(languages):
        # Add extra samples to first few languages if needed
        lang_samples = samples_per_language + (1 if i < remainder else 0)
        samples_per_quality = lang_samples // len(quality_categories)
        quality_remainder = lang_samples % len(quality_categories)

        for j, quality in enumerate(quality_categories):
            # Add extra samples to first quality categories if needed
            target = samples_per_quality + (1 if j < quality_remainder else 0)

            # Count examples in each combination
            count_in_strata = len(df[(df["lang"] == lang) & (df["bert_quality"] == quality)])

            # Adjust if we don't have enough samples in this combination
            if target > count_in_strata:
                print(f"Warning: Not enough samples for lang={lang}, quality={quality}. "
                      f"Requested {target}, but only have {count_in_strata}.")
                target = count_in_strata

            strata_targets[(lang, quality)] = target



In [72]:
print("\nSampling targets by language and quality:")
for (lang, quality), target in strata_targets.items():
    print(f"{lang}-{quality}: {target}")


Sampling targets by language and quality:
go-low: 10
go-medium: 10
go-high: 10
java-low: 10
java-medium: 10
java-high: 10
py-low: 10
py-medium: 10
py-high: 10
js-low: 10
js-medium: 10
js-high: 10
php-low: 10
php-medium: 9
php-high: 5


In [73]:
# Sample from each stratum
stratified_samples = []

for (lang, quality), target in strata_targets.items():
    if target > 0:
        stratum = df[(df["lang"] == lang) & (df["bert_quality"] == quality)]
        # Sample with replacement if necessary
        replace = len(stratum) < target
        sampled = stratum.sample(target, random_state=RANDOM_SEED, replace=replace)
        stratified_samples.append(sampled)

samples = pd.concat(stratified_samples).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

In [74]:
# If we're short on samples after stratified sampling, get additional samples from the filtered dataset
if len(samples) < UNIQUE_SAMPLES_NEEDED:
    print(f"Warning: Only found {len(samples)} unique samples out of {UNIQUE_SAMPLES_NEEDED} needed.")
    additional_needed = UNIQUE_SAMPLES_NEEDED - len(samples)
    
    # Get samples that haven't been selected yet
    already_selected = set(samples.index)
    remaining_samples = df[~df.index.isin(already_selected)]
    
    # If we still have samples available in the filtered dataset
    if len(remaining_samples) > 0:
        # Take as many additional samples as possible from remaining data
        samples_to_add = min(len(remaining_samples), additional_needed)
        additional_samples = remaining_samples.sample(samples_to_add, random_state=RANDOM_SEED)
        
        print(f"Adding {len(additional_samples)} additional unique samples from the filtered dataset.")
        samples = pd.concat([samples, additional_samples]).reset_index(drop=True)
        
        remaining_needed = additional_needed - samples_to_add
        # If we still need more samples, then we finally resort to repeating
        if remaining_needed > 0:
            print(f"Still need {remaining_needed} more samples. Will repeat some existing samples.")
            repeated_samples = samples.sample(remaining_needed, replace=True, random_state=RANDOM_SEED+1)
            samples = pd.concat([samples, repeated_samples]).reset_index(drop=True)
    else:
        # If no remaining samples, we have to repeat
        print(f"No remaining unique samples in filtered dataset. Will repeat {additional_needed} existing samples.")
        additional_samples = samples.sample(additional_needed, replace=True, random_state=RANDOM_SEED)
        samples = pd.concat([samples, additional_samples]).reset_index(drop=True)

Adding 6 additional unique samples from the filtered dataset.


In [75]:
# Check our resulting sample distribution
print("\nStratified sample distribution:")
print(f"Total samples: {len(samples)}")
print("\nBy language:")
print(samples["lang"].value_counts())
print("\nBy quality:")
print(samples["bert_quality"].value_counts())
print("\nBy language and quality:")
print(pd.crosstab(samples["lang"], samples["bert_quality"]))


Stratified sample distribution:
Total samples: 150

By language:
lang
go      32
py      32
java    31
js      30
php     25
Name: count, dtype: int64

By quality:
bert_quality
medium    52
low       51
high      47
Name: count, dtype: int64

By language and quality:
bert_quality  high  low  medium
lang                           
go              10   11      11
java            11   10      10
js              10   10      10
php              5   10      10
py              11   10      11


In [76]:
from collections import defaultdict
from itertools import combinations

# Assign samples to evaluator pairs
evaluator_pairs = list(combinations(range(NUM_EVALUATORS), 2))
evaluator_assignments = {e: 0 for e in range(NUM_EVALUATORS)}
evaluator_data = defaultdict(list)
sample_to_pair = {}

# First pass: Assign each sample to a pair of evaluators
for idx in samples.index:
    # Choose the evaluator pair with the fewest current assignments
    pair_loads = [(p, evaluator_assignments[p[0]] + evaluator_assignments[p[1]]) for p in evaluator_pairs]
    pair_loads.sort(key=lambda x: x[1])  # Sort by load
    eval1, eval2 = pair_loads[0][0]  # Get the pair with lowest load

    # Update assignments
    row = samples.loc[idx]
    evaluator_data[eval1].append(row)
    evaluator_data[eval2].append(row)
    evaluator_assignments[eval1] += 1
    evaluator_assignments[eval2] += 1
    sample_to_pair[idx] = (eval1, eval2)

# Check if we've got the right number of samples per evaluator
for e in range(NUM_EVALUATORS):
    print(f"Evaluator {e + 1} has {evaluator_assignments[e]} samples")

# Balance assignments if needed by redistributing samples
MAX_ITERATIONS = 100
iteration = 0

while (max(evaluator_assignments.values()) - min(evaluator_assignments.values()) > 1 and
       iteration < MAX_ITERATIONS):
    # Find evaluator with most and least assignments
    max_eval = max(evaluator_assignments, key=evaluator_assignments.get)
    min_eval = min(evaluator_assignments, key=evaluator_assignments.get)

    if evaluator_assignments[max_eval] - evaluator_assignments[min_eval] <= 1:
        break

    # Find a sample that can be reassigned
    for idx, (eval1, eval2) in sample_to_pair.items():
        if eval1 == max_eval and eval2 != min_eval:
            # Reassign from eval1 to min_eval
            sample_to_pair[idx] = (min_eval, eval2)
            row = samples.loc[idx]
            evaluator_data[eval1].remove(row)
            evaluator_data[min_eval].append(row)
            evaluator_assignments[eval1] -= 1
            evaluator_assignments[min_eval] += 1
            break
        elif eval2 == max_eval and eval1 != min_eval:
            # Reassign from eval2 to min_eval
            sample_to_pair[idx] = (eval1, min_eval)
            row = samples.loc[idx]
            evaluator_data[eval2].remove(row)
            evaluator_data[min_eval].append(row)
            evaluator_assignments[eval2] -= 1
            evaluator_assignments[min_eval] += 1
            break

    iteration += 1

Evaluator 1 has 60 samples
Evaluator 2 has 60 samples
Evaluator 3 has 60 samples
Evaluator 4 has 60 samples
Evaluator 5 has 60 samples


In [77]:
# Final verification
for e in range(NUM_EVALUATORS):
    assert len(evaluator_data[
                   e]) == SAMPLES_PER_EVALUATOR, f"Evaluator {e + 1} has {len(evaluator_data[e])} samples instead of {SAMPLES_PER_EVALUATOR}"


In [90]:
# Create a combined dataset instead of individual files
all_evaluator_data = []

for evaluator, rows in evaluator_data.items():
    df_eval = pd.DataFrame(rows).reset_index(drop=True)
    df_eval["usefulness"] = ""
    df_eval["correctness"] = ""
    df_eval["explanation_quality"] = ""
    df_eval["evaluator_id"] = evaluator + 1  # Add evaluator ID (1-indexed)
    all_evaluator_data.append(df_eval)

# Combine all data into a single dataframe
combined_df = pd.concat(all_evaluator_data, ignore_index=True)

# Save to a single CSV file
combined_df.to_json("combined_evaluator_assignments.json", index=False)

print(f"Combined CSV generated with {len(combined_df)} total assignments across {NUM_EVALUATORS} evaluators.")
combined_df.head()

Combined CSV generated with 300 total assignments across 5 evaluators.


Unnamed: 0,hash,gold,pred,cot,lang,summary,bert_score,patch,bert_quality,usefulness,correctness,explanation_quality,evaluator_id
0,2d4d2f69752c67c7,nit: error starts with capital letter,The regular expression might be too permissive...,Let me analyze this code diff:\n\n1. They're a...,go,Validate that the passed in value is a valid ...,0.816734,"@@ -13,12 +13,15 @@ import (\n )\n \n var (\n-...",low,,,,1
1,18ac7eb4ecdbf532,"...and in turn, this should probably be var up...",The variable names are inconsistent: you're us...,This code is modifying the logic for deciding ...,js,Block dragged bubble. The function to preview ...,0.841119,"@@ -1025,7 +1025,9 @@ Blockly.BlockSvg.prototy...",low,,,,1
2,9073a4fd6c2c8180,As far as I can see you're not rollbacking tra...,The use statements should be alphabetically or...,The code review task is to provide a formal co...,php,Creates a transport that processes a single ba...,0.826215,"@@ -16,6 +16,11 @@ use Ergonode\BatchAction\Do...",low,,,,1
3,10095e717de6b047,"I don't see how removing ""open file bucket"" he...",The error handling change removes important co...,"Looking at the code diff, I need to review the...",go,Private functions - functions escape returns a...,0.882879,"@@ -86,10 +86,10 @@ func openBucket(dir string...",high,,,,1
4,b2f599230d36e5fd,I think you should use `$event->getOriginalFil...,"The field names ""IMAGE_WIDH"" and ""ORIGINAL_IMA...",This code adds image dimensions to the results...,php,Parse loop result and create ImageEvent object...,0.823867,"@@ -361,11 +361,19 @@ class Image extends Base...",low,,,,1


In [91]:
print("\nDistribution analysis by evaluator:")
for e in range(NUM_EVALUATORS):
    evaluator_df = pd.DataFrame(evaluator_data[e])
    print(f"\nEvaluator {e + 1}:")
    print("Languages:")
    print(evaluator_df["lang"].value_counts())
    print("Quality:")
    print(evaluator_df["bert_quality"].value_counts())


Distribution analysis by evaluator:

Evaluator 1:
Languages:
lang
java    14
js      13
py      13
go      11
php      9
Name: count, dtype: int64
Quality:
bert_quality
low       26
medium    18
high      16
Name: count, dtype: int64

Evaluator 2:
Languages:
lang
js      17
php     13
java    11
go      10
py       9
Name: count, dtype: int64
Quality:
bert_quality
low       23
medium    22
high      15
Name: count, dtype: int64

Evaluator 3:
Languages:
lang
py      14
php     13
go      12
js      11
java    10
Name: count, dtype: int64
Quality:
bert_quality
medium    23
high      20
low       17
Name: count, dtype: int64

Evaluator 4:
Languages:
lang
py      17
go      15
java    12
js       8
php      8
Name: count, dtype: int64
Quality:
bert_quality
high      23
medium    21
low       16
Name: count, dtype: int64

Evaluator 5:
Languages:
lang
go      16
java    15
js      11
py      11
php      7
Name: count, dtype: int64
Quality:
bert_quality
low       20
high      20
medium    20