In [2]:
import json
import pandas as pd
import numpy as np
from datasets import load_dataset

In [7]:
# Load the two JSON files
with open('./results/0-3027.json', 'r') as f1, open('./results/3028-5000.json', 'r') as f2:
    data1 = json.load(f1)
    data2 = json.load(f2)

# Merge the dictionaries
merged_data = {**data1, **data2}

# Save the merged data into a new JSON file
with open('all_results.json', 'w') as f_out:
    json.dump(merged_data, f_out, indent=4)


In [8]:
len(merged_data)

4687

## Error Summary Functions

In [19]:
def generate_error_dict(bleu_json_path, accuracy_json_path, output_json_path=None):
    """
    Reads BLEU and accuracy data, then generates a dictionary of dictionaries
    for each problem_id (instance_id). This structure can be used in subsequent
    scripts to create textual summaries.

    :param bleu_json_path: Path to a JSON file with BLEU scores.
        Example format:
            {
                "0": [64.1, 22.4],
                "1": [61.5, 20.3],
                "2": [75.2, 30.0],
                ...
            }
        Where each key is a stringified instance ID, and each value is
        a list of BLEU scores, e.g. [bleu_for_n_grams, bleu_for_instance].
    
    :param accuracy_json_path: Path to a JSON file with accuracy/test outcomes.
        Example format:
            {
                "0": [[True, False, -2, False]],
                "1": [[False, False, True]],
                "2": [[True, True, True]],
                ...
            }
        Where each key is a stringified instance ID, and each value is a
        list of lists. Each inner list represents outcomes for each test case
        in a single generated solution:
            - -2   -> Compile error
            - -1   -> Runtime error
            - True  -> Passed test case
            - False -> Failed test case

    :param output_json_path: Optional path to write the dictionary of dictionaries as JSON.
    :return: A dict where each key is a string (instance_id) and value is a dict with:
                {
                    "problem_id": <instance_id>,
                    "bleu_n_gram": float or None,
                    "bleu_instance": float or None,
                    "compile_error_count": int,
                    "runtime_error_count": int,
                    "passed_count": int,
                    "failed_count": int
                }
    """

    # 1. Load BLEU scores
    with open(bleu_json_path, 'r') as f_bleu:
        bleu_data = json.load(f_bleu)
    
    # 2. Load accuracy data
    with open(accuracy_json_path, 'r') as f_acc:
        accuracy_data = json.load(f_acc)
    
    # 3. Construct the dictionary of dictionaries
    error_dict = {}

    # Use the union of keys from both datasets in case there's a mismatch
    all_instance_ids = set(bleu_data.keys()).union(set(accuracy_data.keys()))

    for instance_id in all_instance_ids:
        # Retrieve BLEU scores (with fallback if missing)
        bleu_scores = bleu_data.get(instance_id, [None, None])
        bleu_n_gram = bleu_scores[0] if len(bleu_scores) > 0 else None
        bleu_instance = bleu_scores[1] if len(bleu_scores) > 1 else None
        
        # Retrieve accuracy data (take the first solution's results if multiple)
        accuracy_list_of_lists = accuracy_data.get(instance_id, [])
        test_outcomes = accuracy_list_of_lists[0] if accuracy_list_of_lists else []
        
        # Count each outcome
        compile_error_count = 0
        runtime_error_count = 0
        passed_count = 0
        failed_count = 0
        
        for outcome in test_outcomes:
            if outcome == True:
                passed_count += 1
            elif outcome == False:
                failed_count += 1
            elif outcome == -2:
                compile_error_count += 1
            elif outcome == -1:
                runtime_error_count += 1
        
        # Prepare the dictionary entry
        error_dict[instance_id] = {
            "problem_id": instance_id,
            "bleu_n_gram": bleu_n_gram,
            "bleu_instance": bleu_instance,
            "compile_error_count": compile_error_count,
            "runtime_error_count": runtime_error_count,
            "passed_count": passed_count,
            "failed_count": failed_count
        }
    
    # 4. Optionally write to JSON
    if output_json_path:
        with open(output_json_path, 'w') as f_out:
            json.dump(error_dict, f_out, indent=2)
    
    return error_dict




In [20]:
if __name__ == "__main__":
    # Example usage:
    bleu_json_path = "./results/all_bleu_results.json"       # Path to your BLEU data
    accuracy_json_path = "./results/all_results.json" # Path to your accuracy data
    output_json_path = "./results/error_dict.json"      # Where to store the dictionary of dictionaries

    error_dict = generate_error_dict(
        bleu_json_path,
        accuracy_json_path,
        output_json_path=output_json_path
    )
    
    # Print a sample of the data structure
    for key in list(error_dict.keys())[:5]:
        print(key, ":", error_dict[key])

1053 : {'problem_id': '1053', 'bleu_n_gram': 82.57252967056458, 'bleu_instance': 29.38831642404543, 'compile_error_count': 0, 'runtime_error_count': 0, 'passed_count': 0, 'failed_count': 1}
2645 : {'problem_id': '2645', 'bleu_n_gram': 46.93552706193806, 'bleu_instance': 25.34671756596854, 'compile_error_count': 0, 'runtime_error_count': 0, 'passed_count': 1, 'failed_count': 0}
3325 : {'problem_id': '3325', 'bleu_n_gram': 16.922297644826813, 'bleu_instance': 8.888699118547494, 'compile_error_count': 0, 'runtime_error_count': 0, 'passed_count': 3, 'failed_count': 2}
2336 : {'problem_id': '2336', 'bleu_n_gram': 64.15328590690756, 'bleu_instance': 13.64098684659726, 'compile_error_count': 1, 'runtime_error_count': 0, 'passed_count': 0, 'failed_count': 0}
4377 : {'problem_id': '4377', 'bleu_n_gram': 21.84245032208738, 'bleu_instance': 13.715202144922419, 'compile_error_count': 1, 'runtime_error_count': 0, 'passed_count': 0, 'failed_count': 0}


In [22]:
import json

def generate_prioritized_error_info(bleu_json_path, accuracy_json_path, output_json_path=None):
    """
    Reads BLEU and accuracy data, then generates a dictionary of dictionaries
    reflecting a prioritized approach to code errors:

    Priority:
      1) Compile errors
      2) Runtime errors
      3) Test pass/fail
      4) BLEU similarity

    The final text summary avoids specific numeric references (like '1 compile error')
    and instead provides a coarse description.

    :param bleu_json_path: Path to a JSON file with BLEU scores.
        Example format:
            {
                "0": [64.1, 22.4],
                "1": [61.5, 20.3],
                ...
            }
        Each key is a stringified problem ID. Value is [bleu_for_n_gram, bleu_for_instance].

    :param accuracy_json_path: Path to a JSON with accuracy/test outcomes.
        Example format:
            {
                "0": [[True, False, -2, False]],
                "1": [[False, False, True]],
                ...
            }
        Each key is a stringified problem ID. Each value is a list of lists; each inner list
        represents results for a single solution:
          - -2 -> Compile error
          - -1 -> Runtime error
          - True -> Passed test case
          - False -> Failed test case

    :param output_json_path: Optional path to store the dictionary of dictionaries as JSON.

    :return: Dict of the form:
        {
          "0": {
              "problem_id": "0",
              "internal_counts": {
                  "compile_errors": int,
                  "runtime_errors": int,
                  "passed_tests": int,
                  "failed_tests": int,
                  "total_tests": int
              },
              "error_category": <one of: "compile_error", "runtime_error", "no_pass",
                                 "partial_pass", "all_pass", "no_tests">,
              "bleu_category": <"very_close", "close", "far", "unavailable">,
              "summary": <short textual statement with no numeric detail>
          },
          "1": { ... },
          ...
        }
    """

    # Load BLEU data
    with open(bleu_json_path, 'r') as f_bleu:
        bleu_data = json.load(f_bleu)

    # Load Accuracy data
    with open(accuracy_json_path, 'r') as f_acc:
        accuracy_data = json.load(f_acc)

    # The final container
    prioritized_data = {}

    # Collect all problem IDs from both data sources
    all_ids = set(bleu_data.keys()).union(accuracy_data.keys())

    for problem_id in all_ids:
        # ---- 1) Gather raw data ----
        # BLEU
        bleu_scores = bleu_data.get(problem_id, [None, None])
        bleu_n_gram = bleu_scores[0]
        bleu_instance = bleu_scores[1]

        # Accuracy
        # By default, consider only the FIRST solution in accuracy_data for summary
        # (adjust if your workflow is different)
        solution_outcomes = []
        if problem_id in accuracy_data and len(accuracy_data[problem_id]) > 0:
            solution_outcomes = accuracy_data[problem_id][0]

        # Count compile errors, runtime errors, pass/fails
        compile_errors = 0
        runtime_errors = 0
        passed_tests = 0
        failed_tests = 0

        for outcome in solution_outcomes:
            if outcome == -2:
                compile_errors += 1
            elif outcome == -1:
                runtime_errors += 1
            elif outcome is True:
                passed_tests += 1
            elif outcome is False:
                failed_tests += 1

        total_tests = len(solution_outcomes)

        # ---- 2) Determine coarse categories based on priority logic ----
        # Highest priority: compile error
        if compile_errors > 0:
            error_category = "compile_error"
        # Next priority: runtime error
        elif runtime_errors > 0:
            error_category = "runtime_error"
        else:
            # If code compiles and runs, we check test results
            if total_tests == 0:
                # No tests available => cannot confirm pass/fail
                error_category = "no_tests"
            else:
                # Some tests exist
                if passed_tests == total_tests:
                    error_category = "all_pass"
                elif passed_tests == 0:
                    error_category = "no_pass"
                else:
                    error_category = "partial_pass"

        # BLEU categorization (only if we have numeric data)
        if bleu_n_gram is None or bleu_instance is None:
            bleu_category = "unavailable"
        else:
            avg_bleu = (bleu_n_gram + bleu_instance) / 2.0
            if avg_bleu >= 60:
                bleu_category = "very_close"
            elif avg_bleu >= 30:
                bleu_category = "close"
            else:
                bleu_category = "far"

        # ---- 3) Build a text summary that avoids numeric references ----
        # Summaries reflect priority:
        #  - If compile_error => highlight "fails to compile"
        #  - If runtime_error => highlight "crashes at runtime"
        #  - Otherwise describe pass/fail
        #  - Then mention BLEU in a coarse sense

        summary_parts = []

        # Error portion
        if error_category == "compile_error":
            summary_parts.append("The code fails to compile.")
            summary_parts.append("Fix compilation issues first.")
        elif error_category == "runtime_error":
            summary_parts.append("The code compiles but crashes at runtime.")
            summary_parts.append("Fix runtime errors next.")
        elif error_category == "no_tests":
            summary_parts.append("The code runs, but there are no test results available.")
        elif error_category == "all_pass":
            summary_parts.append("The code compiles and passes all tests.")
        elif error_category == "no_pass":
            summary_parts.append("The code compiles but does not pass any tests.")
        elif error_category == "partial_pass":
            summary_parts.append("The code compiles and passes some tests but not all.")

        # BLEU portion (coarse mention)
        if bleu_category == "very_close":
            summary_parts.append("It appears very similar to the reference solution.")
        elif bleu_category == "close":
            summary_parts.append("It has moderate similarity to the reference.")
        elif bleu_category == "far":
            summary_parts.append("It is significantly different from the reference.")
        # if "unavailable", we just omit mention of BLEU

        final_summary = " ".join(summary_parts).strip()

        # ---- 4) Store everything in final structure ----
        prioritized_data[problem_id] = {
            "problem_id": problem_id,
            "internal_counts": {
                "compile_errors": compile_errors,
                "runtime_errors": runtime_errors,
                "passed_tests": passed_tests,
                "failed_tests": failed_tests,
                "total_tests": total_tests
            },
            "error_category": error_category,
            "bleu_category": bleu_category,
            "summary": final_summary
        }

    # ---- 5) Optionally save to a JSON file ----
    if output_json_path:
        with open(output_json_path, 'w') as f_out:
            json.dump(prioritized_data, f_out, indent=2)

    return prioritized_data


if __name__ == "__main__":
    """
    Example usage. Adjust file paths as needed, and run:
        python script_name.py
    """
    # Paths to your data
    bleu_json_path = "./results/all_bleu_results.json"
    accuracy_json_path = "./results/all_results.json"
    output_json_path = "./results/textual_errors.json"

    data = generate_prioritized_error_info(
        bleu_json_path,
        accuracy_json_path,
        output_json_path=output_json_path
    )

    # Print a few samples to see the final structure + summary
    for k in list(data.keys())[:5]:
        print(k, "=>", data[k])


1053 => {'problem_id': '1053', 'internal_counts': {'compile_errors': 0, 'runtime_errors': 0, 'passed_tests': 0, 'failed_tests': 1, 'total_tests': 1}, 'error_category': 'no_pass', 'bleu_category': 'close', 'summary': 'The code compiles but does not pass any tests. It has moderate similarity to the reference.'}
2645 => {'problem_id': '2645', 'internal_counts': {'compile_errors': 0, 'runtime_errors': 0, 'passed_tests': 1, 'failed_tests': 0, 'total_tests': 1}, 'error_category': 'all_pass', 'bleu_category': 'close', 'summary': 'The code compiles and passes all tests. It has moderate similarity to the reference.'}
3325 => {'problem_id': '3325', 'internal_counts': {'compile_errors': 0, 'runtime_errors': 0, 'passed_tests': 3, 'failed_tests': 2, 'total_tests': 5}, 'error_category': 'partial_pass', 'bleu_category': 'far', 'summary': 'The code compiles and passes some tests but not all. It is significantly different from the reference.'}
2336 => {'problem_id': '2336', 'internal_counts': {'compile

## Preprocessing data for Finetuning

In [3]:
with open("./results/textual_errors.json") as file:
    textual_errors = json.load(file)

len(textual_errors)

4882

In [4]:
textual_errors

{'1053': {'problem_id': '1053',
  'internal_counts': {'compile_errors': 0,
   'runtime_errors': 0,
   'passed_tests': 0,
   'failed_tests': 1,
   'total_tests': 1},
  'error_category': 'no_pass',
  'bleu_category': 'close',
  'summary': 'The code compiles but does not pass any tests. It has moderate similarity to the reference.'},
 '2645': {'problem_id': '2645',
  'internal_counts': {'compile_errors': 0,
   'runtime_errors': 0,
   'passed_tests': 1,
   'failed_tests': 0,
   'total_tests': 1},
  'error_category': 'all_pass',
  'bleu_category': 'close',
  'summary': 'The code compiles and passes all tests. It has moderate similarity to the reference.'},
 '3325': {'problem_id': '3325',
  'internal_counts': {'compile_errors': 0,
   'runtime_errors': 0,
   'passed_tests': 3,
   'failed_tests': 2,
   'total_tests': 5},
  'error_category': 'partial_pass',
  'bleu_category': 'far',
  'summary': 'The code compiles and passes some tests but not all. It is significantly different from the referen

In [10]:
df_data = []
for key, value in textual_errors.items():
    df_data.append({'problem_id': value['problem_id'], 'summary': value['summary']})

error_df = pd.DataFrame(df_data)
print(error_df.head(5))


  problem_id                                            summary
0       1053  The code compiles but does not pass any tests....
1       2645  The code compiles and passes all tests. It has...
2       3325  The code compiles and passes some tests but no...
3       2336  The code fails to compile. Fix compilation iss...
4       4377  The code fails to compile. Fix compilation iss...


In [12]:
apps = load_dataset("codeparrot/apps",split="train")

In [13]:
apps_df = apps.to_pandas()

In [14]:
apps_df

Unnamed: 0,problem_id,question,solutions,input_output,difficulty,url,starter_code
0,0,Polycarp has $n$ different binary words. A wor...,"[""for _ in range(int(input())):\n n = int(i...","{\n ""inputs"": [\n ""4\n4\n0001\n1000\n0011\...",interview,https://codeforces.com/problemset/problem/1259/D,
1,1,Mikhail walks on a Cartesian plane. He starts ...,"[""q=int(input())\n\nfor e in range(q):\n x,...","{\n ""inputs"": [\n ""3\n2 2 3\n4 3 7\n10 1 9...",interview,https://codeforces.com/problemset/problem/1036/B,
2,2,"You are given three sequences: $a_1, a_2, \ldo...","[""import sys\nimport random\nfrom fractions im...","{\n ""inputs"": [\n ""5\n3\n1 1 1\n2 2 2\n3 3...",interview,https://codeforces.com/problemset/problem/1408/A,
3,3,"You have $n$ barrels lined up in a row, number...","[""def solve():\n n, k = map(int,input().spl...","{\n ""inputs"": [\n ""2\n4 1\n5 5 5 5\n3 2\n0...",interview,https://codeforces.com/problemset/problem/1430/B,
4,4,"You are given a permutation $p=[p_1, p_2, \ldo...","[""for _ in range(int(input())):\n input()\n...","{\n ""inputs"": [\n ""3\n6\n4 5 1 3 2 6\n5\n5...",interview,https://codeforces.com/problemset/problem/1265/B,
...,...,...,...,...,...,...,...
4995,4995,Another rewarding day in the fast-paced world ...,"[""class HTMLGen:\n def __init__(self):\n ...","{""fn_name"": ""__init__"", ""inputs"": [], ""outputs...",introductory,https://www.codewars.com/kata/54eecc187f9142cc...,\ndef __init__(self):\n\t
4996,4996,## **Instructions**\n\nThe goal of this kata i...,"[""def fibs_fizz_buzz(n):\n a, b, out = 0, 1...","{""fn_name"": ""fibs_fizz_buzz"", ""inputs"": [], ""o...",introductory,https://www.codewars.com/kata/57bf599f102a39bb...,\ndef fibs_fizz_buzz(n):\n\t
4997,4997,"The function sigma 1, σ1 in mathematics, is kn...","[""cache = {}\ndef sum_div(x):\n if x not in...","{""fn_name"": ""sigma1"", ""inputs"": [], ""outputs"":...",introductory,https://www.codewars.com/kata/5619dbc22e69620e...,\ndef sigma1(n):\n\t
4998,4998,The principal of a school likes to put challen...,"[""def wanted_words(vowels, consonants, forbidd...","{""fn_name"": ""wanted_words"", ""inputs"": [[1, 7, ...",introductory,https://www.codewars.com/kata/580be55ca671827c...,"\ndef wanted_words(n, m, forbid_let):\n\t"


In [17]:
apps_df = apps_df[["problem_id","question","solutions"]]

In [18]:
apps_df

Unnamed: 0,problem_id,question,solutions
0,0,Polycarp has $n$ different binary words. A wor...,"[""for _ in range(int(input())):\n n = int(i..."
1,1,Mikhail walks on a Cartesian plane. He starts ...,"[""q=int(input())\n\nfor e in range(q):\n x,..."
2,2,"You are given three sequences: $a_1, a_2, \ldo...","[""import sys\nimport random\nfrom fractions im..."
3,3,"You have $n$ barrels lined up in a row, number...","[""def solve():\n n, k = map(int,input().spl..."
4,4,"You are given a permutation $p=[p_1, p_2, \ldo...","[""for _ in range(int(input())):\n input()\n..."
...,...,...,...
4995,4995,Another rewarding day in the fast-paced world ...,"[""class HTMLGen:\n def __init__(self):\n ..."
4996,4996,## **Instructions**\n\nThe goal of this kata i...,"[""def fibs_fizz_buzz(n):\n a, b, out = 0, 1..."
4997,4997,"The function sigma 1, σ1 in mathematics, is kn...","[""cache = {}\ndef sum_div(x):\n if x not in..."
4998,4998,The principal of a school likes to put challen...,"[""def wanted_words(vowels, consonants, forbidd..."


In [19]:
with open("../Responses/response.json") as file:
    generated_response = json.load(file)

In [22]:
generated_response[1]

{'task_id': 2,
 'prompt': "You are given three sequences: $a_1, a_2, \\ldots, a_n$; $b_1, b_2, \\ldots, b_n$; $c_1, c_2, \\ldots, c_n$.\n\nFor each $i$, $a_i \\neq b_i$, $a_i \\neq c_i$, $b_i \\neq c_i$.\n\nFind a sequence $p_1, p_2, \\ldots, p_n$, that satisfy the following conditions:\n\n\n\n $p_i \\in \\{a_i, b_i, c_i\\}$\n\n $p_i \\neq p_{(i \\mod n) + 1}$.\n\nIn other words, for each element, you need to choose one of the three possible values, such that no two adjacent elements (where we consider elements $i,i+1$ adjacent for $i<n$ and also elements $1$ and $n$) will have equal value.\n\nIt can be proved that in the given constraints solution always exists. You don't need to minimize/maximize anything, you need to find any proper sequence.\n\n\n-----Input-----\n\nThe first line of input contains one integer $t$ ($1 \\leq t \\leq 100$): the number of test cases.\n\nThe first line of each test case contains one integer $n$ ($3 \\leq n \\leq 100$): the number of elements in the give

In [25]:
error_df_1 = pd.DataFrame(generated_response)
error_df_1.head(5)

Unnamed: 0,task_id,prompt,solution,generated_output
0,1,Mikhail walks on a Cartesian plane. He starts ...,"[""q=int(input())\n\nfor e in range(q):\n x,...",```python\ndef solve():\n q = int(input())\n ...
1,2,"You are given three sequences: $a_1, a_2, \ldo...","[""import sys\nimport random\nfrom fractions im...",```python\ndef solve():\n n = int(input())\n ...
2,3,"You have $n$ barrels lined up in a row, number...","[""def solve():\n n, k = map(int,input().spl...","```python\ndef solve():\n n, k = map(int, inp..."
3,5,The sequence of $m$ integers is called the per...,"[""def possible(a):\n ans = set()\n s = s...",```python\ndef solve():\n n = int(input())\...
4,6,Arthur owns a ski resort on a mountain. There ...,"[""import sys\ninput = sys.stdin.readline\nfor ...","```python\ndef solve():\n n, m = map(int, i..."


In [26]:
error_df_1 = error_df_1.rename(columns={"task_id":"problem_id"})
error_df_1.head(5)

Unnamed: 0,problem_id,prompt,solution,generated_output
0,1,Mikhail walks on a Cartesian plane. He starts ...,"[""q=int(input())\n\nfor e in range(q):\n x,...",```python\ndef solve():\n q = int(input())\n ...
1,2,"You are given three sequences: $a_1, a_2, \ldo...","[""import sys\nimport random\nfrom fractions im...",```python\ndef solve():\n n = int(input())\n ...
2,3,"You have $n$ barrels lined up in a row, number...","[""def solve():\n n, k = map(int,input().spl...","```python\ndef solve():\n n, k = map(int, inp..."
3,5,The sequence of $m$ integers is called the per...,"[""def possible(a):\n ans = set()\n s = s...",```python\ndef solve():\n n = int(input())\...
4,6,Arthur owns a ski resort on a mountain. There ...,"[""import sys\ninput = sys.stdin.readline\nfor ...","```python\ndef solve():\n n, m = map(int, i..."


In [27]:
error_df.head(5)

Unnamed: 0,problem_id,summary
0,1053,The code compiles but does not pass any tests....
1,2645,The code compiles and passes all tests. It has...
2,3325,The code compiles and passes some tests but no...
3,2336,The code fails to compile. Fix compilation iss...
4,4377,The code fails to compile. Fix compilation iss...


In [29]:
type(error_df_1["problem_id"][1])

numpy.int64

In [30]:
type(error_df["problem_id"][1])

str

In [36]:
error_df["problem_id"] = error_df['problem_id'].astype(np.int64)

In [37]:
type(error_df["problem_id"][1])

numpy.int64

In [39]:
error_summary_df = pd.merge(error_df, error_df_1, on='problem_id', how='inner')
error_summary_df.head(5)

Unnamed: 0,problem_id,summary,prompt,solution,generated_output
0,1053,The code compiles but does not pass any tests....,Chef has a binary array in an unsorted manner....,"[""# cook your dish here\nfor _ in range(int(in...",```python\ndef solve():\n n = int(input())\n ...
1,2645,The code compiles and passes all tests. It has...,=====Function Descriptions=====\nitertools.com...,"[""from itertools import *\ns,n = input().split...",```python\nfrom itertools import combinations_...
2,3325,The code compiles and passes some tests but no...,"Given a string, s, return a new string that or...","[""from collections import Counter\n\ndef most_...",```python\nfrom collections import Counter\n\n...
3,2336,The code fails to compile. Fix compilation iss...,Petr likes to come up with problems about rand...,"[""n = int(input())\nl = [int(x) - 1 for x in i...",```python\nn = int(input())\npermutation = lis...
4,4377,The code fails to compile. Fix compilation iss...,Alice and Bob have participated to a Rock Off ...,"[""def solve(a, b):\n alice = sum(i > j for ...","```python\ndef rock_off_winner(alice_scores, b..."


In [40]:
len(error_summary_df)

4882

In [42]:
error_summary_df.tail(5)

Unnamed: 0,problem_id,summary,prompt,solution,generated_output
4877,3357,The code fails to compile. Fix compilation iss...,You are given an array of `n+1` integers `1` t...,"[""def find_dup(arr):\n return sum(arr) - le...",def solve():\n n = int(input())\n arr = list...
4878,2282,The code compiles and passes some tests but no...,A rectangle with sides $A$ and $B$ is cut into...,"[""n =int(input())\nw=[]\nh=[]\nc=[]\ncntw={}\n...",def solve():\n n = int(input())\n arr = list...
4879,582,The code compiles but does not pass any tests....,You may have tried your level best to help Che...,"[""import sys\nimport bisect as bi\nimport math...",def solve():\n n = int(input())\n arr = list...
4880,4504,The code fails to compile. Fix compilation iss...,Share price\n===========\n\nYou spent all your...,"[""def share_price(invested, changes):\n for...",def solve():\n n = int(input())\n arr = list...
4881,4231,The code compiles and passes some tests but no...,- Input: Integer `n`\n- Output: String\n\nExam...,"[""def a(n):\n \""\""\""\n \""\""\""\n if n ...",def solve():\n n = int(input())\n arr = list...


In [43]:
error_summary_df.to_csv("./results/error_summary.csv",index=False)

In [3]:
error_summary_df = pd.read_csv("./results/error_summary.csv")

In [4]:
error_summary_df["summary"].iloc[0]

'The code compiles but does not pass any tests. It has moderate similarity to the reference.'