In [1]:
%autoreload 2

In [2]:
from collections import defaultdict
import copy
import itertools
import os
import sys

from IPython.display import display, Markdown, HTML
from Levenshtein import distance as _edit_distance
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

sys.path.append(os.path.abspath('..'))

import llm_feedback.pilot.tasks as tasks
from llm_feedback.pilot.tasks import mbpp
from llm_feedback.utils.io import read_json


In [3]:
# Path of the saved run to analyze. The commented-out assignments are
# alternative runs; swap which line is active to analyze a different one.
# MBPP_OUTPUT_PATH = '../outputs/gpt-3.5-turbo-0613__gpt-3.5-turbo-0613__gpt-3.5-turbo-0613__mbpp-test-gen__train__2023_07_23_outputs.jsonl'
# MBPP_OUTPUT_PATH = '../outputs/gpt-3.5-turbo-0613__gpt-3.5-turbo-0613__gpt-3.5-turbo-0613__mbpp-test-gen__train__shuffle_test__2023_07_23_outputs.jsonl'
MBPP_OUTPUT_PATH = '../outputs/gpt-3.5-turbo-0613__gpt-3.5-turbo-0613__gpt-3.5-turbo-0613__mbpp-test-gen__train__2023_07_24_outputs.jsonl'

# NOTE(review): the file has a .jsonl extension but is loaded with read_json —
# confirm the helper handles JSON-lines input.
# The cells below index the result by position and read per-problem keys from
# each entry, i.e. they use it as a sequence of dicts.
mbpp_outputs = read_json(MBPP_OUTPUT_PATH)

In [4]:
# TEST_ID_KEYS = ['0', '1', '2']
GOLD_TEST_KEYS = ('test_list_0', 'test_list_1', 'test_list_2')
INITIAL_SOLUTION = 'initial_solution'
GOLD_CODE = 'gold_code'
REFINEMENT = 'refinement'
COMPLETION_ID_KEY = 'completion_id'
RESULT_FIELD = 'result'
PASSED_FIELD = 'passed'
GOLD_TEST = 'gold_test'
MODEL_TEST = 'model_test'

# Order matters: each result's completion id is used as an index into this list.
SOLUTION_TYPES = [INITIAL_SOLUTION, GOLD_CODE]
if REFINEMENT in mbpp_outputs[0]:
    SOLUTION_TYPES.append(REFINEMENT)

TEST_TYPES = [GOLD_TEST, MODEL_TEST]


def _extract_missing_name(result_message):
    """Return the identifier named in an "... is not defined" message, or None.

    The identifier is taken as the token following the first lowercase 'name'
    in the message, with single quotes stripped.
    """
    if 'is not defined' not in result_message.lower():
        return None
    name_index = result_message.find('name')
    if name_index == -1:
        return None
    space_index = result_message.find(' ', name_index)
    next_space_index = result_message.find(' ', space_index + 1)
    if next_space_index == -1:
        # The identifier is the last token: slice to the end instead of
        # letting -1 silently drop the final character.
        next_space_index = len(result_message)
    return result_message[space_index + 1:next_space_index].replace("'", '')


def _extract_exception_type(result_message):
    """Return the quoted class name from a "<class '...'>" fragment, or None."""
    if '<class' not in result_message:
        return None
    exception_class_index = result_message.find('<class')
    exception_start = result_message.find("'", exception_class_index)
    exception_end = result_message.find("'", exception_start + 1)
    return result_message[exception_start + 1:exception_end]


# accuracy_by_problem[solution][test_type] -> one mean pass rate per problem.
accuracy_by_problem = {solution: {test_type: [] for test_type in TEST_TYPES} for solution in SOLUTION_TYPES}
# One {solution: {test_type: [passed, ...]}} dict per problem, in problem order.
all_problem_result_summaries = []

# Problem ids grouped by undefined name / by exception class seen in result messages.
missing_names = defaultdict(set)
exception_types = defaultdict(set)

for problem_id, problem_results in enumerate(mbpp_outputs):
    problem_results_summary = {solution: {test_type: [] for test_type in TEST_TYPES} for solution in SOLUTION_TYPES}
    # Per-test results are stored under numeric string keys ('0', '1', ...).
    test_id_keys = [key for key in problem_results.keys() if key.isdigit()]
    # Sort numerically: a plain string sort puts '10' before '3', which would
    # scramble the order of the per-test result lists once there are 10+ tests.
    for test_id in sorted(test_id_keys, key=int):
        test_results = problem_results[test_id]
        # The first len(GOLD_TEST_KEYS) test ids are treated as gold tests, the rest as model tests.
        test_type = GOLD_TEST if int(test_id) < len(GOLD_TEST_KEYS) else MODEL_TEST
        # Each entry is a pair; only the second element (the result dict) is used here.
        for _entry_index, code_test_results in test_results:
            code_type = SOLUTION_TYPES[code_test_results[COMPLETION_ID_KEY]]
            code_test_result = code_test_results[RESULT_FIELD]
            if code_test_result is None:
                print(f'Problem {problem_id}, test {test_id} has result None')
                # Nothing to parse; use an empty message so the checks below are skipped
                # instead of crashing on None.
                code_test_result = ''

            missing_name = _extract_missing_name(code_test_result)
            if missing_name is not None:
                # print(f'Problem {problem_id}, test {test_id} has definition issues: {code_test_result}')
                missing_names[missing_name].add(problem_id)

            exception_type = _extract_exception_type(code_test_result)
            if exception_type is not None:
                exception_types[exception_type].add(problem_id)

            code_test_passed = code_test_results[PASSED_FIELD]
            if code_test_passed is None:
                print(f'Problem {problem_id}, test {test_id} has passed None')
                code_test_passed = False
            problem_results_summary[code_type][test_type].append(code_test_passed)

    all_problem_result_summaries.append(problem_results_summary)
    for model_type in SOLUTION_TYPES:
        for test_type in TEST_TYPES:
            model_problem_results = problem_results_summary[model_type][test_type]
            if model_type == INITIAL_SOLUTION and test_type == GOLD_TEST:
                # NOTE(review): the first gold test is excluded for the initial solution only —
                # presumably because it was shown to the model; confirm, and confirm whether
                # the refinement should be treated the same way.
                model_problem_results = model_problem_results[1:]
            accuracy_by_problem[model_type][test_type].append(np.mean(model_problem_results))


for model_key, key_results in accuracy_by_problem.items():
    for test_key, test_key_results in key_results.items():
        print(f'{model_key} {test_key} accuracy: {np.mean(test_key_results)}')


# Number of model-written tests per problem: mean, std, min, max.
num_model_tests_by_problem = [len(t[INITIAL_SOLUTION][MODEL_TEST]) for t in all_problem_result_summaries]
print(np.mean(num_model_tests_by_problem), np.std(num_model_tests_by_problem), np.min(num_model_tests_by_problem), np.max(num_model_tests_by_problem))

# print('Problems with fewest model tests:')
# for problem_id in np.argsort(num_model_tests_by_problem)[:10]:
#     print(f'Problem {problem_id}: {num_model_tests_by_problem[problem_id]}')

initial_solution gold_test accuracy: 0.83
initial_solution model_test accuracy: 0.7279762459762459
gold_code gold_test accuracy: 1.0
gold_code model_test accuracy: 0.6748413253413255
refinement gold_test accuracy: 0.8266666666666667
refinement model_test accuracy: 0.7228666611166611
6.05 3.0639027399707057 3 21


In [13]:
# For every test, compare the initial solution's outcome with the refinement's
# and record, per test type, the problems where the outcome changed.
# examples_with_change[test_type] holds one problem index per changed test
# (so a problem can appear several times).
examples_with_change = {}

if REFINEMENT not in SOLUTION_TYPES:
    # Refinement results are optional in the outputs; without them there is nothing to compare.
    print(f'No {REFINEMENT} results in this run; skipping comparison.')
else:
    for test_type in TEST_TYPES:
        refinement_better_count = 0
        refinement_worse_count = 0
        examples_with_change[test_type] = []

        for i, rs in enumerate(all_problem_result_summaries):
            initial_results = rs[INITIAL_SOLUTION][test_type]
            refinement_results = rs[REFINEMENT][test_type]
            for j, (initial_passed, refinement_passed) in enumerate(zip(initial_results, refinement_results)):
                if initial_passed == refinement_passed:
                    continue
                examples_with_change[test_type].append(i)
                # Name the actual test type; the message previously said "model test" for gold tests too.
                print(f'Problem {i}, test {j} {test_type} results differ: {initial_passed}, {refinement_passed}')
                if refinement_passed:
                    refinement_better_count += 1
                else:
                    refinement_worse_count += 1

        print(f'On test type {test_type}, refinement better: {refinement_better_count}, worse: {refinement_worse_count}\n')

    print(f'Examples with change in both test types: {set(examples_with_change[GOLD_TEST]).intersection(set(examples_with_change[MODEL_TEST]))}')

Problem 3, test 1 model test results differ: True, False
Problem 8, test 2 model test results differ: False, True
Problem 40, test 0 model test results differ: True, False
Problem 40, test 1 model test results differ: True, False
Problem 40, test 2 model test results differ: True, False
Problem 44, test 0 model test results differ: True, False
Problem 44, test 1 model test results differ: True, False
Problem 44, test 2 model test results differ: True, False
Problem 45, test 0 model test results differ: True, False
Problem 45, test 1 model test results differ: True, False
Problem 59, test 0 model test results differ: False, True
Problem 59, test 2 model test results differ: False, True
Problem 67, test 0 model test results differ: False, True
Problem 67, test 1 model test results differ: False, True
Problem 67, test 2 model test results differ: False, True
On test type gold_test, refinement better: 6, worse: 9

Problem 8, test 1 model test results differ: True, False
Problem 8, test 2 m

### Matched-pairs t-tests

In [None]:
# Paired t-test between every pair of solution types, separately per test type.
# Each sample is the flat vector of per-test pass/fail outcomes (as floats)
# across all problems, so the two vectors are matched test-by-test.
for test_type in TEST_TYPES:
    for first_model_type, second_model_type in itertools.combinations(SOLUTION_TYPES, 2):
        first_model_test_results = np.array(
            list(itertools.chain.from_iterable(
                rs[first_model_type][test_type] for rs in all_problem_result_summaries
            )),
            dtype=float,
        )
        second_model_test_results = np.array(
            list(itertools.chain.from_iterable(
                rs[second_model_type][test_type] for rs in all_problem_result_summaries
            )),
            dtype=float,
        )

        result = stats.ttest_rel(first_model_test_results, second_model_test_results)

        print(f'For {test_type} tests on {first_model_type} vs {second_model_type}: {result.statistic:.4f}, {result.pvalue:.4f}')




In [None]:
# For each problem, the edit distance between every (model test, gold test)
# pair, to see how close the model-written tests are to the gold tests.
edit_distances = []

for i, output in enumerate(mbpp_outputs):
    # Sets: duplicated test strings within a problem are only counted once.
    model_tests = set(output['model_test_cases'])
    gold_tests = set(output['test_cases'])
    intersection = model_tests.intersection(gold_tests)
    if intersection:
        print(f'Found overlapping tests in #{i}: {intersection}')

    output_edit_distances = [
        _edit_distance(model_test, gold_test)
        for model_test in model_tests
        for gold_test in gold_tests
    ]
    edit_distances.append(output_edit_distances)


# A problem with no model tests (or no gold tests) has no pairs; record NaN for
# it rather than crashing on min([]), and keep positions aligned with problem ids.
min_edit_distances = [min(dists) if dists else np.nan for dists in edit_distances]
mean_edit_distances = [np.mean(dists) if dists else np.nan for dists in edit_distances]

# nanmean equals mean when every problem has at least one pair.
print(f'Mean-min edit distance: {np.nanmean(min_edit_distances)}')
print(f'Mean-mean edit distance: {np.nanmean(mean_edit_distances)}')

# Up to ten problems with the smallest minimum distance (NaN entries sort last).
# Slicing instead of range(10) avoids an IndexError when there are fewer than 10 problems.
sorted_indices = np.argsort(min_edit_distances)
for i, idx in enumerate(sorted_indices[:10]):
    print(f'#{i} ({idx}) min edit distance: {min_edit_distances[idx]}')

In [None]:
def visualize_outputs(index):
    """Display one problem as Markdown: its text, then a Python code block.

    The code block contains the gold code, the model's initial solution, and
    every gold and model test, each preceded by a comment stating whether the
    gold code and the initial solution passed it (looked up by position in
    all_problem_result_summaries).

    Args:
        index: position of the problem in mbpp_outputs.
    """
    output = mbpp_outputs[index]
    display(Markdown(f'Problem text: {output["text"]}'))

    code_block_lines = ['```python']
    code_block_lines += ['# Gold code:', output['gold_code'], '']
    code_block_lines += ['# Model code:', output['initial_solution'], '']

    def _annotated_tests(label, tests, test_type):
        # One header comment plus the test source per test, in order.
        annotated = []
        for position, test in enumerate(tests):
            gold_passed = all_problem_result_summaries[index][GOLD_CODE][test_type][position]
            model_passed = all_problem_result_summaries[index][INITIAL_SOLUTION][test_type][position]
            annotated.append(f'# {label} Test #{position} (Gold passed = {gold_passed}, Model passed = {model_passed}):')
            annotated.append(test)
        return annotated

    code_block_lines.extend(_annotated_tests('Gold', output['test_cases'], GOLD_TEST))
    code_block_lines.append('')
    code_block_lines.extend(_annotated_tests('Model', output['model_test_cases'], MODEL_TEST))
    code_block_lines.append('```')

    display(Markdown('\n'.join(code_block_lines)))

In [None]:
# For problems where the initial solution does not pass all scored gold tests,
# collect how the gold code fares on the model-written tests and plot the
# distribution.
initial_solution_gold_accuracies = accuracy_by_problem[INITIAL_SOLUTION][GOLD_TEST]

gold_code_model_test_accuracies = []
for i, acc in enumerate(initial_solution_gold_accuracies):
    if acc == 1.0:
        continue
    gold_code_model_test_acc = accuracy_by_problem[GOLD_CODE][MODEL_TEST][i]
    print(i, acc, gold_code_model_test_acc)
    gold_code_model_test_accuracies.append(gold_code_model_test_acc)

plt.hist(gold_code_model_test_accuracies, bins=10)
plt.title('(Gold Code | Model Tests) Accuracy where (Model Code | Gold Tests) Accuracy < 1.0')
# accuracy_by_problem

In [None]:
# Indices of the problems whose text mentions 'prime'.
[
    problem_index
    for problem_index, problem in enumerate(mbpp_outputs)
    if 'prime' in problem['text']
]

In [None]:
# Render problem 84: text, gold/model code, and per-test pass results.
visualize_outputs(84)


In [None]:
# NOTE(review): model_solution_test_results is not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel —
# presumably left over from an earlier version; confirm and remove or redefine.
len(model_solution_test_results)

In [None]:
# List the problems where the initial solution passes a larger share of the
# model-written tests than the gold code does.
initial_solution_model_accuracies = accuracy_by_problem[INITIAL_SOLUTION][MODEL_TEST]
for i, acc in enumerate(initial_solution_model_accuracies):
    gold_code_model_test_acc = accuracy_by_problem[GOLD_CODE][MODEL_TEST][i]
    if not acc > gold_code_model_test_acc:
        continue
    print(i, acc, gold_code_model_test_acc)


In [None]:
# Render problem 48: text, gold/model code, and per-test pass results.
visualize_outputs(48)


In [None]:
# Print every first gold test that does not contain exactly one '=='.
for out in mbpp_outputs:
    first_gold_test = out['test_list_0']
    if first_gold_test.count('==') == 1:
        continue
    print(first_gold_test)

In [None]:
# Split problem 48's three gold tests on '==' into the call side and the
# expected-value side.
problem_48 = mbpp_outputs[48]
tl = [problem_48[key] for key in ('test_list_0', 'test_list_1', 'test_list_2')]
split_tests = [t.split('==') for t in tl]
cases, expected_outputs = zip(*split_tests)

In [None]:
import datetime

# Current local date formatted as YYYY_MM_DD.
f'{datetime.datetime.now():%Y_%m_%d}'

In [None]:
# Mean per-problem accuracy for every (solution type, test type) pair.
# accuracy_by_problem maps solution type -> test type -> list of per-problem
# accuracies, so the mean has to be taken one level down; calling np.mean on
# the inner dict itself raises a TypeError.
accuracy_by_solution_type = {
    solution_type: {
        test_type: np.mean(problem_accuracies)
        for test_type, problem_accuracies in accuracies_by_test_type.items()
    }
    for solution_type, accuracies_by_test_type in accuracy_by_problem.items()
}
accuracy_by_solution_type

In [None]:
# Fraction of problems on which every scored test passed, for every
# (solution type, test type) pair.
# accuracy_by_problem maps solution type -> test type -> list of per-problem
# accuracies; comparing the inner dict itself to 1.0 silently yields 0.0, so
# the comparison is done on each per-test-type list instead.
full_accuracy_rate_by_solution_type = {
    solution_type: {
        test_type: np.mean(np.array(problem_accuracies) == 1.0)
        for test_type, problem_accuracies in accuracies_by_test_type.items()
    }
    for solution_type, accuracies_by_test_type in accuracy_by_problem.items()
}
full_accuracy_rate_by_solution_type

In [None]:
# Per-problem change in test pass rate from the initial solution to the refinement.
#
# accuracy_by_problem is keyed by solution type and then by test type, so it
# cannot be indexed by problem position directly (doing so raises KeyError).
# A test type has to be chosen.
# NOTE(review): gold tests are used here as the reference — confirm this is the
# intended test set.
FEEDBACK_CHANGE_TEST_TYPE = GOLD_TEST

# Pass rates are recomputed from the raw per-test results so that both solution
# types are scored on exactly the same tests (accuracy_by_problem drops the
# first gold test for the initial solution only).
feedback_acuracy_change_by_problem = np.array([
    np.mean(rs[REFINEMENT][FEEDBACK_CHANGE_TEST_TYPE]) - np.mean(rs[INITIAL_SOLUTION][FEEDBACK_CHANGE_TEST_TYPE])
    for rs in all_problem_result_summaries
])
# Positions of the problems whose pass rate changed at all.
feedback_change_indices = np.where(feedback_acuracy_change_by_problem != 0)[0]
feedback_acuracy_change_by_problem[feedback_change_indices], feedback_change_indices, len(feedback_change_indices)

In [None]:
# Show the feedback text for every problem whose pass rate dropped after refinement.
for i, delta in enumerate(feedback_acuracy_change_by_problem):
    if not delta < 0:
        continue
    feedback_text = mbpp_outputs[i]['feedback']
    print(i, feedback_text)

In [None]:
# Exception classes of interest: SyntaxError, ZeroDivisionError, TypeError, NotImplementedError
# Indices to re-check: 34, 52, 57, 85
# Set of problem ids whose result messages mentioned this exception class.
# exception_types is a defaultdict(set), so looking up a class that never
# occurred returns (and inserts) an empty set rather than raising.
exception_types['SyntaxError']

In [None]:
# Inspect a single problem end to end: prompt, initial solution, feedback,
# refinement, then each gold test with the stored result messages.
i = 57
print(mbpp_outputs[i]['text'])
print(mbpp_outputs[i]['initial_solution'])
print(mbpp_outputs[i]['feedback'])
print(mbpp_outputs[i]['refinement'])

# TEST_ID_KEYS only exists as a commented-out line in the constants cell, so
# define it here to keep this cell runnable on a fresh kernel.
TEST_ID_KEYS = ['0', '1', '2']

for test_id in TEST_ID_KEYS:
    print(mbpp_outputs[i][f'test_list_{test_id}'])
    # Each stored entry is a pair whose second element is the result dict.
    results = [test_results[1]['result'] for test_results in mbpp_outputs[i][test_id]]
    print(results)


In [None]:
import heapq

def heap_sort(lst):
    """Return the items of ``lst`` in ascending order as a new list.

    Every item is pushed onto a min-heap, then the heap is drained by popping
    the smallest remaining item each time. The input is not modified.
    """
    min_heap = []
    for item in lst:
        heapq.heappush(min_heap, item)

    # len(min_heap) is evaluated once, so this pops every pushed item exactly once.
    return [heapq.heappop(min_heap) for _ in range(len(min_heap))]

# Sanity checks: heap_sort must return each input list in ascending order
# (the second case includes a duplicate value).
assert heap_sort([1, 3, 5, 7, 9, 2, 4, 6, 8, 0])==[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
assert heap_sort([25, 35, 22, 85, 14, 65, 75, 25, 58])==[14, 22, 25, 25, 35, 58, 65, 75, 85]
assert heap_sort( [7, 1, 9, 5])==[1,5,7,9]


In [None]:
# Raw stored entries for test id '0' of problem i.
# NOTE(review): i is whatever an earlier cell last assigned (hidden state) —
# consider using an explicit problem index here.
mbpp_outputs[i]['0']

In [None]:
# Histogram of the per-problem change over all problems (including zeros).
fig, ax = plt.subplots()
ax.hist(feedback_acuracy_change_by_problem, bins=20)
plt.show()

# Same distribution restricted to the problems where the pass rate changed.
fig, ax = plt.subplots()
ax.hist(feedback_acuracy_change_by_problem[feedback_change_indices])
ax.set_title('Change in test pass rate after feedback')
ax.set_ylabel('Count')
ax.set_xlabel('Change in test pass rate')
plt.show()

In [None]:
# Setup for re-running solutions against tests with the code-eval metric.
from evaluate import load
import re
# Metric object used below via code_eval.compute(...).
code_eval = load("guydav/restrictedpython_code_eval")
# Matches a Markdown code fence: three backticks plus an optional language tag.
# Used to strip fences from solutions before they are executed.
markdown_pattern = re.compile(r"```\w*")

import os
# NOTE(review): presumably the opt-in switch the code-eval metric requires
# before it will execute code — confirm against the metric's documentation.
os.environ["HF_ALLOW_CODE_EVAL"] = '1'

# Passed to code_eval.compute as `allowed_imports`.
ALLOWED_IMPORTS = ['typing', 'collections', 'math', 're', 'heapq', 'itertools', 'sys']
# Passed to code_eval.compute as `additional_globals`: builtins exposed by name.
DEFAULT_ADDITIONAL_GLOBALS = {
    'all': all,
    'dict': dict,
    'filter': filter,
    'map': map,
    'max': max,
    'min': min,
    'sum': sum,
    'enumerate': enumerate,
    'reversed': reversed,
    'iter': iter,
}

In [None]:


# Re-run a single problem through the code-eval metric: the initial solution
# and the gold code are each executed against the three gold tests plus the
# test cases parsed out of the feedback text.
# output = mbpp_outputs[95]
output = copy.deepcopy(mbpp_outputs[52])

model_test_cases = mbpp._parse_test_cases(output['feedback'])
gold_test_cases = [output[key] for key in ('test_list_0', 'test_list_1', 'test_list_2')]
test_cases = gold_test_cases + model_test_cases

raw_solutions = [output['initial_solution'], output['gold_code']]
setup_code = output['test_setup_code']

solutions = []
for raw_solution in raw_solutions:
    # Strip Markdown fences and surrounding whitespace, then drop '(object)'.
    solution = markdown_pattern.sub('', raw_solution).strip().replace('(object)', '')
    if setup_code:
        # Setup code, when present, goes on the lines before the solution.
        solution = '\n'.join([setup_code, solution])
    solutions.append(solution)

# Every test case is evaluated against the same list of candidate solutions.
evaluation = code_eval.compute(
    references=test_cases,
    predictions=[solutions] * len(test_cases),
    k=[len(solutions)],
    allowed_imports=ALLOWED_IMPORTS,
    additional_globals=DEFAULT_ADDITIONAL_GLOBALS,
    timeout=60,
    allow_str_format=True,
    allow_underscore_variable_names=True,
)
# Keep only the second element of the metric's return value.
r = evaluation[1]  # type: ignore

r