In [None]:
# Display labels of the evaluated models, kept as (full name, short name) pairs
# so that the two lists below always stay aligned position by position.
_MODEL_LABELS = (
    ('DeepSeek Coder 1.3B', 'DSC 1.3B'),
    ('DeepSeek Coder 6.7B', 'DSC 6.7B'),
    ('DeepSeek Coder 33B', 'DSC 33B'),
    ('CodeLlama 7B', 'CL 7B'),
    ('CodeLlama 13B', 'CL 13B'),
    ('CodeLlama 34B', 'CL 34B'),
    ('GPT-3.5-turbo', 'GPT 3.5'),
    ('GPT-4-turbo', 'GPT 4'),
)

# Full names (used as plot titles) and abbreviated names (used as table labels).
MODEL_NAMES = [full_name for full_name, _ in _MODEL_LABELS]
MODEL_NAMES_SHORT = [short_name for _, short_name in _MODEL_LABELS]

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [None]:
# Language under analysis; it selects both the CoderEval dump and the id list read below.
language = 'java'

# Read the benchmark records through a context manager so the file handle is
# closed as soon as the JSON is parsed (a bare open() left it to the garbage collector).
with open(f'../data/input/CoderEval4{language.capitalize()}.json', encoding = 'utf-8') as instances_file:
    instances = json.load(instances_file)['RECORDS']

# 'id_generatedby' values of the rows that must be kept in the analysis.
id_generatedby_touse = list(pd.read_csv(f'../constants/{language}_id_generatedby_touse.csv').id_generatedby)

# Results

In [None]:
from sklearn import metrics

In [None]:
# Path of the judgement CSV to analyze: choose here the prompt and the language you want to analyze.
# NOTE(review): the file name appears to encode both the language ("java") and the prompt
# ("automatedCoT"); presumably the language must agree with the `language` variable set above — confirm.
input_file = '../../data/code_generation/results/cg_judgement_java_automatedCoT.csv'

In [None]:
# Import the judgements and add two derived fields:
#   - id_generatedby : '<target_id>_<generated_by>' key
#   - level          : the 'level' of the benchmark instance whose '_id' matches target_id
judgments = pd.read_csv(input_file)
# target_id is cast to str so the key is built from the same representation
# that the level lookup below compares against the instance ids.
judgments['id_generatedby'] = judgments['target_id'].astype(str) + '_' + judgments['generated_by']

# Index the instances once instead of rescanning the whole list for every row.
# setdefault keeps the first instance found for an id, like the previous `[...][0]` lookup.
instance_by_id = {}
for instance in instances:
    instance_by_id.setdefault(instance['_id'], instance)

# A target_id with no matching instance raises a KeyError naming the missing id.
level = [instance_by_id[str(tid)]['level'] for tid in judgments.target_id]
judgments['level'] = level

In [None]:
########### RUN THIS CELL IF YOU WANT TO CONSIDER ONLY METHODS WITH NO EXTERNAL DEPENDENCIES ###########
# Optional filter: keep only the rows whose level is one of the two values below.
standalone_levels = ['self_contained', 'slib_runnable']
judgments = judgments[judgments['level'].isin(standalone_levels)]

In [None]:
# Clean the judgements, then normalise every '<judge>_rating' column.
print(f'Shape before cleaning : {judgments.shape}.')
# Exclude all the cases in which the model in charge of the code generation was not able
# to produce a valid prediction. The result is reassigned instead of using inplace=True,
# because `judgments` may itself be a filtered slice of the original frame.
judgments = judgments.dropna(subset = ['generated_code'])
# Keep only the selected rows; .copy() makes the result an independent frame so the
# column assignments below do not raise SettingWithCopyWarning.
judgments = judgments.loc[judgments.id_generatedby.isin(id_generatedby_touse)].copy()
print(f'Shape after cleaning : {judgments.shape}.')
print()

for col in [c for c in judgments.columns if '_rating' in c]:
    # '-' marks a judgement the judge could not produce; every other value becomes an int.
    judgments[col] = judgments[col].apply(lambda x : int(x) if x != '-' else x)
    invalid_count = int((judgments[col] == '-').sum())
    print('{0} was not able to generate a valid judgement {1} times out of {2}'.format(col.split('_rating')[0], invalid_count, judgments.shape[0]))

In [None]:
# Draw one confusion matrix per judge on a 4x2 grid: rows are the test outcome (is_pass),
# columns are the judge's rating. normalize='true' makes every row sum to 1.
num_rows, num_cols = 4, 2
fig, axs = plt.subplots(num_rows, num_cols, sharex = True, sharey = True, figsize = (8,15))
models = ['deepseek-coder-1.3b-instruct', 'deepseek-coder-6.7b-instruct', 'deepseek-coder-33b-instruct', 'CodeLlama-7b-Instruct-hf', 'CodeLlama-13b-Instruct-hf', 'CodeLlama-34b-Instruct-hf', 'gpt-3.5-turbo', 'gpt-4-turbo']
model_label = MODEL_NAMES  # titles, in the same order as `models`

for i in range(num_rows):
    for j in range(num_cols):
        model_idx = num_cols * i + j
        model = models[model_idx]
        rating_col = f'{model}_rating'
        ax = axs[i,j]

        # Skip the rows this judge could not rate ('-').
        judgments_temp = judgments.loc[judgments[rating_col] != '-']
        actual = judgments_temp.is_pass
        # Cast on the extracted series rather than writing the column back into the
        # filtered slice, which is a chained assignment (SettingWithCopyWarning).
        predicted = judgments_temp[rating_col].astype(int)

        confusion_matrix = metrics.confusion_matrix(actual, predicted, normalize = 'true')
        cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix)
        cm_display.plot(cmap = 'Greys', ax = ax, values_format = '.2f')
        cm_display.im_.set_clim(0, 1)  # same colour scale on every panel

        # Title: model name followed by the number of rated rows.
        ax.set_title(f'{model_label[model_idx]} ({judgments_temp.shape[0]})')

        # Axis labels only on the outer panels: bottom row for x, first column for y.
        if i == num_rows - 1:
            ax.set_xlabel('Judged', fontsize = 12)
        else:
            ax.set_xlabel('')
        if j == 0:
            ax.set_ylabel('Test output', fontsize = 12)
        else:
            ax.set_ylabel('')

        ax.set_yticks(ticks = [0, 1], labels = ['fail', 'pass'])
        ax.set_xticks(ticks = [0, 1], labels = ['wrong', 'correct'])

plt.show()

# Kappa score
Compute Cohen's kappa agreement score between the {0, 1} series of the `is_pass` field and the series of ratings given by each LLM judge. For each judge, rows where it produced no valid rating (`-`) are excluded.

In [None]:
from sklearn.metrics import cohen_kappa_score

In [None]:
# Cohen's kappa between the test outcome (is_pass) and each judge's rating.
bool_kappa = []  # one kappa per rating column, formatted with two decimals
for col in [c for c in judgments.columns if '_rating' in c]:
    # Only the rows this judge actually rated take part in its score.
    judgments_kappa = judgments.loc[judgments[col] != '-']
    score = cohen_kappa_score(judgments_kappa['is_pass'], judgments_kappa[col].astype(int))
    # The format string has a single placeholder; the row count that used to be
    # passed as a second argument was silently ignored, so it is no longer passed.
    bool_kappa.append('{0:.2f}'.format(score))
    print(col, ':', score, judgments_kappa.shape)

# Statistical tests

In [None]:
# write csv for statistical analysis
# P-VALUE LLM
# - self judgement VS judgement of all other LLMs
# - self judgement VS judgement of all other LLMs not belonging to the same family
# - self judgement VS judgement of human written functions
#
# Every value written is (rating - is_pass) for one judged candidate.

def _pad_with_nan(values, length):
    """Return `values` extended with trailing NaNs up to `length` entries (unchanged if already that long)."""
    missing = length - values.shape[0]
    if missing <= 0:
        return values
    # A single concatenation instead of appending one NaN per iteration.
    return np.concatenate((values, np.full(missing, np.nan)))

for col in [c for c in judgments.columns if '_rating' in c]:
    # Rows this judge actually rated, restricted to the three columns used below.
    judgments_temp = judgments.loc[judgments[col] != '-'][['generated_by', col, 'is_pass']]
    judge = col.split('_rating')[0]
    family_name = col.split('-')[0]  # text before the first '-', e.g. 'gpt' for 'gpt-4-turbo_rating'

    # Difference between the judgement and the test outcome, computed once for all groups.
    rating_minus_pass = judgments_temp[col] - judgments_temp.is_pass
    by_judge = judgments_temp.generated_by == judge
    by_human = judgments_temp.generated_by == 'human_written'
    # Literal substring match: the family name is not meant to be a regular expression.
    by_family = judgments_temp.generated_by.str.contains(family_name, regex = False)

    itsown = np.array(rating_minus_pass[by_judge]) # judgements that the judge has given to the candidates proposed by the judge itself
    all_but_family = np.array(rating_minus_pass[~by_family & ~by_human]) # judgements that the judge has given to the candidates proposed by all the other LLMs not belonging to its family
    all_others = np.array(rating_minus_pass[~by_judge & ~by_human]) # judgements that the judge has given to the candidates proposed by all the other LLMs
    human = np.array(rating_minus_pass[by_human]) # judgements that the judge has given to the target methods

    # The columns of one DataFrame must have equal length: pad the shorter groups with NaN.
    # Padding to the longest group (not only to all_others) also covers the case in which
    # another group happens to be the longest.
    n_rows = max(itsown.shape[0], all_but_family.shape[0], all_others.shape[0], human.shape[0])

    pd.DataFrame({
        'Model' : _pad_with_nan(itsown, n_rows),
        'all_LLM' : _pad_with_nan(all_others, n_rows),
        'all_Minus' : _pad_with_nan(all_but_family, n_rows),
        'humans' : _pad_with_nan(human, n_rows)
    }).to_csv(f'../../2_llms_as_judge/results/tse/cg_judgement/Ranalysis/{judge}_judgments-vs-others.csv', index = False)

# Self bias

In [None]:
# Self-bias table: one row per judge, one column per candidate generator.
# Each cell is the mean of (rating - is_pass) over the candidates of that generator
# that the judge was able to rate.
models = ['deepseek-coder-1.3b-instruct', 'deepseek-coder-6.7b-instruct', 'deepseek-coder-33b-instruct', 'CodeLlama-7b-Instruct-hf', 'CodeLlama-13b-Instruct-hf', 'CodeLlama-34b-Instruct-hf', 'gpt-3.5-turbo', 'gpt-4-turbo', 'human_written']
# Build the rating columns from the judge list (every model except 'human_written') so that
# the row order is guaranteed to match MODEL_NAMES_SHORT, whatever the column order of the CSV.
rating_cols = [f'{judge_model}_rating' for judge_model in models if judge_model != 'human_written']
# NaN-initialised: a judge/candidate pair with no rated rows stays NaN instead of dividing by zero.
battle_ship = np.full((len(rating_cols), len(models)), np.nan)

for row, judge in enumerate(rating_cols):
    rated = judgments.loc[judgments[judge] != '-']  # rows this judge actually rated
    for col, candidate in enumerate(models):
        bool_temp = rated.loc[rated.generated_by == candidate]
        if bool_temp.shape[0] > 0:
            battle_ship[row, col] = (bool_temp[judge] - bool_temp.is_pass).sum() / bool_temp.shape[0]

# Rows are judges, columns are candidate generators (the last one is the human-written code).
df = pd.DataFrame(battle_ship, index = MODEL_NAMES_SHORT, columns = MODEL_NAMES_SHORT + ['Human Written'])
print(df.to_latex(index = True, float_format = "{:.2f}".format))