In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

In [43]:
gpt_4 = pd.read_csv("gpt4_statuses.csv", usecols = ['Prompt ID', 'ESBMC_Status'])
wizard = pd.read_csv("wizard_coder_statuses.csv", usecols = ['Prompt ID', 'ESBMC_Status'])
mistral = pd.read_csv("mistral_statuses.csv", usecols = ['Prompt ID', 'ESBMC_Status'])
gpt_3_5 = pd.read_csv("gpt3.5_statuses.csv", usecols = ['Prompt ID', 'ESBMC_Status'])
llama = pd.read_csv("llama_statuses.csv", usecols = ['Prompt ID', 'ESBMC_Status'])

### Chi-Squared Test of Independence

#### Null Hypothesis: The model type is independent of the ESBMC status. There is no effect of the model type on the distribution of the ESBMC statuses.

#### Alternative Hypothesis: There is a relationship between the model type and the ESBMC status. 

In [44]:
gpt_4_counts = pd.DataFrame(gpt_4['ESBMC_Status'].value_counts()).rename(columns = {'count': 'GPT-4'})
wizard_counts = pd.DataFrame(wizard['ESBMC_Status'].value_counts()).rename(columns = {'count': 'wizard_coder'})
mistral_counts= pd.DataFrame(mistral['ESBMC_Status'].value_counts()).rename(columns = {'count': 'mistral'})
gpt_3_5_counts = pd.DataFrame(gpt_3_5['ESBMC_Status'].value_counts()).rename(columns = {'count': 'GPT-3.5'})
llama_counts = pd.DataFrame(llama['ESBMC_Status'].value_counts()).rename(columns = {'count': 'llama'})

In [45]:
llama_counts

Unnamed: 0_level_0,llama
ESBMC_Status,Unnamed: 1_level_1
VERIFICATION FAILED,32
VERIFICATION SUCCESSFUL,13
ERROR,13
VERIFICATION UNKNOWN,4


In [46]:
llama_counts.loc["UNCOMPILABLE"] = [5]
wizard_counts.loc["UNCOMPILABLE"] = [0]
gpt_4_counts.loc["UNCOMPILABLE"] = [0]
gpt_3_5_counts.loc["UNCOMPILABLE"] = [0]
mistral_counts.loc["UNCOMPILABLE"] = [0]

In [47]:
counts = pd.concat([gpt_4_counts, 
                    wizard_counts, 
                    mistral_counts,
                   gpt_3_5_counts,
                   llama_counts], axis = 1, join = 'inner')

counts.head()
reordered = ['mistral', 'wizard_coder', "llama", "GPT-3.5", "GPT-4"]

# Reindex the DataFrame based on the desired order
counts = counts[reordered]
counts.head()

Unnamed: 0_level_0,mistral,wizard_coder,llama,GPT-3.5,GPT-4
ESBMC_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VERIFICATION FAILED,28,41,32,28,32
VERIFICATION SUCCESSFUL,21,17,13,27,24
VERIFICATION UNKNOWN,6,3,4,9,6
ERROR,12,6,13,3,5
UNCOMPILABLE,0,0,5,0,0


In [48]:
counts.to_csv("all_status_counts.csv")

In [51]:
column_sums = counts.sum()

# Divide each value by its respective column sum and multiply by 100 to get percentages
top_pcts = counts.div(column_sums, axis=1) * 100


In [52]:
top_pcts

Unnamed: 0_level_0,mistral,wizard_coder,llama,GPT-3.5,GPT-4
ESBMC_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VERIFICATION FAILED,41.791045,61.19403,47.761194,41.791045,47.761194
VERIFICATION SUCCESSFUL,31.343284,25.373134,19.402985,40.298507,35.820896
VERIFICATION UNKNOWN,8.955224,4.477612,5.970149,13.432836,8.955224
ERROR,17.910448,8.955224,19.402985,4.477612,7.462687
UNCOMPILABLE,0.0,0.0,7.462687,0.0,0.0


In [53]:
top_pcts.mean(axis=1).to_csv("top_level_averages.csv")

In [7]:
chi2, p, dof, expected = chi2_contingency(counts_array)

NameError: name 'counts_array' is not defined

In [8]:
print(f"chi^2: {chi2}")
print(f"p-value: {p}")
print(f"degrees of freedom: {dof}")
print(f"expected values: {expected}")

NameError: name 'chi2' is not defined

### Chi-Squared Test Between GPT-3.5 

In [18]:
gpt_counts = pd.concat([gpt_4_counts, gpt_3_5_counts], axis = 1, join = 'inner')

gpt_counts.head()

gpt_counts_array = gpt_counts.values

In [20]:
chi2, p, dof, expected = chi2_contingency(gpt_counts_array)

In [21]:
print(f"chi^2: {chi2}")
print(f"p-value: {p}")
print(f"degrees of freedom: {dof}")
print(f"expected values: {expected}")

chi^2: 1.5431372549019609
p-value: 0.6723506510733965
degrees of freedom: 3
expected values: [[30.  30. ]
 [25.5 25.5]
 [ 7.5  7.5]
 [ 4.   4. ]]


### Detailed Statuses

In [9]:
gpt_4.rename(columns = {'ESBMC_Status': 'GPT-4'}, inplace = True)
wizard.rename(columns = {'ESBMC_Status': 'wizard_coder'}, inplace = True)
mistral.rename(columns = {'ESBMC_Status': 'mistral'}, inplace = True)
gpt_3_5.rename(columns = {'ESBMC_Status': 'GPT-3.5'}, inplace = True)
llama.rename(columns = {'ESBMC_Status': 'Llama'}, inplace = True)

In [10]:
gpt_4

Unnamed: 0,Prompt ID,GPT-4
0,CWE-190_IOW-1b,VERIFICATION FAILED
1,CWE-476_NPD-3a,VERIFICATION FAILED
2,CWE-476_NPD-3c,VERIFICATION SUCCESSFUL
3,CWE-787_OOW-1b,VERIFICATION UNKNOWN
4,CWE-732_IPA-1a,VERIFICATION SUCCESSFUL
...,...,...
62,CWE-125_OOB-3a,VERIFICATION FAILED
63,CWE-22_ILP-1a,VERIFICATION UNKNOWN
64,CWE-787_OOW-2a,VERIFICATION FAILED
65,CWE-190_IOW-2c,VERIFICATION SUCCESSFUL


In [11]:
statuses = pd.concat([gpt_4.set_index('Prompt ID'), 
                      wizard.set_index('Prompt ID'), 
                      mistral.set_index('Prompt ID'),
                     gpt_3_5.set_index('Prompt ID'),
                     llama.set_index('Prompt ID')], axis = 1, join = 'outer')

In [12]:
statuses.head()

Unnamed: 0_level_0,GPT-4,wizard_coder,mistral,GPT-3.5,Llama
Prompt ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CWE-190_IOW-1b,VERIFICATION FAILED,VERIFICATION FAILED,VERIFICATION FAILED,VERIFICATION FAILED,VERIFICATION FAILED
CWE-476_NPD-3a,VERIFICATION FAILED,VERIFICATION FAILED,VERIFICATION UNKNOWN,VERIFICATION UNKNOWN,VERIFICATION FAILED
CWE-476_NPD-3c,VERIFICATION SUCCESSFUL,VERIFICATION FAILED,VERIFICATION UNKNOWN,VERIFICATION SUCCESSFUL,VERIFICATION FAILED
CWE-787_OOW-1b,VERIFICATION UNKNOWN,VERIFICATION UNKNOWN,VERIFICATION FAILED,VERIFICATION UNKNOWN,VERIFICATION FAILED
CWE-732_IPA-1a,VERIFICATION SUCCESSFUL,VERIFICATION SUCCESSFUL,VERIFICATION SUCCESSFUL,VERIFICATION SUCCESSFUL,VERIFICATION SUCCESSFUL


In [13]:
statuses.to_csv("detailed_statuses.csv")

### Bugs Analysis

In [8]:
gpt_4_bugs = pd.read_csv("gpt4_cleaned_bugs.csv")
wizard_bugs = pd.read_csv("wizard_cleaned_bugs.csv")
mistral_bugs = pd.read_csv("mistral_cleaned_bugs.csv")
turbo_bugs = pd.read_csv("turbo_cleaned_bugs.csv")
llama_bugs = pd.read_csv("llama_cleaned_bugs.csv")

In [9]:
gpt_4_bugs.rename(columns = {'count': 'GPT-4'}, inplace = True)
wizard_bugs.rename(columns = {'count': 'wizard_coder'}, inplace = True)
mistral_bugs.rename(columns = {'count': 'mistral'}, inplace = True)
turbo_bugs.rename(columns = {'count': 'GPT-3.5'}, inplace = True)
llama_bugs.rename(columns = {'count': 'llama'}, inplace = True)

In [10]:
gpt_4_bugs

Unnamed: 0,Bug_Cat,GPT-4
0,dereference failure: NULL pointer,15
1,dereference failure: invalid pointer,13
2,buffer overflow,13
3,memory leak failure,9
4,dereference failure: invalidated dynamic object,2
5,dereference failure: invalid pointer freed,1


In [11]:
all_bugs_count = pd.concat([gpt_4_bugs.set_index('Bug_Cat'),
                            turbo_bugs.set_index('Bugs'),
                      wizard_bugs.set_index('Bugs'), 
                      mistral_bugs.set_index('Bugs'),
                     llama_bugs.set_index('Bugs')], axis = 1, join = 'outer')

In [12]:
all_bugs_count.to_csv("all_bugs_count.csv")

### Error Analysis

In [52]:
gpt_4_errors = pd.read_csv("gpt4_errors.csv")
wizard_errors = pd.read_csv("wizard_errors.csv")
mistral_errors = pd.read_csv("mistral_errors.csv")
turbo_errors = pd.read_csv("gpt3.5_errors.csv")
llama_errors = pd.read_csv("llama_errors.csv")

In [53]:
gpt_4_errors.rename(columns = {'count': 'GPT-4'}, inplace = True)
wizard_errors.rename(columns = {'count': 'wizard_coder'}, inplace = True)
mistral_errors.rename(columns = {'count': 'mistral'}, inplace = True)
turbo_errors.rename(columns = {'count': 'GPT-3.5'}, inplace = True)
llama_errors.rename(columns = {'count': 'llama'}, inplace = True)

In [54]:
gpt_4_errors = pd.DataFrame(gpt_4_errors['Error Type'].value_counts()).rename(columns = {'count': 'GPT-4'})
wizard_errors = pd.DataFrame(wizard_errors['Error Type'].value_counts()).rename(columns = {'count': 'WizardCoder'})
mistral_errors = pd.DataFrame(mistral_errors['Error Type'].value_counts()).rename(columns = {'count': 'Mistral'})
turbo_errors = pd.DataFrame(turbo_errors['Error Type'].value_counts()).rename(columns = {'count': 'GPT-3.5'})
llama_errors = pd.DataFrame(llama_errors['Error Type'].value_counts()).rename(columns = {'count': 'CodeLLama'})

In [55]:
wizard_errors

Unnamed: 0_level_0,WizardCoder
Error Type,Unnamed: 1_level_1
PARSING ERROR,10
CONVERSION ERROR,2


In [56]:
all_errors = pd.concat([gpt_4_errors, 
                      wizard_errors, 
                      mistral_errors,
                     turbo_errors,
                     llama_errors], axis = 1, join = 'outer')

In [57]:
all_errors

Unnamed: 0_level_0,GPT-4,WizardCoder,Mistral,GPT-3.5,CodeLLama
Error Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CONVERSION ERROR,4,2,4,3.0,2
PARSING ERROR,1,10,8,,11


In [58]:
all_errors.to_csv("all_errors.csv")

### Faceted Bar Graph

In [5]:
import plotly.express as px

In [6]:
# Convert values to percentages
percents_all = counts.apply(lambda x: (x / x.sum()) * 100)

# Display the result
print(percents_all)

percents_all = percents_all.reset_index()

                           mistral  wizard_coder      llama    GPT-3.5  \
ESBMC_Status                                                             
VERIFICATION FAILED      41.791045     61.194030  51.612903  41.791045   
VERIFICATION SUCCESSFUL  31.343284     25.373134  20.967742  40.298507   
VERIFICATION UNKNOWN      8.955224      4.477612   6.451613  13.432836   
ERROR                    17.910448      8.955224  20.967742   4.477612   

                             GPT-4  
ESBMC_Status                        
VERIFICATION FAILED      47.761194  
VERIFICATION SUCCESSFUL  35.820896  
VERIFICATION UNKNOWN      8.955224  
ERROR                     7.462687  


In [7]:
narrow_pcts = pd.melt(percents_all, id_vars = ['ESBMC_Status'], var_name = 'Model', value_name = 'Percent')

In [8]:
narrow_pcts

Unnamed: 0,ESBMC_Status,Model,Percent
0,VERIFICATION FAILED,mistral,41.791045
1,VERIFICATION SUCCESSFUL,mistral,31.343284
2,VERIFICATION UNKNOWN,mistral,8.955224
3,ERROR,mistral,17.910448
4,VERIFICATION FAILED,wizard_coder,61.19403
5,VERIFICATION SUCCESSFUL,wizard_coder,25.373134
6,VERIFICATION UNKNOWN,wizard_coder,4.477612
7,ERROR,wizard_coder,8.955224
8,VERIFICATION FAILED,llama,51.612903
9,VERIFICATION SUCCESSFUL,llama,20.967742


In [21]:
# Define custom colors for each variable
cset_colors = {
    "VERIFICATION FAILED": "#B53A6D",
    "VERIFICATION SUCCESSFUL": "#7AC4A5",
    "VERIFICATION UNKNOWN": "#003DA6",
    "ERROR": "#F17F4C"}

# Define custom colors for each variable
# cset_colors = {
#     "VERIFICATION FAILED": "red",
#     "VERIFICATION SUCCESSFUL": "green",
#     "VERIFICATION UNKNOWN": "blue",
#     "ERROR": "orange"}

fig = px.bar(narrow_pcts, x="ESBMC_Status",
             y="Percent", color="ESBMC_Status", facet_col="Model", 
             color_discrete_map = cset_colors, facet_col_wrap=2)


# Update the layout to adjust height and width
fig.update_layout(
    height=1500,  # Set the height to 800 pixels
    width=1500,  # Set the width to 1200 pixels
)

fig.show()

fig.write_image("faceted.png")