# Download data

In [None]:
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides any pandas deprecation / SettingWithCopy
# warnings raised by later cells -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
!pip install --upgrade gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1


In [None]:
# Fetch the dataset from Google Drive by file id; in the recorded run it was
# saved as data_v4.csv in the working directory.
!gdown 'https://drive.google.com/uc?id=1JRYLPp1_egOtci4CTSyKPOpgkgOZhXIb'

Downloading...
From: https://drive.google.com/uc?id=1JRYLPp1_egOtci4CTSyKPOpgkgOZhXIb
To: /content/data_v4.csv
  0% 0.00/18.2k [00:00<?, ?B/s]100% 18.2k/18.2k [00:00<00:00, 18.8MB/s]


# Load data

In [None]:
import pandas as pd

# Read the downloaded results table.
data = pd.read_csv('data_v4.csv')

# Normalise model identifiers: lower-case them and drop the spaces.
data['ModelID'] = data['ModelID'].map(lambda model_id: model_id.lower().replace(" ", ""))

# Rows labelled 'Association' are relabelled as 'Relationship'; every other
# label is kept as it is.
element = data['ModelElement']
data['ModelElement'] = element.where(element != 'Association', 'Relationship')

# Keep a second name bound to this wide-format frame.
data_to_lm = data
data.head(10)

Unnamed: 0,Setting,ModelElement,precision,recall,f1,ModelID,LLM
0,0shot,Class,0.9375,0.46875,0.625,labtracker,GPT4
1,0shot,Attribute,0.90741,0.45349,0.60475,labtracker,GPT4
2,0shot,Relationship,0.5,0.15909,0.24138,labtracker,GPT4
3,0shot,Class,0.9375,0.57692,0.71429,celo,GPT4
4,0shot,Attribute,0.80357,0.80435,0.80396,celo,GPT4
5,0shot,Relationship,0.33333,0.15,0.2069,celo,GPT4
6,0shot,Class,0.85,0.46875,0.60427,tss,GPT4
7,0shot,Attribute,0.92857,0.66667,0.77612,tss,GPT4
8,0shot,Relationship,0.42857,0.225,0.29508,tss,GPT4
9,0shot,Class,0.9,0.63043,0.74148,shas,GPT4


In [None]:
# Number of rows in the loaded table.
data.shape[0]

360

In [None]:
# Reshape to long format: one row per (setting, element, model, LLM, metric),
# with the metric name in 'variable' and its score in 'value'.
data = data.melt(
    id_vars=['Setting', 'ModelElement', 'ModelID', 'LLM'],
    value_vars=['precision', 'recall', 'f1'],
)
data.head(10)

Unnamed: 0,Setting,ModelElement,ModelID,LLM,variable,value
0,0shot,Class,labtracker,GPT4,precision,0.9375
1,0shot,Attribute,labtracker,GPT4,precision,0.90741
2,0shot,Relationship,labtracker,GPT4,precision,0.5
3,0shot,Class,celo,GPT4,precision,0.9375
4,0shot,Attribute,celo,GPT4,precision,0.80357
5,0shot,Relationship,celo,GPT4,precision,0.33333
6,0shot,Class,tss,GPT4,precision,0.85
7,0shot,Attribute,tss,GPT4,precision,0.92857
8,0shot,Relationship,tss,GPT4,precision,0.42857
9,0shot,Class,shas,GPT4,precision,0.9


In [None]:
# Build a single label per row by joining the four grouping columns with '_',
# e.g. '0shot_Class_GPT4_precision'.
cols = ['Setting', 'ModelElement', 'LLM', 'variable']
data['group'] = data[cols].astype(str).agg('_'.join, axis=1)
data

Unnamed: 0,Setting,ModelElement,ModelID,LLM,variable,value,group
0,0shot,Class,labtracker,GPT4,precision,0.93750,0shot_Class_GPT4_precision
1,0shot,Attribute,labtracker,GPT4,precision,0.90741,0shot_Attribute_GPT4_precision
2,0shot,Relationship,labtracker,GPT4,precision,0.50000,0shot_Relationship_GPT4_precision
3,0shot,Class,celo,GPT4,precision,0.93750,0shot_Class_GPT4_precision
4,0shot,Attribute,celo,GPT4,precision,0.80357,0shot_Attribute_GPT4_precision
...,...,...,...,...,...,...,...
1075,CoT,Attribute,tileo,CHAT,f1,0.16901,CoT_Attribute_CHAT_f1
1076,CoT,Relationship,tileo,CHAT,f1,0.02778,CoT_Relationship_CHAT_f1
1077,CoT,Class,hbms,CHAT,f1,0.48276,CoT_Class_CHAT_f1
1078,CoT,Attribute,hbms,CHAT,f1,0.40206,CoT_Attribute_CHAT_f1


# Basic statistics

In [None]:
from tabulate import tabulate
import numpy as np


# Print one "mean +- std" table per (model element, LLM) pair: rows are the
# settings, columns are the three metrics.
metrics = ['precision', 'recall', 'f1']
col_names = ["Setting"] + metrics

for model_element in ['Class', 'Relationship', 'Attribute']:
  for llm in ['GPT3', 'CHAT', 'GPT4']:
    # Narrow to this table's rows once; the inner loops split by setting and metric.
    table_rows = data[(data['ModelElement'] == model_element) & (data['LLM'] == llm)]
    data_table = []
    for setting in ['0shot', '1shot_BTMS', '1shot_H2S', '2shots', 'CoT']:
      in_table = [setting]
      for metric in metrics:
        filter_data = table_rows[(table_rows['Setting'] == setting) & (table_rows['variable'] == metric)]
        scores = filter_data['value']
        # np.std is used with its default ddof=0 (population standard deviation).
        in_table.append(f"{np.mean(scores):.4f} +- {np.std(scores):.4f}")
      data_table.append(in_table)

    print(f'Table llm {llm} in {model_element}')
    print(tabulate(data_table, headers=col_names))
    print('-'*55)
    print()

Table llm GPT3 in Class
Setting     precision         recall            f1
----------  ----------------  ----------------  ----------------
0shot       0.8871 +- 0.0959  0.3920 +- 0.0641  0.5381 +- 0.0660
1shot_BTMS  0.9561 +- 0.0587  0.4757 +- 0.0858  0.6311 +- 0.0785
1shot_H2S   0.9467 +- 0.0637  0.4623 +- 0.0686  0.6170 +- 0.0653
2shots      0.9129 +- 0.0595  0.5017 +- 0.0649  0.6432 +- 0.0482
CoT         0.8931 +- 0.1052  0.4706 +- 0.0648  0.6138 +- 0.0698
-------------------------------------------------------

Table llm CHAT in Class
Setting     precision         recall            f1
----------  ----------------  ----------------  ----------------
0shot       0.8723 +- 0.0897  0.4061 +- 0.1122  0.5463 +- 0.1101
1shot_BTMS  0.9472 +- 0.0506  0.5148 +- 0.1328  0.6560 +- 0.1171
1shot_H2S   0.8518 +- 0.0927  0.5622 +- 0.0925  0.6712 +- 0.0769
2shots      0.8716 +- 0.0547  0.5638 +- 0.0919  0.6807 +- 0.0750
CoT         0.8787 +- 0.1291  0.4434 +- 0.0712  0.5848 +- 0.0792
-------------

In [None]:
!pip install pingouin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pingouin
  Downloading pingouin-0.5.3-py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting outdated
  Downloading outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Collecting pandas-flavor>=0.2.0
  Downloading pandas_flavor-0.5.0-py3-none-any.whl (7.1 kB)
Collecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.py) ... [?25l[?25hdone
  Created wheel for littleutils: filename=littleutils-0.2.2-py3-none-any.whl size=7048 sha256=f7d63a25e551362f2232b6033b1061c2c66deb001c6e0b9b6b4c0376e170c065
  Stored in directory: /root/.cache/pip/wheels/04/bb/0d/2d02ec45f29c48d6192476bfb59c5a0e64b605e7212374dd15
Successfully built littleut

In [None]:
import pingouin as pg

# Run pingouin's normality test (method='normaltest') separately on the
# 'value' scores of each 'group' label, then show only the groups that were
# flagged as not normal.
normality_check = pg.normality(data, dv='value', group='group', method='normaltest')
normality_check[normality_check['normal'].eq(False)]

Unnamed: 0_level_0,W,pval,normal
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2shots_Class_GPT4_precision,9.767185,0.00757,False
2shots_Class_GPT3_precision,6.56093,0.037611,False
1shot_BTMS_Class_GPT4_recall,7.276635,0.026297,False
0shot_Relationship_CHAT_recall,6.072912,0.048005,False
1shot_BTMS_Attribute_CHAT_recall,10.938458,0.004214,False
1shot_H2S_Attribute_CHAT_recall,7.942882,0.018846,False
1shot_H2S_Relationship_CHAT_recall,9.682129,0.007899,False
CoT_Attribute_CHAT_recall,9.802368,0.007438,False
0shot_Relationship_CHAT_f1,6.603644,0.036816,False
1shot_BTMS_Attribute_CHAT_f1,7.653858,0.021776,False


# RQ1: Class vs Attribute vs Relationship

In [None]:
# For every (LLM, setting) pair, run a Friedman test across the model-element
# types with ModelID as the subject, and print the pairs with p < 0.05.
metric = 'f1'
metric_rows = data[data['variable'] == metric]
for llm in data['LLM'].unique():
  for setting in data['Setting'].unique():
    dataframe = metric_rows[(metric_rows['LLM'] == llm) & (metric_rows['Setting'] == setting)]
    friedman_result = pg.friedman(data=dataframe, dv='value', within='ModelElement', subject='ModelID')
    p = friedman_result['p-unc'].values[0]
    if p < 0.05:
      print(llm, setting)

GPT4 0shot
GPT4 1shot_BTMS
GPT4 1shot_H2S
GPT4 2shots
GPT4 CoT
GPT3 0shot
GPT3 1shot_BTMS
GPT3 1shot_H2S
GPT3 2shots
GPT3 CoT
CHAT 0shot
CHAT 1shot_BTMS
CHAT 1shot_H2S
CHAT 2shots
CHAT CoT


In [None]:
llm = 'CHAT'
metric = 'f1'

# Scores of the chosen LLM for the chosen metric, across all settings.
dataframe = data.loc[(data['LLM'] == llm) & (data['variable'] == metric)]

# Non-parametric pairwise tests with no p-value adjustment; only the
# per-setting comparisons between model-element types are kept.
all_contrasts = pg.pairwise_tests(
    data=dataframe,
    dv='value',
    within=['Setting', 'ModelElement'],
    subject='ModelID',
    parametric=False,
    padjust='None',
    return_desc=True,
)
all_contrasts = all_contrasts.loc[all_contrasts['Contrast'].eq('Setting * ModelElement')]
all_contrasts

Unnamed: 0,Contrast,Setting,A,B,mean(A),std(A),mean(B),std(B),Paired,Parametric,W-val,alternative,p-unc,hedges
13,Setting * ModelElement,0shot,Attribute,Class,0.430229,0.213614,0.54626,0.117653,True,False,6.0,two-sided,0.109375,-0.636164
14,Setting * ModelElement,0shot,Attribute,Relationship,0.430229,0.213614,0.135423,0.06367,True,False,1.0,two-sided,0.015625,1.768399
15,Setting * ModelElement,0shot,Class,Relationship,0.54626,0.117653,0.135423,0.06367,True,False,0.0,two-sided,0.007812,4.106253
16,Setting * ModelElement,1shot_BTMS,Attribute,Class,0.418988,0.181731,0.655974,0.125177,True,False,1.0,two-sided,0.015625,-1.435938
17,Setting * ModelElement,1shot_BTMS,Attribute,Relationship,0.418988,0.181731,0.242005,0.153156,True,False,0.0,two-sided,0.007812,0.995698
18,Setting * ModelElement,1shot_BTMS,Class,Relationship,0.655974,0.125177,0.242005,0.153156,True,False,0.0,two-sided,0.007812,2.798275
19,Setting * ModelElement,1shot_H2S,Attribute,Class,0.42933,0.148564,0.671234,0.082258,True,False,1.0,two-sided,0.015625,-1.90466
20,Setting * ModelElement,1shot_H2S,Attribute,Relationship,0.42933,0.148564,0.244146,0.071967,True,False,0.0,two-sided,0.007812,1.499929
21,Setting * ModelElement,1shot_H2S,Class,Relationship,0.671234,0.082258,0.244146,0.071967,True,False,0.0,two-sided,0.007812,5.224775
22,Setting * ModelElement,2shots,Attribute,Class,0.455264,0.121652,0.680716,0.08019,True,False,0.0,two-sided,0.007812,-2.068901


In [None]:
# The wildcard import is kept as-is: the precision/recall plot cell further
# down also takes names from it (e.g. scale_colour_discrete).
from plotnine import *

llm = 'GPT4'
metric = 'f1'
setting = '1shot_BTMS'

# Rows for one LLM, one setting and one metric.
dataframe = data[
    (data['variable'] == metric)
    & (data['LLM'] == llm)
    & (data['Setting'] == setting)
]

# An explicit category order fixes the left-to-right order of the boxes.
dataframe_cat = pd.Categorical(dataframe['ModelElement'], categories=["Class", "Attribute", "Relationship"])
dataframe = dataframe.assign(dataframe_cat=dataframe_cat)

# One box per model-element type, built up layer by layer.
plot = ggplot(dataframe) + geom_boxplot(aes(x='dataframe_cat', y='value'))
plot = plot + xlab("Type of modeling element") + ylab("F1 score")
plot = plot + theme_classic() + theme(text=element_text(size=14))

plot.save(f'rq1_{llm}_{setting}.pdf', dpi=600)

# RQ1: Precision vs Recall

In [None]:
# For every (LLM, model element) pair, compare precision against recall within
# each setting using non-parametric pairwise tests (no p-value adjustment),
# then count per model element how many comparisons have p < 0.05.
list_contrasts = []
for llm in data['LLM'].unique():
  for modelling_element in data['ModelElement'].unique():
    # f1 is excluded so that 'variable' only holds precision and recall.
    dataframe = data[(data['ModelElement'] == modelling_element)
        & (data['LLM'] == llm) & ((data['variable'] != 'f1'))]

    all_contrasts = pg.pairwise_tests(dv='value', subject='ModelID', data=dataframe,
                                      within=['Setting', 'variable'], padjust='None',
                                      parametric=False, return_desc=True)

    # Keep the per-setting precision-vs-recall rows. The tags are added with
    # .assign, which returns a new frame, instead of assigning columns on the
    # filtered slice (that pattern triggers SettingWithCopyWarning).
    all_contrasts = all_contrasts[all_contrasts['Contrast'] == 'Setting * variable'].assign(
        LLM=llm,
        ModelElement=modelling_element,
    )
    list_contrasts.append(all_contrasts)

all_contrasts = pd.concat(list_contrasts)
for m in data['ModelElement'].unique():
  m_contrasts = all_contrasts[all_contrasts['ModelElement'] == m]
  signif = len(m_contrasts[m_contrasts['p-unc']< 0.05])
  print(f'In {m}: {signif}/{len(m_contrasts)}')

In Class: 15/15
In Attribute: 9/15
In Relationship: 15/15


In [None]:
llm = 'GPT4'
setting = '1shot_BTMS'

# Rows for one LLM and one setting.
# NOTE(review): no filter on 'variable' is applied, so f1 rows are plotted next
# to precision and recall -- confirm that this is intended for this figure.
dataframe = data[(data['Setting'] == setting) & (data['LLM'] == llm)]

# An explicit category order fixes the left-to-right order of the boxes.
dataframe_cat = pd.Categorical(dataframe['ModelElement'], categories=["Class", "Attribute", "Relationship"])
dataframe = dataframe.assign(dataframe_cat=dataframe_cat)

# One group of boxes per model-element type, coloured by score type.
plot = ggplot(dataframe) + geom_boxplot(aes(x='dataframe_cat', y='value', color='variable'))
plot = plot + xlab("Type of modeling element") + ylab("Score")
plot = plot + scale_colour_discrete(name="Score type") + theme_classic()
plot = plot + theme(text=element_text(size=18))

plot.save(f'rq1_prec_recall_{llm}_{setting}.pdf', dpi=600)

# RQ2: 0-shot vs few-shot

In [None]:
from statsmodels.stats.multitest import multipletests

metric = 'f1'
llm = 'GPT4'

# Per model element: non-parametric pairwise tests between settings, keeping
# only the rows that compare '0shot' with a setting other than 'CoT'.
all_contrasts = []
for model_element in ['Class', 'Relationship', 'Attribute']:
  dataframe = data[
      (data['LLM'] == llm)
      & (data['ModelElement'] == model_element)
      & (data['variable'] == metric)
  ]
  contrasts = pg.pairwise_tests(
      data=dataframe, dv='value', within=['Setting'], subject='ModelID',
      parametric=False, padjust='None', return_desc=True,
  )
  contrasts['ModelElement'] = model_element
  zero_vs_few = contrasts['A'].eq('0shot') & contrasts['B'].ne('CoT')
  all_contrasts.append(contrasts[zero_vs_few])
all_contrasts = pd.concat(all_contrasts)

# Correct all collected p-values together (method 'fdr_by'); element [1] of
# the result holds the corrected p-values.
correction = multipletests(all_contrasts['p-unc'], method='fdr_by')
all_contrasts['p-cor'] = correction[1]
all_contrasts

Unnamed: 0,Contrast,A,B,mean(A),std(A),mean(B),std(B),Paired,Parametric,W-val,alternative,p-unc,hedges,ModelElement,p-cor
0,Setting,0shot,1shot_BTMS,0.670466,0.046617,0.756841,0.065879,True,False,2.0,two-sided,0.023438,-1.431029,Class,0.596735
1,Setting,0shot,1shot_H2S,0.670466,0.046617,0.696866,0.058785,True,False,11.0,two-sided,0.382812,-0.470491,Class,1.0
2,Setting,0shot,2shots,0.670466,0.046617,0.746986,0.056721,True,False,5.0,two-sided,0.078125,-1.393532,Class,0.663039
0,Setting,0shot,1shot_BTMS,0.251489,0.051596,0.33643,0.089667,True,False,4.0,two-sided,0.054688,-1.097833,Relationship,0.663039
1,Setting,0shot,1shot_H2S,0.251489,0.051596,0.329471,0.152883,True,False,9.0,two-sided,0.25,-0.646207,Relationship,1.0
2,Setting,0shot,2shots,0.251489,0.051596,0.335604,0.147422,True,False,7.0,two-sided,0.148438,-0.72007,Relationship,0.944831
0,Setting,0shot,1shot_BTMS,0.565516,0.154973,0.582274,0.172107,True,False,15.0,two-sided,0.742188,-0.096746,Attribute,1.0
1,Setting,0shot,1shot_H2S,0.565516,0.154973,0.584533,0.194191,True,False,14.0,two-sided,0.640625,-0.10234,Attribute,1.0
2,Setting,0shot,2shots,0.565516,0.154973,0.607103,0.136548,True,False,10.0,two-sided,0.3125,-0.269207,Attribute,1.0


In [None]:
# Summarise the contrasts held in `all_contrasts` as "p-value / mean difference"
# per setting (rows) and model element (columns).
# p_col selects which p-value column is reported; here it is the uncorrected one.
p_col = 'p-unc'
data_table = []
for setting in ['1shot_BTMS', '1shot_H2S', '2shots']:
  row = [setting]
  for model_element in ['Class', 'Attribute', 'Relationship']:
    row_df = all_contrasts[(all_contrasts['ModelElement'] == model_element) & (all_contrasts['B'] == setting)]
    first_match = row_df.iloc[0]
    p_val = first_match[p_col]
    # Difference of means: setting B minus setting A.
    h = first_match['mean(B)'] - first_match['mean(A)']
    row.append(f'{p_val:.4f} / {h:.4f}')
  data_table.append(row)

col_names = ["Setting", 'Class', 'Attribute', 'Relationship']
print(tabulate(data_table, headers=col_names))

Setting     Class            Attribute        Relationship
----------  ---------------  ---------------  ---------------
1shot_BTMS  0.0234 / 0.0864  0.7422 / 0.0168  0.0547 / 0.0849
1shot_H2S   0.3828 / 0.0264  0.6406 / 0.0190  0.2500 / 0.0780
2shots      0.0781 / 0.0765  0.3125 / 0.0416  0.1484 / 0.0841


In [None]:
# Render the current data_table as LaTeX tabular markup.
latex_table = tabulate(data_table, headers=col_names, tablefmt='latex')
print(latex_table)

\begin{tabular}{llll}
\hline
 Setting    & Class           & Attribute       & Relationship    \\
\hline
 1shot\_BTMS & 0.0234 / 0.0864 & 0.7422 / 0.0168 & 0.0547 / 0.0849 \\
 1shot\_H2S  & 0.3828 / 0.0264 & 0.6406 / 0.0190 & 0.2500 / 0.0780 \\
 2shots     & 0.0781 / 0.0765 & 0.3125 / 0.0416 & 0.1484 / 0.0841 \\
\hline
\end{tabular}


# RQ2: 1-shot vs 2-shot

In [None]:
metric = 'f1'
llm = 'CHAT'

# Per model element: non-parametric pairwise tests between settings, keeping
# only the two rows that compare a 1-shot setting with '2shots'.
all_contrasts = []
for model_element in ['Class', 'Relationship', 'Attribute']:
  dataframe = data[
      (data['LLM'] == llm)
      & (data['ModelElement'] == model_element)
      & (data['variable'] == metric)
  ]
  contrasts = pg.pairwise_tests(
      data=dataframe, dv='value', within=['Setting'], subject='ModelID',
      parametric=False, padjust='None',
  )
  contrasts['ModelElement'] = model_element
  one_vs_two = contrasts['A'].isin(['1shot_BTMS', '1shot_H2S']) & contrasts['B'].eq('2shots')
  all_contrasts.append(contrasts[one_vs_two])
all_contrasts = pd.concat(all_contrasts)

# Correct all collected p-values together (method 'fdr_by'); element [1] of
# the result holds the corrected p-values.
correction = multipletests(all_contrasts['p-unc'], method='fdr_by')
all_contrasts['p-cor'] = correction[1]
all_contrasts

Unnamed: 0,Contrast,A,B,Paired,Parametric,W-val,alternative,p-unc,hedges,ModelElement,p-cor
5,Setting,1shot_BTMS,2shots,True,False,16.0,two-sided,0.84375,-0.222539,Class,1.0
7,Setting,1shot_H2S,2shots,True,False,14.0,two-sided,0.640625,-0.110368,Class,1.0
5,Setting,1shot_BTMS,2shots,True,False,12.0,two-sided,0.460938,-0.324252,Relationship,1.0
7,Setting,1shot_H2S,2shots,True,False,11.0,two-sided,0.382812,-0.446633,Relationship,1.0
5,Setting,1shot_BTMS,2shots,True,False,10.0,two-sided,0.3125,-0.221794,Attribute,1.0
7,Setting,1shot_H2S,2shots,True,False,14.0,two-sided,0.640625,-0.180585,Attribute,1.0


# RQ2: CoT

In [None]:
metric = 'f1'
llm = 'CHAT'

# Per model element: non-parametric pairwise tests between settings, keeping
# only the row that compares '1shot_H2S' with 'CoT'.
all_contrasts = []
for model_element in ['Class', 'Relationship', 'Attribute']:
  dataframe = data[
      (data['LLM'] == llm)
      & (data['ModelElement'] == model_element)
      & (data['variable'] == metric)
  ]
  contrasts = pg.pairwise_tests(
      data=dataframe, dv='value', within=['Setting'], subject='ModelID',
      parametric=False, padjust='None', return_desc=True,
  )
  contrasts['ModelElement'] = model_element
  h2s_vs_cot = contrasts['A'].eq('1shot_H2S') & contrasts['B'].eq('CoT')
  all_contrasts.append(contrasts[h2s_vs_cot])
all_contrasts = pd.concat(all_contrasts)

# Correct all collected p-values together (method 'fdr_by'); element [1] of
# the result holds the corrected p-values.
correction = multipletests(all_contrasts['p-unc'], method='fdr_by')
all_contrasts['p-cor'] = correction[1]
# Difference of means: CoT (B) minus 1shot_H2S (A).
all_contrasts['mean_diff'] = all_contrasts['mean(B)'] - all_contrasts['mean(A)']
all_contrasts.round(4)

Unnamed: 0,Contrast,A,B,mean(A),std(A),mean(B),std(B),Paired,Parametric,W-val,alternative,p-unc,hedges,ModelElement,p-cor,mean_diff
8,Setting,1shot_H2S,CoT,0.6712,0.0823,0.5848,0.0846,True,False,0.0,two-sided,0.0078,0.9797,Class,0.043,-0.0865
8,Setting,1shot_H2S,CoT,0.2441,0.072,0.1746,0.1206,True,False,5.0,two-sided,0.0781,0.6621,Relationship,0.2148,-0.0695
8,Setting,1shot_H2S,CoT,0.4293,0.1486,0.4569,0.2021,True,False,11.0,two-sided,0.3828,-0.1467,Attribute,0.7018,0.0275


# RQ3: Best model

In [None]:
# For every (setting, model element) pair, run a Friedman test across the LLMs
# with ModelID as the subject, and print the pairs with p < 0.05.
metric = 'f1'
metric_rows = data[data['variable'] == metric]
for setting in data['Setting'].unique():
  for model_element in data['ModelElement'].unique():
    dataframe = metric_rows[(metric_rows['ModelElement'] == model_element) & (metric_rows['Setting'] == setting)]
    friedman_result = pg.friedman(data=dataframe, dv='value', within='LLM', subject='ModelID')
    p = friedman_result['p-unc'].values[0]
    if p < 0.05:
      print(model_element, setting)

Class 0shot
Attribute 0shot
Relationship 0shot
Attribute 1shot_BTMS
Class 2shots
Attribute 2shots


In [None]:
# Per model element: non-parametric pairwise tests between LLMs within each
# setting, followed by a joint 'fdr_by' correction of all collected p-values.
metric = 'f1'
all_contrasts = []
for model_element in ['Class', 'Relationship', 'Attribute']:
  dataframe = data[(data['ModelElement'] == model_element)
    & (data['variable'] == metric)]
  contrasts = pg.pairwise_tests(dv='value', subject='ModelID', data=dataframe, within=['Setting', 'LLM'], padjust='None',
                                parametric=False, return_desc=True)
  # Keep the per-setting LLM-vs-LLM rows. The tag is added with .assign, which
  # returns a new frame, instead of assigning a column on the filtered slice
  # (that pattern triggers SettingWithCopyWarning).
  contrasts = contrasts[contrasts['Contrast']=='Setting * LLM'].assign(ModelElement=model_element)
  all_contrasts.append(contrasts)
all_contrasts = pd.concat(all_contrasts)
# Element [1] of the multipletests result holds the corrected p-values.
all_contrasts['p-cor'] = multipletests(all_contrasts['p-unc'], method='fdr_by')[1]
all_contrasts

Unnamed: 0,Contrast,Setting,A,B,mean(A),std(A),mean(B),std(B),Paired,Parametric,W-val,alternative,p-unc,hedges,ModelElement,p-cor
13,Setting * LLM,0shot,CHAT,GPT3,0.54626,0.117653,0.538115,0.070558,True,False,15.0,two-sided,0.742188,0.079383,Class,1.0
14,Setting * LLM,0shot,CHAT,GPT4,0.54626,0.117653,0.670466,0.046617,True,False,3.0,two-sided,0.039062,-1.312288,Class,0.482843
15,Setting * LLM,0shot,GPT3,GPT4,0.538115,0.070558,0.670466,0.046617,True,False,1.0,two-sided,0.015625,-2.09258,Class,0.30902
16,Setting * LLM,1shot_BTMS,CHAT,GPT3,0.655974,0.125177,0.631059,0.083903,True,False,11.0,two-sided,0.672604,0.221064,Class,1.0
17,Setting * LLM,1shot_BTMS,CHAT,GPT4,0.655974,0.125177,0.756841,0.065879,True,False,8.0,two-sided,0.195312,-0.953438,Class,1.0
18,Setting * LLM,1shot_BTMS,GPT3,GPT4,0.631059,0.083903,0.756841,0.065879,True,False,2.0,two-sided,0.023438,-1.576561,Class,0.356561
19,Setting * LLM,1shot_H2S,CHAT,GPT3,0.671234,0.082258,0.616999,0.069853,True,False,4.0,two-sided,0.054688,0.67197,Class,0.600872
20,Setting * LLM,1shot_H2S,CHAT,GPT4,0.671234,0.082258,0.696866,0.058785,True,False,15.0,two-sided,0.742188,-0.338983,Class,1.0
21,Setting * LLM,1shot_H2S,GPT3,GPT4,0.616999,0.069853,0.696866,0.058785,True,False,2.0,two-sided,0.023438,-1.169691,Class,0.356561
22,Setting * LLM,2shots,CHAT,GPT3,0.680716,0.08019,0.643175,0.051553,True,False,11.0,two-sided,0.382812,0.526534,Class,1.0


In [None]:
# For each setting (rows) and model element (columns), report the LLM with the
# highest mean score for the chosen metric.
metric = 'f1'
data_table = []
for setting in ['0shot', '1shot_BTMS', '1shot_H2S', '2shots', 'CoT']:
  row = [setting]
  for model_element in ['Class', 'Attribute', 'Relationship']:
    filter_data = data[(data['ModelElement'] == model_element) & (data['Setting'] == setting) & (data['variable'] == metric)]
    # Average only the numeric 'value' column per LLM; averaging the whole
    # frame would also try to aggregate the string columns.
    means = filter_data.groupby('LLM')['value'].mean()
    # idxmax returns the LLM label whose mean is largest. The previous
    # `means.max().values[0]` took the column-wise maximum, i.e. the
    # alphabetically last LLM name, regardless of the scores.
    best_model = means.idxmax()
    row += [best_model]
  data_table.append(row)
col_names = ["Setting"] + ['Class', 'Attribute', 'Relationship']
print()
print(tabulate(data_table, headers=col_names))


Setting     Class    Attribute    Relationship
----------  -------  -----------  --------------
0shot       GPT4     GPT4         GPT4
1shot_BTMS  GPT4     GPT4         GPT4
1shot_H2S   GPT4     GPT4         GPT4
2shots      GPT4     GPT4         GPT4
CoT         GPT4     GPT4         GPT4


In [None]:
# Render the current data_table as LaTeX tabular markup.
latex_table = tabulate(data_table, headers=col_names, tablefmt='latex')
print(latex_table)

\begin{tabular}{llll}
\hline
 Setting    & Class   & Attribute   & Relationship   \\
\hline
 0shot      & GPT4    & GPT4        & GPT4           \\
 1shot\_BTMS & GPT4    & GPT4        & GPT4           \\
 1shot\_H2S  & GPT4    & GPT4        & GPT4           \\
 2shots     & GPT4    & GPT4        & GPT4           \\
 CoT        & GPT4    & GPT4        & GPT4           \\
\hline
\end{tabular}
