In [None]:
import pandas as pd
from itables import init_notebook_mode

# Turn on itables' interactive rendering for this notebook; per the itables
# documentation, all_interactive=True applies it to every DataFrame output.
init_notebook_mode(all_interactive=True)


In [2]:
def group_by_param(df, params):
    """Aggregate every non-key column of `df` to its mean and std per group.

    The identifier columns 'Case', 'Model Type' and 'Binary/Weighted' that are
    not listed in `params` are dropped before grouping, so they are neither
    used as keys nor aggregated. `df` itself is left unmodified.

    Args:
        df: Frame containing the identifier columns plus the columns to
            aggregate.
        params: Column name(s) to group by.

    Returns:
        A frame indexed by the `params` groups whose columns are
        (original column, 'mean' / 'std') pairs, rounded to 3 decimal places.

    Raises:
        KeyError: If an identifier column that has to be dropped is missing
            from `df`.
    """
    id_columns = ('Case', 'Model Type', 'Binary/Weighted')
    unused_ids = [name for name in id_columns if name not in params]

    # `drop` returns a new frame, so the caller's `df` is never touched.
    return (
        df.drop(columns=unused_ids)
        .groupby(params)
        .agg(['mean', 'std'])
        .round(3)
    )

In [None]:
# Variant whose metric files are loaded by this and the following cells; the
# value (including its single quotes) is interpolated verbatim into the CSV
# file names.
aux = "'all phrases'"

# 'Case' values excluded from the fb metrics (also used by the summary cell).
# NOTE(review): the reason these cases are excluded is not recorded here — confirm.
remove = [6, 11, 15, 19, 23]

fb_df = pd.read_csv(f'results/extract_dialogue/metrics/fb {aux}.csv')
# Filter on the 'Case' column with a vectorized mask. The previous version
# iterated over index labels and looked rows up with positional `iloc`, which
# only agree for a default RangeIndex and build one Series per row.
fb_df = fb_df[~fb_df['Case'].isin(remove)]
fb_df = group_by_param(fb_df, ['Model Type', 'Binary/Weighted'])
fb_df

In [None]:
# Load the raw behavior metrics for the `aux` variant ("clustering_definition"
# file) and show them before any aggregation.
behavior_df = pd.read_csv(f'results/extract_dialogue/metrics/behavior {aux} clustering_definition.csv')
behavior_df

In [None]:
# Collapse the behavior metrics to mean/std per (Model Type, Binary/Weighted).
# NOTE: this overwrites `behavior_df`, so the cell cannot be re-run on its own:
# the grouped frame no longer has the 'Case' column that group_by_param drops.
# Re-run the cell that loads `behavior_df` first.
behavior_df = group_by_param(behavior_df, ['Model Type', 'Binary/Weighted'])

# One table per behavior target, with a mean/std column pair for each metric.
for r_t in ['r_t_beh', 'r_t_verb']:
    wanted = [
        (f'{metric} {r_t}', stat)
        for metric in ('AUROC', 'Precision', 'Recall', 'F1')
        for stat in ('mean', 'std')
    ]
    display(behavior_df[wanted])

In [None]:
# Load the component metrics for the `aux` variant ("human_annotations" file)
# and collapse them to mean/std per (Model Type, Binary/Weighted).
component_df = group_by_param(
    pd.read_csv(f'results/extract_dialogue/metrics/component {aux} human_annotations.csv'),
    ['Model Type', 'Binary/Weighted'],
)

# One table per component target, with a mean/std column pair for each metric.
for component in ['f_anatomic', 'f_procedural', 'f_technical']:
    wanted = [
        (f'{metric} {component}', stat)
        for metric in ('AUROC', 'Precision', 'Recall', 'F1')
        for stat in ('mean', 'std')
    ]
    display(component_df[wanted])

In [None]:
columns = ['name', 'AUROC', 'Precision (binary)', 'Precision (weighted)', 'Recall (binary)', 'Recall (weighted)', 'F1 (binary)', 'F1 (weighted)']

# Row positions of the two averaging modes in the grouped statistics.
# NOTE(review): assumes every grouped frame has exactly two rows, sorted so
# that row 0 is the binary average and row 1 the weighted average — confirm
# against the 'Binary/Weighted' values in the metric CSVs.
BINARY_ROW, WEIGHTED_ROW = 0, 1


def _mean_std(stats, column, row):
    """Format the (mean, std) pair of `column` in row `row` as 'mean (+/- std)'."""
    cell = stats.iloc[row][column]
    return f"{cell['mean']:.3f} (+/- {cell['std']:.3f})"


def _summary_row(name, stats, target=None):
    """Build one results row, ordered like `columns`, from grouped statistics.

    Args:
        name: Label stored in the 'name' column.
        stats: Output of group_by_param for one metrics file.
        target: Suffix of the metric columns (e.g. 'r_t_beh'); None when the
            columns are plain 'AUROC', 'Precision', 'Recall', 'F1'.

    Returns:
        [name, AUROC, then binary and weighted Precision, Recall and F1].
        Every entry takes its mean and its std from the same metric and the
        same row. The previous copy-pasted version reported the binary row's
        std next to each weighted mean, and the Precision std next to the
        binary F1 of the fb table.
    """
    def col(metric):
        return metric if target is None else f'{metric} {target}'

    row = [name, _mean_std(stats, col('AUROC'), BINARY_ROW)]
    for metric in ('Precision', 'Recall', 'F1'):
        row.append(_mean_std(stats, col(metric), BINARY_ROW))
        row.append(_mean_std(stats, col(metric), WEIGHTED_ROW))
    return row


behavior_targets = ['r_t_beh', 'r_t_verb']
component_targets = ['f_anatomic', 'f_procedural', 'f_technical']

# Collect the rows per table and build each DataFrame once after the loop,
# instead of growing six frames one `.loc[len(df)]` assignment at a time.
summary_rows = {key: [] for key in ['fb', *behavior_targets, *component_targets]}

for aux_ in ["'dialogue'", "'reduced hallucinations'", "'all phrases'", "'temporal context'"]:
    fb_df = pd.read_csv(f'results/extract_dialogue/metrics/fb {aux_}.csv')
    # `remove` is the list of excluded cases defined in an earlier cell.
    fb_df = fb_df[~fb_df['Case'].isin(remove)]
    fb_df = group_by_param(fb_df, ['Model Type', 'Binary/Weighted'])

    behavior_df = pd.read_csv(f'results/extract_dialogue/metrics/behavior {aux_} compact.csv')
    behavior_df = group_by_param(behavior_df, ['Model Type', 'Binary/Weighted'])

    component_df = pd.read_csv(f'results/extract_dialogue/metrics/component {aux_} compact.csv')
    component_df = group_by_param(component_df, ['Model Type', 'Binary/Weighted'])

    summary_rows['fb'].append(_summary_row(aux_, fb_df))
    for target in behavior_targets:
        summary_rows[target].append(_summary_row(aux_, behavior_df, target))
    for target in component_targets:
        summary_rows[target].append(_summary_row(aux_, component_df, target))

fb_results = pd.DataFrame(summary_rows['fb'], columns=columns)
r_t_beh_results = pd.DataFrame(summary_rows['r_t_beh'], columns=columns)
r_t_verb_results = pd.DataFrame(summary_rows['r_t_verb'], columns=columns)
f_anatomic_results = pd.DataFrame(summary_rows['f_anatomic'], columns=columns)
f_procedural_results = pd.DataFrame(summary_rows['f_procedural'], columns=columns)
f_technical_results = pd.DataFrame(summary_rows['f_technical'], columns=columns)

In [None]:
# Show the f_technical summary table (one row per `aux_` variant).
f_technical_results