In [1]:
from util import load_user_data
import pandas as pd
import numpy as np
from scipy.stats import kruskal, mannwhitneyu

def post_hoc_comparison(data_list_1, data_list_2, name1, name2, threshold=0.05 / 4):
    """Compare two samples with one-sided Mann-Whitney U tests in both directions.

    Each direction ("greater", then "less") is tested separately and a line is
    printed for every direction whose p-value is below ``threshold``. If neither
    direction is significant, a single "no significant difference" line is
    printed instead.

    Parameters
    ----------
    data_list_1, data_list_2 : sequence of numbers
        The two samples passed to ``scipy.stats.mannwhitneyu``.
    name1, name2 : object
        Labels for the two samples; only used in the printed messages.
    threshold : float, optional
        Significance level each one-sided p-value is compared against.
        Defaults to ``0.05 / 4``, the value that used to be hard-coded, so
        existing four-argument calls behave as before. Pass e.g.
        ``0.05 / n_comparisons`` when the number of comparisons actually
        performed differs from 4.

    Returns
    -------
    None
        Results are reported through ``print`` only.
    """
    print("Use post-hoc analysis")
    significant = False
    # Same test in both directions; the symbol is what gets shown between the labels.
    for alternative, symbol in (("greater", ">"), ("less", "<")):
        statistic, pvalue = mannwhitneyu(data_list_1, data_list_2, alternative=alternative)
        if pvalue < threshold:
            print(
                "Alternative {} {} {},".format(name1, symbol, name2),
                "pvalue %.4f" % pvalue,
                "statistic %.4f" % statistic,
            )
            significant = True
    if not significant:
        print("No significant difference with post-hoc analysis")

In [2]:
# Read both CSV exports; the third value returned by load_user_data is not used here.
valid_users, tp_data, _ = load_user_data(filename="../data/xailabdata_all.csv")
valid_users_2, tp_data_2, _ = load_user_data(filename="../data/xailabdata_llm_agent.csv")

# Merge the two loads: combine the user collections with `|` and fold the
# second per-user mapping into the first (tp_data is updated in place).
valid_users = valid_users | valid_users_2
tp_data.update(tp_data_2)

dimensions = ["accuracy", "agreement_fraction", "switch_fraction", "accuracy-wid", "RAIR", "RSR"]
all_conditions = ["control", "dashboard", "chatxai", "chatxaiboost", "chatxaiAuto"]

# variable_dict[condition][dimension] -> list of per-user values.
# The extra "condition" key starts out as an empty list and is not filled in this cell.
variable_dict = {"condition": []}
for condition in all_conditions:
    variable_dict[condition] = {dimension: [] for dimension in dimensions}

# File every valid user's measures under the condition recorded for that user.
for user in valid_users:
    user_record = tp_data[user]
    tp_condition = user_record["condition"]
    reliance_measures = user_record["reliance_measures"]
    per_dimension = variable_dict[tp_condition]
    for dimension in dimensions:
        per_dimension[dimension].append(reliance_measures[dimension])

print("-" * 34)

245 valid participants
{'control': 61, 'dashboard': 61, 'chatxai': 62, 'chatxaiboost': 61, 'chatxaiAuto': 0}
91 participants blindly rely on AI advice
{'control': 8, 'dashboard': 27, 'chatxai': 32, 'chatxaiboost': 24, 'chatxaiAuto': 0}
61 valid participants
{'control': 0, 'dashboard': 0, 'chatxai': 0, 'chatxaiboost': 0, 'chatxaiAuto': 61}
25 participants blindly rely on AI advice
{'control': 0, 'dashboard': 0, 'chatxai': 0, 'chatxaiboost': 0, 'chatxaiAuto': 25}
----------------------------------


In [3]:
print("For all participants, compare with experimental conditions")
for dimension in dimensions:
    print(dimension)

    # Omnibus test: one sample per condition, in the order of all_conditions.
    samples = [variable_dict[condition][dimension] for condition in all_conditions]
    statistic, pvalue = kruskal(*samples)
    print("kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))

    # Build a LaTeX table row: dimension, H, p, then "mean \pm sd" per condition.
    row_parts = [dimension + "&" + "{:.2f} & {:.3f}& ".format(statistic, pvalue)]
    for condition in all_conditions:
        values = variable_dict[condition][dimension]
        mean_value = np.mean(values)
        sd_value = np.std(values)  # numpy default (ddof=0)
        print("{}, Mean: M({}):{:.2f}, SD({}):{:.2f}".format(len(values), condition, mean_value, condition, sd_value))
        row_parts.append("${:.2f} \\pm {:.2f}$ &".format(mean_value, sd_value))
    print("".join(row_parts))

    # TODO(open question from the original author): should the omnibus test be
    # gated at 0.05 while conclusions are drawn with the calibrated threshold
    # applied inside post_hoc_comparison?
    if pvalue < 0.05:
        # Every unordered pair of conditions, earlier-listed condition first.
        for position, group_1 in enumerate(all_conditions):
            for group_2 in all_conditions[position + 1:]:
                data_list_1 = variable_dict[group_1][dimension]
                data_list_2 = variable_dict[group_2][dimension]
                post_hoc_comparison(data_list_1, data_list_2, group_1, group_2)
    print("-" * 17)

For all participants, compare with experimental conditions
accuracy
kruskal test result: H:9.09, p:0.059
61, Mean: M(control):0.62, SD(control):0.13
61, Mean: M(dashboard):0.65, SD(dashboard):0.11
62, Mean: M(chatxai):0.67, SD(chatxai):0.10
61, Mean: M(chatxaiboost):0.64, SD(chatxaiboost):0.09
61, Mean: M(chatxaiAuto):0.63, SD(chatxaiAuto):0.10
accuracy&9.09 & 0.059& $0.62 \pm 0.13$ &$0.65 \pm 0.11$ &$0.67 \pm 0.10$ &$0.64 \pm 0.09$ &$0.63 \pm 0.10$ &
-----------------
agreement_fraction
kruskal test result: H:33.66, p:0.000
61, Mean: M(control):0.74, SD(control):0.17
61, Mean: M(dashboard):0.86, SD(dashboard):0.17
62, Mean: M(chatxai):0.89, SD(chatxai):0.15
61, Mean: M(chatxaiboost):0.85, SD(chatxaiboost):0.16
61, Mean: M(chatxaiAuto):0.89, SD(chatxaiAuto):0.11
agreement_fraction&33.66 & 0.000& $0.74 \pm 0.17$ &$0.86 \pm 0.17$ &$0.89 \pm 0.15$ &$0.85 \pm 0.16$ &$0.89 \pm 0.11$ &
Use pots-hoc analysis
Alternative control < dashboard, pvalue 0.0000 statistic 1088.0000
Use pots-hoc analy