In [1]:
from util import load_user_data
import pandas as pd
import sys
import numpy as np


# Load the two data exports; the third value returned by load_user_data is not used here.
valid_users, tp_data, _ = load_user_data(filename="../data/xailabdata_all.csv")
valid_users_2, tp_data_2, _ = load_user_data(filename="../data/xailabdata_llm_agent.csv")

# Fold the second export into the first so both are analysed together.
# NOTE(review): `|` is presumably a set union of user ids -- confirm against load_user_data.
valid_users = valid_users | valid_users_2
tp_data.update(tp_data_2)

all_conditions = ["control", "dashboard", "chatxai", "chatxaiboost", "chatxaiAuto"]
trust_dimensions = ["Propensity to Trust", "Familiarity"]  # read from the "Trust_in_automation" record
profile_dimensions = ["ATI", "mlbackground"]  # read directly from the participant record

engagement_list = []
engagement_dict = {name: [] for name in all_conditions}
# Key order matters: it becomes the column order of the DataFrame built from this dict.
variable_dict = {
    column: []
    for column in ["condition", "engagement"] + trust_dimensions + profile_dimensions
}

# One pass over the participants: append one value per participant to every column list,
# and bucket the engagement score by experimental condition.
for user in valid_users:
    record = tp_data[user]
    tp_condition = record["condition"]
    trust = record["Trust_in_automation"]
    user_engagement = record["user_engagement_scale"]

    variable_dict["condition"].append(tp_condition)
    variable_dict["engagement"].append(user_engagement)
    for dimension in trust_dimensions:
        variable_dict[dimension].append(trust[dimension])
    for dimension in profile_dimensions:
        variable_dict[dimension].append(record[dimension])

    engagement_dict[tp_condition].append(user_engagement)
    engagement_list.append(user_engagement)

# NOTE: np.std defaults to ddof=0 (population SD), whereas pandas' .std() defaults to ddof=1,
# so SDs printed here can differ slightly from groupby(...).std() on the same data.
overall_mean = np.mean(engagement_list)
overall_sd = np.std(engagement_list)
print("For all users, M: {:.2f}, SD: {:.2f}".format(overall_mean, overall_sd))
for condition in all_conditions:
    scores = engagement_dict[condition]
    print(condition, len(scores))
    print("M: {:.2f}, SD: {:.2f}".format(np.mean(scores), np.std(scores)))


245 valid participants
{'control': 61, 'dashboard': 61, 'chatxai': 62, 'chatxaiboost': 61, 'chatxaiAuto': 0}
91 participants blindly rely on AI advice
{'control': 8, 'dashboard': 27, 'chatxai': 32, 'chatxaiboost': 24, 'chatxaiAuto': 0}
61 valid participants
{'control': 0, 'dashboard': 0, 'chatxai': 0, 'chatxaiboost': 0, 'chatxaiAuto': 61}
25 participants blindly rely on AI advice
{'control': 0, 'dashboard': 0, 'chatxai': 0, 'chatxaiboost': 0, 'chatxaiAuto': 25}
For all users, M: 3.28, SD: 0.68
control 61
M: 3.15, SD: 0.72
dashboard 61
M: 3.33, SD: 0.66
chatxai 62
M: 3.20, SD: 0.63
chatxaiboost 61
M: 3.28, SD: 0.67
chatxaiAuto 61
M: 3.44, SD: 0.71


In [2]:
from scipy.stats import kruskal, mannwhitneyu

def post_hoc_comparison(data_list_1, data_list_2, name1, name2, threshold=0.05 / 4):
	"""Compare two samples with one-sided Mann-Whitney U tests and print the outcome.

	Both directions are tested ('greater' and 'less'). Every direction whose
	p-value is below ``threshold`` is printed together with its p-value and
	U statistic; if neither direction is significant, a single
	"no significant difference" line is printed instead.

	Args:
		data_list_1: Observations of the first group.
		data_list_2: Observations of the second group.
		name1: Label used for the first group in the printed output.
		name2: Label used for the second group in the printed output.
		threshold: Significance level applied to each one-sided test. The
			default (0.05 / 4) is the value that was previously hard-coded;
			pass a different value to correct for the actual number of
			comparisons being made.

	Returns:
		None. Results are reported via ``print`` only.
	"""
	print("Use post-hoc analysis")
	flag = False
	# Same test in both directions; only the alternative and the printed symbol differ.
	for alternative, symbol in (("greater", ">"), ("less", "<")):
		statistic, pvalue = mannwhitneyu(data_list_1, data_list_2, alternative=alternative)
		if pvalue < threshold:
			print("Alternative {} {} {},".format(name1, symbol, name2), "pvalue %.4f"%pvalue, "statistic %.4f"%statistic)
			flag = True
	if not flag:
		print("No significant difference with post-hoc analysis")

In [3]:
# Omnibus Kruskal-Wallis test of engagement across all conditions, followed by
# pairwise post-hoc comparisons when the omnibus test is significant.
print("For all participants, compare with experimental conditions")
condition_samples = [engagement_dict[name] for name in all_conditions]
print(len(condition_samples))
statistic, pvalue = kruskal(*condition_samples)
print("kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))

# Build one LaTeX table row: H and p first, then "mean \pm SD" for each condition.
# NOTE: np.std defaults to ddof=0 (population SD).
row_parts = ["&" + "{:.2f} & {:.3f}& ".format(statistic, pvalue)]
for condition in all_conditions:
    data_list_1 = engagement_dict[condition]
    group_mean = np.mean(data_list_1)
    group_sd = np.std(data_list_1)
    print("{}, Mean: M({}):{:.2f}, SD({}):{:.2f}".format(len(data_list_1), condition, group_mean, condition, group_sd))
    row_parts.append("${:.2f} \\pm {:.2f}$ &".format(group_mean, group_sd))
tp_str = "".join(row_parts)
print(tp_str)

# Open question left by the original author: should the omnibus test use 0.05,
# with the adjusted threshold applied only when drawing the post-hoc conclusions?
if pvalue < 0.05 / 4:
    # Visit every unordered pair of conditions once, in list order.
    for i, group_1 in enumerate(all_conditions[:-1]):
        for group_2 in all_conditions[i + 1:]:
            post_hoc_comparison(engagement_dict[group_1], engagement_dict[group_2], group_1, group_2)
print("-" * 17)

For all participants, compare with experimental conditions
5
kruskal test result: H:7.14, p:0.129
61, Mean: M(control):3.15, SD(control):0.72
61, Mean: M(dashboard):3.33, SD(dashboard):0.66
62, Mean: M(chatxai):3.20, SD(chatxai):0.63
61, Mean: M(chatxaiboost):3.28, SD(chatxaiboost):0.67
61, Mean: M(chatxaiAuto):3.44, SD(chatxaiAuto):0.71
&7.14 & 0.129& $3.15 \pm 0.72$ &$3.33 \pm 0.66$ &$3.20 \pm 0.63$ &$3.28 \pm 0.67$ &$3.44 \pm 0.71$ &
-----------------


In [4]:
# Turn the per-participant column lists into a table (one row per participant)
# and preview it: shape first, then the first five rows.
df = pd.DataFrame(data=variable_dict)
n_rows, n_columns = df.shape
print((n_rows, n_columns))
df.head(n=5)

(306, 6)


Unnamed: 0,condition,engagement,Propensity to Trust,Familiarity,ATI,mlbackground
0,chatxaiAuto,4.416667,3.333333,1.5,1.555556,0
1,chatxaiAuto,4.0,3.666667,3.0,4.777778,1
2,dashboard,3.5,3.0,2.0,3.555556,0
3,chatxaiboost,3.666667,3.333333,2.0,3.666667,1
4,chatxaiAuto,4.75,5.0,4.0,4.888889,1


In [5]:
# One-way ANOVA and ANCOVA of engagement across conditions, plus a LaTeX row
# with "F & p" for the condition effect and for each covariate.
from pingouin import ancova, anova

print("-" * 34)
df = pd.DataFrame(variable_dict)
print(df.shape)
print("For all participants, compare with experimental conditions")
dimension = "engagement"
print(dimension)
covariates = ["Propensity to Trust", "Familiarity", "ATI", "mlbackground"]

aov = anova(dv=dimension, between='condition', data=df, effsize='n2')
print(aov.round(3))
aov = ancova(dv=dimension, covar=covariates, between='condition', data=df, effsize='n2')
print(aov.round(3))
tp_dict = aov.to_dict()

# The ANCOVA table printed by this cell lists the condition effect first, then one row
# per covariate, then a Residual row with NaN F/p. Only the effect rows go into the
# LaTeX output, so the row count is derived from the covariate list.
n_effect_rows = 1 + len(covariates)
row_cells = []
for index in range(n_effect_rows):
    p_text = "{:.3f}".format(tp_dict['p-unc'][index])
    # Drop only a leading zero (0.101 -> .101). Slicing off the first character
    # unconditionally would turn a p-value that rounds to 1.000 into ".000".
    if p_text.startswith("0"):
        p_text = p_text[1:]
    row_cells.append("{:.2f} & {}".format(tp_dict['F'][index], p_text))
tp_str = " & ".join(row_cells) + "\\\\"
print(tp_str)

# Per-condition descriptives. pandas' .std() uses ddof=1 (sample SD) by default.
engagement_by_condition = df.groupby(['condition'])[dimension]
print(engagement_by_condition.mean().round(2))
print(engagement_by_condition.std().round(2))
print("-" * 17)

----------------------------------
(306, 6)
For all participants, compare with experimental conditions
engagement
      Source  ddof1  ddof2     F  p-unc     n2
0  condition      4    301  1.63  0.167  0.021


  return warn(


                Source       SS   DF       F  p-unc     n2
0            condition    2.656    4   1.957  0.101  0.020
1  Propensity to Trust   21.740    1  64.085  0.000  0.166
2          Familiarity    1.136    1   3.349  0.068  0.009
3                  ATI    4.469    1  13.173  0.000  0.034
4         mlbackground    0.484    1   1.427  0.233  0.004
5             Residual  100.755  297     NaN    NaN    NaN
1.96 & .101 & 64.09 & .000 & 3.35 & .068 & 13.17 & .000 & 1.43 & .233\\
condition
chatxai         3.20
chatxaiAuto     3.44
chatxaiboost    3.28
control         3.15
dashboard       3.33
Name: engagement, dtype: float64
condition
chatxai         0.63
chatxaiAuto     0.72
chatxaiboost    0.67
control         0.72
dashboard       0.66
Name: engagement, dtype: float64
-----------------
