In [1]:
from util import load_user_data
import pandas as pd
import numpy as np
import sys

In [2]:
from scipy.stats import kruskal, mannwhitneyu
def post_hoc_comparison(data_list_1, data_list_2, name1, name2, threshold=0.05 / 4):
    """Compare two samples with one-sided Mann-Whitney U tests in both directions.

    Each direction ("greater", then "less") is tested separately and a line is
    printed for every direction whose p-value falls below ``threshold``. If
    neither direction is significant, a single "no difference" line is printed.

    Args:
        data_list_1: Observations of the first group.
        data_list_2: Observations of the second group.
        name1: Label used for the first group in the printed report.
        name2: Label used for the second group in the printed report.
        threshold: Significance level each one-sided p-value is compared
            against. Defaults to ``0.05 / 4``, the value that used to be
            hard-coded here.
            NOTE(review): confirm that the divisor matches the number of
            comparisons actually being corrected for at the call site.

    Returns:
        bool: True if at least one direction was significant, otherwise False.
    """
    print("Use post-hoc analysis")
    significant = False
    # Same test twice, once per direction; the symbol is only used for the report line.
    for alternative, symbol in (("greater", ">"), ("less", "<")):
        statistic, pvalue = mannwhitneyu(data_list_1, data_list_2, alternative=alternative)
        if pvalue < threshold:
            print(
                "Alternative {} {} {},".format(name1, symbol, name2),
                "pvalue %.4f" % pvalue,
                "statistic %.4f" % statistic,
            )
            significant = True
    if not significant:
        print("No significant difference with post-hoc analysis")
    return significant


In [4]:
# Load the two study exports and merge them into one participant pool.
# NOTE(review): load_user_data is project code; presumably it returns the set of
# valid user ids and a per-user dict of answers -- confirm against util.py.
valid_users, tp_data, _ = load_user_data(filename="../data/xailabdata_all.csv")
valid_users_2, tp_data_2, _ = load_user_data(filename="../data/xailabdata_llm_agent.csv")

valid_users = valid_users | valid_users_2
tp_data.update(tp_data_2)

# Explanation-understanding measures (dependent variables of the later tests).
dimensions = [
    "perceived_feature_understanding",
    "completeness",
    "coherence",
    "clarity",
    "learning_effect",
    "understanding_of_system",
    "usefulness_explanation",
    "objective_feature_understanding",
]
# Covariates read from the "Trust_in_automation" sub-dict of each participant.
trust_covariates = ["Propensity to Trust", "Familiarity"]
# Covariates stored directly on the participant record.
participant_covariates = ["ATI", "mlbackground"]

# One list per column; insertion order fixes the DataFrame column order.
variable_dict = {"condition": []}
variable_dict.update({name: [] for name in dimensions})
variable_dict.update({name: [] for name in trust_covariates + participant_covariates})

for user in valid_users:
    record = tp_data[user]
    tp_condition = record["condition"]
    # Participants in the control condition are left out of the table.
    if tp_condition != "control":
        trust = record["Trust_in_automation"]
        explanation_understanding = record["explanation_understanding"]
        variable_dict["condition"].append(tp_condition)
        for name in dimensions:
            variable_dict[name].append(explanation_understanding[name])
        for name in trust_covariates:
            variable_dict[name].append(trust[name])
        for name in participant_covariates:
            variable_dict[name].append(record[name])

df = pd.DataFrame(variable_dict)
print(df.shape)

245 valid participants
{'control': 61, 'dashboard': 61, 'chatxai': 62, 'chatxaiboost': 61, 'chatxaiAuto': 0}
91 participants blindly rely on AI advice
{'control': 8, 'dashboard': 27, 'chatxai': 32, 'chatxaiboost': 24, 'chatxaiAuto': 0}
61 valid participants
{'control': 0, 'dashboard': 0, 'chatxai': 0, 'chatxaiboost': 0, 'chatxaiAuto': 61}
25 participants blindly rely on AI advice
{'control': 0, 'dashboard': 0, 'chatxai': 0, 'chatxaiboost': 0, 'chatxaiAuto': 25}
(245, 13)


In [5]:
print("-" * 34)
from pingouin import ancova, anova
print("For all participants, compare with experimental conditions")
# One-way ANOVA of every understanding measure across conditions, followed by
# the per-condition mean and standard deviation of that measure.
for dimension in dimensions:
    print(dimension)
    aov = anova(dv=dimension, between='condition', data=df, effsize='n2')
    print(aov.round(3))
    # Group once and reuse the grouped column for both descriptive statistics.
    by_condition = df.groupby(['condition'])[dimension]
    print(by_condition.mean().round(2))
    print(by_condition.std().round(2))
    print("-" * 17)

----------------------------------
For all participants, compare with experimental conditions
perceived_feature_understanding
      Source  ddof1  ddof2      F  p-unc     n2
0  condition      3    241  0.585  0.625  0.007
condition
chatxai         4.24
chatxaiAuto     4.08
chatxaiboost    4.07
dashboard       4.10
Name: perceived_feature_understanding, dtype: float64
condition
chatxai         0.72
chatxaiAuto     0.92
chatxaiboost    0.79
dashboard       0.89
Name: perceived_feature_understanding, dtype: float64
-----------------
completeness
      Source  ddof1  ddof2      F  p-unc     n2
0  condition      3    241  0.131  0.942  0.002
condition
chatxai         3.56
chatxaiAuto     3.60
chatxaiboost    3.53
dashboard       3.60
Name: completeness, dtype: float64
condition
chatxai         0.72
chatxaiAuto     0.77
chatxaiboost    0.64
dashboard       0.67
Name: completeness, dtype: float64
-----------------
coherence
      Source  ddof1  ddof2      F  p-unc     n2
0  condition      3  

  return warn(


In [6]:
print("-" * 34)
from pingouin import ancova, anova

# Covariates controlled for in every ANCOVA below.
covariates = ["Propensity to Trust", "Familiarity", "ATI", "mlbackground"]


def _format_p(p_value):
    """Format a p-value with three decimals, dropping only a leading zero.

    Slicing off the first character unconditionally would turn "1.000" into
    ".000" and "nan" into "an", so the zero is stripped only when it is there.
    """
    text = "{:.3f}".format(p_value)
    return text[1:] if text.startswith("0.") else text


print("For all participants, compare with experimental conditions")
# ANCOVA of every understanding measure across conditions, plus a LaTeX table
# row ("F & p" pairs) and the per-condition mean / standard deviation.
for dimension in dimensions:
    print(dimension)
    aov = ancova(dv=dimension, covar=covariates, between='condition', data=df, effsize='n2')
    print(aov.round(3))
    tp_dict = aov.to_dict()
    # Row 0 is the condition effect and rows 1..len(covariates) are the
    # covariates; the trailing Residual row has no F / p and is skipped.
    cells = [
        "{:.2f} & {}".format(tp_dict['F'][index], _format_p(tp_dict['p-unc'][index]))
        for index in range(len(covariates) + 1)
    ]
    tp_str = " & ".join(cells) + "\\\\"
    print(tp_str)
    print(df.groupby(['condition'])[dimension].mean().round(2))
    print(df.groupby(['condition'])[dimension].std().round(2))
    print("-" * 17)

----------------------------------
For all participants, compare with experimental conditions
perceived_feature_understanding
                Source       SS   DF       F  p-unc     n2
0            condition    1.074    3   0.610  0.609  0.007
1  Propensity to Trust   23.590    1  40.150  0.000  0.144
2          Familiarity    0.172    1   0.293  0.589  0.001
3                  ATI    0.192    1   0.327  0.568  0.001
4         mlbackground    0.053    1   0.089  0.765  0.000
5             Residual  139.247  237     NaN    NaN    NaN
0.61 & .609 & 40.15 & .000 & 0.29 & .589 & 0.33 & .568 & 0.09 & .765\\
condition
chatxai         4.24
chatxaiAuto     4.08
chatxaiboost    4.07
dashboard       4.10
Name: perceived_feature_understanding, dtype: float64
condition
chatxai         0.72
chatxaiAuto     0.92
chatxaiboost    0.79
dashboard       0.89
Name: perceived_feature_understanding, dtype: float64
-----------------
completeness
                Source      SS   DF       F  p-unc     n2
0    

In [7]:
# Kruskal-Wallis test of the objective understanding score across the four
# conditions, followed by pairwise one-sided post-hoc tests when the omnibus
# test is significant.
for dimension in ["objective_feature_understanding"]:
    print(dimension)
    all_conditions = ["dashboard", "chatxai", "chatxaiboost", "chatxaiAuto"]
    # Scores of the current measure, keyed by condition.
    var_dict = {}
    for condition in all_conditions:
        tp_df = df[df['condition'] == condition]
        var_dict[condition] = tp_df[dimension].tolist()
    # kruskal takes one positional sample per group.
    samples = [var_dict[condition] for condition in all_conditions]
    statistic, pvalue = kruskal(*samples)
    print("kruskal test result: H:{:.2f}, p:{:.3f}".format(statistic, pvalue))
    for condition in all_conditions:
        data_list_1 = var_dict[condition]
        # ddof=1 gives the sample standard deviation, which is what pandas
        # .std() reports for the same column in the ANOVA / ANCOVA cells;
        # np.std defaults to the population SD (ddof=0).
        print("{}, Mean: M({}):{:.2f}, SD({}):{:.2f}".format(
            len(data_list_1), condition, np.mean(data_list_1),
            condition, np.std(data_list_1, ddof=1)))
    # NOTE(review): 0.05 / 4 is used as the adjusted alpha for the omnibus test;
    # confirm which family of four tests this correction refers to.
    if pvalue < 0.05 / 4:
        length = len(all_conditions)
        print("-" * 34)
        # Every unordered pair of conditions is compared exactly once.
        for i in range(length - 1):
            for j in range(i + 1, length):
                group_1 = all_conditions[i]
                group_2 = all_conditions[j]
                post_hoc_comparison(var_dict[group_1], var_dict[group_2], group_1, group_2)
        print("-" * 34)

objective_feature_understanding
kruskal test result: H:16.19, p:0.001
61, Mean: M(dashboard):0.88, SD(dashboard):0.07
62, Mean: M(chatxai):0.88, SD(chatxai):0.08
61, Mean: M(chatxaiboost):0.87, SD(chatxaiboost):0.08
61, Mean: M(chatxaiAuto):0.84, SD(chatxaiAuto):0.06
----------------------------------
Use pots-hoc analysis
No significant difference with post-hoc analysis
Use pots-hoc analysis
No significant difference with post-hoc analysis
Use pots-hoc analysis
Alternative dashboard > chatxaiAuto, pvalue 0.0007 statistic 2487.5000
Use pots-hoc analysis
No significant difference with post-hoc analysis
Use pots-hoc analysis
Alternative chatxai > chatxaiAuto, pvalue 0.0004 statistic 2559.0000
Use pots-hoc analysis
Alternative chatxaiboost > chatxaiAuto, pvalue 0.0006 statistic 2492.5000
----------------------------------
