In [2]:
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon

from utils import *

In [10]:
# Load the study data; load_data() is provided by the project's utils module.
df = load_data()

print("Trust change within one condition: comparing s2 and s3")
print("At each step, answer equal to AI answer when AI is correct - answer equal to AI answer when AI is incorrect")
print()
# Trust within one condition: comparing s2 and s3.
# For every row, the score at a step is
#   (#questions where the step answer matches the AI answer and the AI matches `_gt`) / 20
# - (#questions where the step answer matches the AI answer and the AI differs from `_gt`) / 20
# NOTE(review): `{q}_2` / `{q}_3` are presumably the step-2 / step-3 answers and
# `{q}_gt` the ground truth -- confirm against load_data().
for i in range(1, 7):
    temp = df[df['condition'] == i]
    print('Condition: ' + str(i))
    # Collect one score per row in plain lists and convert once after the loop,
    # instead of re-allocating an array with np.append on every iteration.
    s2_scores = []
    s3_scores = []
    for index, row in temp.iterrows():
        s2_correct = 0
        s3_correct = 0
        s2_incorrect = 0
        s3_incorrect = 0
        for q in range(1, 21):
            if row[f'{q}_ai'] == row[f'{q}_gt']:
                # AI answer equals the `_gt` value: count agreement with it.
                if row[f'{q}_ai'] == row[f'{q}_2']:
                    s2_correct += 1
                if row[f'{q}_ai'] == row[f'{q}_3']:
                    s3_correct += 1
            else:
                # AI answer differs from the `_gt` value: count agreement with it.
                if row[f'{q}_ai'] == row[f'{q}_2']:
                    s2_incorrect += 1
                if row[f'{q}_ai'] == row[f'{q}_3']:
                    s3_incorrect += 1
        s2_scores.append((s2_correct/20) - (s2_incorrect/20))
        s3_scores.append((s3_correct/20) - (s3_incorrect/20))
    change_rate_s2 = np.array(s2_scores, dtype=float)
    change_rate_s3 = np.array(s3_scores, dtype=float)
    # Mean and (population, ddof=0) standard deviation of the s2 and s3 scores.
    s2_mean = np.mean(change_rate_s2)
    s3_mean = np.mean(change_rate_s3)
    s2_std = np.std(change_rate_s2)
    s3_std = np.std(change_rate_s3)
    # Paired comparison: element k of both arrays comes from the same row.
    stat, p_value = wilcoxon(change_rate_s2, change_rate_s3)
    sig = " *" if p_value < 0.05 else ""
    
    print(f"    s2 trust change: mean = {s2_mean:.5}, std = {s2_std:.5}")
    print(f"    s3 trust change mean = {s3_mean:.5}, std = {s3_std:.5}")
    print(f"    Wilcoxon signed-rank test: p-value = {p_value} {sig}")
    print()



Trust change within one condition: comparing s2 and s3
At each step, answer equal to AI answer when AI is correct - answer equal to AI answer when AI is incorrect

Condition: 1
    s2 trust change: mean = 0.35556, std = 0.10123
    s3 trust change mean = 0.38056, std = 0.064848
    Wilcoxon signed-rank test: p-value = 0.1391167108102643 

Condition: 2
    s2 trust change: mean = 0.45556, std = 0.084802
    s3 trust change mean = 0.46111, std = 0.07738
    Wilcoxon signed-rank test: p-value = 0.46889018958080375 

Condition: 3
    s2 trust change: mean = 0.50833, std = 0.067185
    s3 trust change mean = 0.54167, std = 0.073125
    Wilcoxon signed-rank test: p-value = 0.01151382807684542  *

Condition: 4
    s2 trust change: mean = 0.45278, std = 0.094974
    s3 trust change mean = 0.45833, std = 0.097539
    Wilcoxon signed-rank test: p-value = 0.15729920705028502 

Condition: 5
    s2 trust change: mean = 0.42778, std = 0.088541
    s3 trust change mean = 0.48889, std = 0.1185
    Wil

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [11]:
import numpy as np
from scipy.stats import shapiro, kruskal, f_oneway, levene, mannwhitneyu

def kruskal_wallis_or_anova(conditions, alpha=0.05):
    """Compare independent samples with one-way ANOVA or Kruskal-Wallis and print the results.

    The omnibus test is chosen from the data:

    * one-way ANOVA when every sample passes Shapiro-Wilk (p >= alpha) and
      Levene's test does not reject equal variances (p >= alpha);
    * Kruskal-Wallis otherwise.

    When the omnibus test is significant (p < alpha), pairwise post-hoc
    comparisons are printed: Tukey's HSD after ANOVA, Mann-Whitney U tests
    after Kruskal-Wallis. The Mann-Whitney p-values are compared to ``alpha``
    as-is, i.e. without any multiple-comparison correction.

    Finally the mean and standard deviation (``np.std``, ddof=0) of every
    sample are printed.

    Parameters
    ----------
    conditions : sequence of array-like
        One 1-D sample per condition; printed labels are 1-based
        ("Condition 1", "Condition 2", ...).
    alpha : float, optional
        Significance threshold used for every test. Defaults to 0.05.

    Returns
    -------
    None
        All results are written to stdout.
    """
    num_conditions = len(conditions)

    # Step 1: Check normality for each condition (Shapiro-Wilk)
    all_normal = True
    for condition in conditions:
        _, p_value = shapiro(condition)
        if p_value < alpha:
            all_normal = False

    # Step 2: Check for homogeneity of variance using Levene's test
    _, p_value_levene = levene(*conditions)
    equal_variances = p_value_levene >= alpha

    # Step 3: Decide whether to use ANOVA or Kruskal-Wallis
    if all_normal and equal_variances:
        # Perform one-way ANOVA
        stat, p_value_anova = f_oneway(*conditions)
        p_value_anova_sig = " *" if p_value_anova < alpha else ""
        print(f"ANOVA test statistic: {stat} {p_value_anova_sig}")
        print(f"P-value: {p_value_anova}")

        if p_value_anova < alpha:
            print("Significant result found in ANOVA. Performing post-hoc analysis with Tukey's HSD...")
            # Imported here so the other branches do not depend on it.
            from scipy.stats import tukey_hsd

            tukey_results = tukey_hsd(*conditions)
            # tukey_results.pvalue is a (k, k) matrix of pairwise p-values;
            # report the upper triangle, one line per pair.
            for i in range(num_conditions):
                for j in range(i + 1, num_conditions):
                    p_value_tukey = tukey_results.pvalue[i, j]
                    p_value_tukey_sig = " *" if p_value_tukey < alpha else ""
                    print(f'    Condition {i+1} vs Condition {j+1} - Tukey HSD p-value: {p_value_tukey} {p_value_tukey_sig}')
            print()
    else:
        # Perform Kruskal-Wallis test
        stat, p_value_kruskal = kruskal(*conditions)
        p_value_kruskal_sig = " *" if p_value_kruskal < alpha else ""
        print(f"Kruskal-Wallis test statistic: {stat} {p_value_kruskal_sig}")
        print(f"P-value: {p_value_kruskal}")

        if p_value_kruskal < alpha:
            print("Significant result found in Kruskal-Wallis test. Performing post-hoc pairwise Mann-Whitney U tests...")
            # Perform pairwise Mann-Whitney U tests (uncorrected p-values)
            for i in range(num_conditions):
                for j in range(i + 1, num_conditions):
                    stat, p_value_mannwhitney = mannwhitneyu(conditions[i], conditions[j])
                    p_value_mannwhitney_sig = " *" if p_value_mannwhitney < alpha else ""
                    print(f'    Condition {i+1} vs Condition {j+1} - Mann-Whitney U test p-value: {p_value_mannwhitney} {p_value_mannwhitney_sig}')
            print()

    # calculate mean and std for each condition
    for i, condition in enumerate(conditions, start=1):
        mean = np.mean(condition)
        std = np.std(condition)
        print(f"    Condition {i}: mean = {mean:.5}, std = {std:.5}")

# Between-condition comparison, run separately for step 2 and step 3.
# Score per row: (agreements with the AI when the AI matches `_gt`) / 20
#              - (agreements with the AI when the AI differs from `_gt`) / 20
print(f"Across all conditions, comparing trust change in s2 and s3")
print("In each step, trust change = (answer == AI answer when AI answer == GT) - (answer == AI answer when AI answer != GT)")
print()
for s in (2, 3):
    print(f"Step {s} analysis:\n")
    conditions = []
    for c in range(1, 7):
        temp = df[df['condition'] == c]
        scores = []
        for _, row in temp.iterrows():
            ai_same_correct = 0
            ai_same_incorrect = 0
            for q in range(1, 21):
                ai_answer = row[f'{q}_ai']
                # Only questions where the step answer matches the AI contribute.
                if ai_answer == row[f'{q}_{s}']:
                    if ai_answer == row[f'{q}_gt']:
                        ai_same_correct += 1
                    else:
                        ai_same_incorrect += 1
            scores.append((ai_same_correct/20) - (ai_same_incorrect/20))
        # One float array of per-row scores for this condition.
        conditions.append(np.array(scores, dtype=float))

    kruskal_wallis_or_anova(conditions)
    print()

Across all conditions, comparing trust change in s2 and s3
In each step, trust change = (answer == AI answer when AI answer == GT) - (answer == AI answer when AI answer != GT)

Step 2 analysis:

Kruskal-Wallis test statistic: 25.797368883251515  *
P-value: 9.768346331472466e-05
Significant result found in Kruskal-Wallis test. Performing post-hoc pairwise Mann-Whitney U tests...
    Condition 1 vs Condition 2 - Mann-Whitney U test p-value: 0.0013506257525764357  *
    Condition 1 vs Condition 3 - Mann-Whitney U test p-value: 2.544082419698534e-05  *
    Condition 1 vs Condition 4 - Mann-Whitney U test p-value: 0.0023803549833249075  *
    Condition 1 vs Condition 5 - Mann-Whitney U test p-value: 0.014580870343033287  *
    Condition 1 vs Condition 6 - Mann-Whitney U test p-value: 7.314594205400932e-05  *
    Condition 2 vs Condition 3 - Mann-Whitney U test p-value: 0.13330775147950868 
    Condition 2 vs Condition 4 - Mann-Whitney U test p-value: 0.7600459454747588 
    Condition 2 vs C