In [49]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

# Load the survey export; the first CSV line becomes the column names.
df = pd.read_csv('./data-final.csv')

# remove header: the first two rows after the column-name line are dropped
df = df.iloc[2:]
head = df.head(0)  # zero-row frame that keeps only the column layout

# A participant belongs to a source when their first answer for that source
# is a string (cells that are not strings, e.g. NaN, are filtered out).
drug = df[df['1_qs-drug-ans'].map(lambda value: isinstance(value, str))]
pm = df[df['1_qs-pm-ans'].map(lambda value: isinstance(value, str))]
shoplift = df[df['1_qs-shoplift-ans'].map(lambda value: isinstance(value, str))]

# Per-source participant frames, keyed by the source tag used in column names.
sources_df = dict(drug=drug, pm=pm, shoplift=shoplift)

: 

In [None]:
pd.set_option('future.no_silent_downcasting', True)

conditions = ['control', 'honest summary', 'honest chatbot', 'misleading summary', 'misleading chatbot']
sources = ['drug', 'pm', 'shoplift']


def sum_ans(df, condition, type="false", source=None):
    """Count, per participant, the answers of one kind for one condition.

    Answers are coded "1" (yes), "2" (unsure) and "3" (no). Questions 1-10
    are the "real" columns and questions 11-15 the "fake" columns.

    Parameters
    ----------
    df : DataFrame holding a ``condition`` column and the
        ``{i}_qs-{source}-ans`` answer columns.
    condition : value of the ``condition`` column to keep.
    type : which answers to count:
        "true"   -> "yes" on real questions, "no" on fake questions;
        "unsure" -> "unsure" on both;
        "false"  -> "no" on real questions, "yes" on fake questions.
    source : source tag used in the column names (e.g. ``"drug"``). When
        omitted, the module-level variable ``source`` is used, which is what
        the original implementation always did implicitly.

    Returns
    -------
    list of three Series (one value per participant):
    ``[real_count, fake_count, real_count + fake_count]``.

    Raises
    ------
    NameError
        If ``source`` is not passed and no global ``source`` exists.
    """
    if source is None:
        # Backward compatibility with callers that rely on the loop variable
        # `source` being visible at notebook scope.
        try:
            source = globals()["source"]
        except KeyError:
            raise NameError(
                "name 'source' is not defined; pass source=... explicitly"
            ) from None

    real_col = [f"{i}_qs-{source}-ans" for i in range(1, 11)]
    fake_col = [f"{i}_qs-{source}-ans" for i in range(11, 16)]

    yes = {"1": 1, "2": 0, "3": 0}
    unsure = {"1": 0, "2": 1, "3": 0}
    no = {"1": 0, "2": 0, "3": 1}

    # [mapping for real questions, mapping for fake questions]
    types = {
        "true": [yes, no],
        "unsure": [unsure, unsure],
        "false": [no, yes]
    }

    # Filter the rows once instead of once per column group.
    rows = df[df['condition'] == condition]
    real_sum = rows[real_col].replace(types[type][0]).sum(axis=1)
    fake_sum = rows[fake_col].replace(types[type][1]).sum(axis=1)
    df_overall = real_sum + fake_sum

    # A format spec gives the same text as "{:.2f}".format(...) without
    # needing nested same-quote f-strings (Python 3.12+ only).
    print(f"{condition} -> mean: {df_overall.mean():.2f} | raw: {df_overall.tolist()}")
    return [real_sum, fake_sum, df_overall]
    
# For every condition: [real counts, fake counts, combined counts],
# pooled over all sources.
overall = {name: [[], [], []] for name in conditions}

# NOTE: the loop variable must stay named `source`; the sum_ans defined in
# this cell reads it from the notebook namespace.
for source in sources:
    print(source)
    for condition in conditions:
        score = sum_ans(sources_df[source], condition)
        for bucket, series in zip(overall[condition], score):
            bucket.extend(series.tolist())
    print()

In [None]:
import numpy as np
from scipy.stats import shapiro, kruskal
from scikit_posthocs import posthoc_dunn
import matplotlib.pyplot as plt
import matplotlib

# Global font size for every figure drawn below.
matplotlib.rcParams['font.size'] = 18

def prepare(groups):
    """Summarise each group and print one LaTeX-formatted line per group.

    Parameters
    ----------
    groups : dict mapping a group label to a sequence of numbers.

    Returns
    -------
    ``[means, errors, sds]`` - three lists in the iteration order of
    ``groups``. ``sds`` uses ``np.std`` with its default arguments and
    ``errors`` is that value divided by ``sqrt(n)``.
    """
    means, errors, sds = [], [], []

    for label, values in groups.items():
        centre = np.mean(values)
        spread = np.std(values)
        # Error bar: standard deviation divided by the square root of the
        # sample size.
        stderr = spread / np.sqrt(len(values))

        means.append(centre)
        sds.append(spread)
        errors.append(stderr)

        print(f"{label}: \\textit{{M}}={centre:.3f}, \\textit{{err}}={stderr:.3f}, \\textit{{s.d.}}={spread:.3f};")

    return [means, errors, sds]

def plot(data, sig, limit=100):
    """Draw one bar per group with error bars and significance brackets.

    Parameters
    ----------
    data : dict of group label -> values; summarised through ``prepare``.
    sig : iterable of ``[p_value, ii, jj]`` where ``ii``/``jj`` are 1-based
        group positions to connect with a bracket.
    limit : selects the layout preset; 30 and 100 have dedicated spacing,
        figure height and y-limits, any other value uses the small defaults
        and an upper y-limit of ``limit + 0.2``.
    """
    ordered_labels = [''] * 5
    colors = ["#D5D5D5", "#AEBA8B", "#6A9869", "#F5793E", "#DA3925"]

    # X positions
    x_pos = np.arange(len(conditions))

    # *    P <= 0.05, **   P <= 0.01, ***  P <= 0.001, **** P <= 0.0001
    star_cutoffs = ((0.0001, "****"), (0.001, "***"), (0.01, "**"))

    significance_values = []
    for val, ii, jj in sig:
        star = next((mark for cutoff, mark in star_cutoffs if val <= cutoff), "*")
        # Convert the 1-based group positions to 0-based bar indices.
        significance_values.append(((ii - 1, jj - 1), f"{star} P={val:.5f}"))

    means, errors, _sds = prepare(data)

    # Bar chart with error bars.
    fig_height = 7 if limit == 30 else 8
    plt.figure(figsize=(12, fig_height))
    plt.bar(x_pos, means, yerr=errors, capsize=5, color=colors)
    plt.xticks(x_pos, ordered_labels)

    # (base offset above the tallest bar, step between brackets, bracket height)
    presets = {30: (1, 2.5, 1), 100: (2, 8, 2)}
    y_base_offset, y_increment, h = presets.get(limit, (0.1, 0.4, 0.1))
    col = 'k'

    # First bracket sits above the tallest bar plus the largest error bar;
    # each following bracket is stacked one increment higher.
    current_y_position = max(means) + max(errors) + y_base_offset

    for (i, j), label in significance_values:
        x1, x2 = x_pos[i], x_pos[j]
        y = current_y_position
        plt.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.5, c=col)
        plt.text((x1 + x2) * 0.5, y + h, label, ha='center', va='bottom', color=col,
                 family='Helvetica Neue')
        current_y_position += y_increment

    y_bottom = -8 if limit == 30 else 0
    y_top = 104 if limit == 100 else limit + 0.2
    plt.ylim(y_bottom, y_top)
    plt.tight_layout()
    plt.show()

def stats(groups, limit=100):
    """Print normality / omnibus / post-hoc tests for the groups, then plot.

    Steps, in order:
      1. Shapiro-Wilk on every group (printed).
      2. Kruskal-Wallis across all groups (printed).
      3. Dunn's post-hoc test (printed); pairs with p < 0.05 are collected.
      4. ``plot`` is called; the significant pairs are only passed on when
         the Kruskal-Wallis p-value is itself below 0.05.

    Parameters
    ----------
    groups : dict of group label -> sequence of numbers.
    limit : forwarded unchanged to ``plot``.
    """
    print("\nSHAPIRO-WILK")
    for label, values in groups.items():
        print(label, "=>", shapiro(values))

    print("\nKRUSKAL")
    k = kruskal(*groups.values())
    print(k)

    print("\nDUNN")
    # Computed once and reused for both the printout and the pair scan.
    dunn = posthoc_dunn([*groups.values()])
    print(dunn)
    dunn_dict = dunn.to_dict()

    # The scan below indexes the Dunn table by 1-based group position.
    # Deriving the upper bound from the input replaces the hard-coded
    # 5-group range.
    n_groups = len(groups)
    sig = []
    for ii in range(2, n_groups + 1):
        for jj in range(1, ii):
            p_value = dunn_dict[ii][jj]
            if p_value < 0.05:
                sig.append([p_value, ii, jj])

    plot(groups, sig if k.pvalue < 0.05 else [], limit)

def sum_ans(s, condition, type="false"):
    """Count answers of one kind per participant for a source and condition.

    Answers are coded "1" (yes), "2" (unsure), "3" (no). Questions 1-10 are
    the real columns, 11-15 the fake columns of ``sources_df[s]``.

    ``type`` selects what is counted: "true" counts yes on real / no on fake,
    "unsure" counts unsure on both, "false" counts no on real / yes on fake.

    Returns ``[real_count, fake_count, real_count + fake_count]`` as Series
    with one entry per participant in ``condition``.
    """
    codes = ("1", "2", "3")

    def indicator(target):
        # 1 for the targeted answer code, 0 for the other two.
        return {code: int(code == target) for code in codes}

    yes, unsure, no = indicator("1"), indicator("2"), indicator("3")
    real_map, fake_map = {
        "true": [yes, no],
        "unsure": [unsure, unsure],
        "false": [no, yes]
    }[type]

    frame = sources_df[s]
    in_condition = frame['condition'] == condition
    real_col = [f"{q}_qs-{s}-ans" for q in range(1, 11)]
    fake_col = [f"{q}_qs-{s}-ans" for q in range(11, 16)]

    real_total = frame.loc[in_condition, real_col].replace(real_map).sum(axis=1)
    fake_total = frame.loc[in_condition, fake_col].replace(fake_map).sum(axis=1)

    return [real_total, fake_total, real_total + fake_total]

def get_data(type="false"):
    """Pool answer counts over all sources and run ``stats`` on them.

    For each condition three lists are accumulated (real, fake, combined),
    every count multiplied by 20. Only the fake-question list (index 1) of
    each condition is handed to ``stats``.
    """
    overall = {name: [[], [], []] for name in conditions}

    for source in sources:
        for condition in conditions:
            score = sum_ans(source, condition, type)
            for bucket, series in zip(overall[condition], score):
                bucket.extend(x * 20 for x in series.tolist())

    stats({name: parts[1] for name, parts in overall.items()})

# Run the analysis once per answer kind. The loop variable is deliberately
# not called `type`: at notebook scope that would rebind the builtin for
# every later cell.
for answer_type in ['true', 'unsure', 'false']:
    print(answer_type)
    get_data(answer_type)

In [None]:
def sum_score(s, condition, type="false"):
    """Mean confidence on the answers of one kind, per participant.

    Answers are coded "1" (yes), "2" (unsure), "3" (no); questions 1-10 are
    the real columns and 11-15 the fake columns of ``sources_df[s]``. Each
    ``-ans`` column has a matching ``-conf`` column that is cast to int.

    ``type`` selects which answers count: "true" (yes on real / no on fake),
    "unsure" (unsure on both) or "false" (no on real / yes on fake).

    Returns
    -------
    list of three Series, one value per participant in ``condition``:
    ``[real_mean_conf, fake_mean_conf, overall_mean_conf]``. A participant
    with no counted answers gets 0, because the zero denominator is
    replaced by 1.

    Notes
    -----
    Two defects of the previous version are fixed here:
    * the answer recoding was applied to the confidence columns as well, so
      confidence ratings "1"/"2"/"3" were overwritten with 0/1;
    * the overall value was computed as ``real + fake / n`` instead of
      ``(real + fake) / n``.
    """
    df = sources_df[s]
    real_ans = [f"{i}_qs-{s}-ans" for i in range(1, 11)]
    fake_ans = [f"{i}_qs-{s}-ans" for i in range(11, 16)]
    real_conf = [f"{i}_qs-{s}-conf" for i in range(1, 11)]
    fake_conf = [f"{i}_qs-{s}-conf" for i in range(11, 16)]

    yes = {"1": 1, "2": 0, "3": 0}
    unsure = {"1": 0, "2": 1, "3": 0}
    no = {"1": 0, "2": 0, "3": 1}

    # [mapping for real questions, mapping for fake questions]
    types = {
        "true": [yes, no],
        "unsure": [unsure, unsure],
        "false": [no, yes]
    }
    real_map, fake_map = types[type]

    rows = df[df['condition'] == condition]

    def tally(ans_cols, conf_cols, mapping):
        """Return (number of counted answers, summed confidence on them)."""
        # Recode ONLY the answer columns; confidence keeps its raw rating.
        hits = rows[ans_cols].replace(mapping)
        conf = rows[conf_cols].astype(int)
        # Relabel so the element-wise product pairs question i with its own
        # confidence column.
        conf.columns = ans_cols
        return hits.sum(axis=1), (hits * conf).sum(axis=1)

    real_n, real_conf_sum = tally(real_ans, real_conf, real_map)
    fake_n, fake_conf_sum = tally(fake_ans, fake_conf, fake_map)

    # replace(0, 1) avoids dividing by zero when nothing was counted.
    real_mean = (real_conf_sum / real_n.replace(0, 1)).rename('conf')
    fake_mean = (fake_conf_sum / fake_n.replace(0, 1)).rename('conf')
    overall_mean = (real_conf_sum + fake_conf_sum) / (real_n + fake_n).replace(0, 1)

    return [real_mean, fake_mean, overall_mean]

def get_data(type="false"):
    """Pool per-participant confidence values over sources and run ``stats``.

    For each condition three lists are accumulated from ``sum_score``
    (real, fake, overall). Only the fake-question list (index 1) of each
    condition is handed to ``stats``, with a plot limit of 7.
    """
    overall_conf = {name: [[], [], []] for name in conditions}

    for source in sources:
        for condition in conditions:
            score = sum_score(source, condition, type)
            for bucket, series in zip(overall_conf[condition], score):
                bucket.extend(series.tolist())

    stats({name: parts[1] for name, parts in overall_conf.items()}, 7)

# Run the confidence analysis once per answer kind. The loop variable is
# deliberately not called `type`: at notebook scope that would rebind the
# builtin for every later cell.
for answer_type in ['true', 'unsure', 'false']:
    print(answer_type)
    get_data(answer_type)

In [None]:
def sum_score(s, condition):
    """Confidence-weighted signed score per participant.

    Answers are coded "1"/"2"/"3". On the real questions (1-10) they are
    recoded to +1 / 0 / -1, on the fake questions (11-15) to -1 / 0 / +1.
    Each recoded answer is multiplied by the integer value of the matching
    ``-conf`` column and the products are summed per participant.

    Returns
    -------
    list of three Series, one value per participant in ``condition``:
    ``[real_score, fake_score, real_score + fake_score]``.

    Notes
    -----
    The previous version applied the answer recoding to the confidence
    columns too, turning a confidence of "3" into -1 (or +1) and "2" into 0.
    The recoding is now restricted to the answer columns.
    """
    df = sources_df[s]
    rows = df[df['condition'] == condition]

    def weighted(questions, mapping):
        ans_cols = [f"{i}_qs-{s}-ans" for i in questions]
        conf_cols = [f"{i}_qs-{s}-conf" for i in questions]
        # Recode ONLY the answers; confidence keeps its raw rating.
        signs = rows[ans_cols].replace(mapping)
        conf = rows[conf_cols].astype(int)
        # Relabel so the element-wise product pairs question i with its own
        # confidence column.
        conf.columns = ans_cols
        return (signs * conf).sum(axis=1).rename('score')

    real_score = weighted(range(1, 11), {"1": 1, "2": 0, "3": -1})
    fake_score = weighted(range(11, 16), {"1": -1, "2": 0, "3": 1})

    return [real_score, fake_score, real_score + fake_score]

# For every condition: [real scores, fake scores, combined scores],
# pooled over all sources.
overall_score = {name: [[], [], []] for name in conditions}

for source in sources:
    for condition in conditions:
        score = sum_score(source, condition)
        for bucket, series in zip(overall_score[condition], score):
            bucket.extend(series.tolist())

# Only the fake-question scores (index 1) are tested and plotted.
stats({name: parts[1] for name, parts in overall_score.items()}, 30)

In [None]:
def get_data_by_source(type="false", source="drug"):
    """Collect scaled answer counts for a single source.

    For every condition, ``sum_ans(source, condition, type)`` is evaluated,
    its fake-question counts are printed, and all three count lists are
    multiplied by 20 and stored.

    Returns a dict mapping each condition to its scaled fake-question list
    (index 1 of the three lists).
    """
    overall = {name: [[], [], []] for name in conditions}

    for condition in conditions:
        score = sum_ans(source, condition, type)
        print(condition, score[1].tolist())
        for bucket, series in zip(overall[condition], score):
            bucket.extend(x * 20 for x in series.tolist())

    return {name: parts[1] for name, parts in overall.items()}


def g(source="pm"):
    """Mean scaled fake-question count per condition for one source.

    ``get_data_by_source`` is evaluated for the answer kinds 'true',
    'unsure' and 'false' (in that order); the source name and each kind are
    printed along the way.

    Returns a dict mapping each condition to a three-element list of means,
    ordered [true, unsure, false].
    """
    cases = {name: [] for name in conditions}

    print(source)
    for kind in ('true', 'unsure', 'false'):
        print(kind)
        per_condition = get_data_by_source(kind, source)
        for name in conditions:
            cases[name].append(np.array(per_condition[name]).mean())

    return cases

ordered_labels = [''] * 5

# X positions
x_pos = np.arange(len(conditions))
width = 0.25  # the width of the bars
multiplier = 0

# One [true, unsure, false] triple of per-condition lists per source:
# r0 <- pm, r1 <- drug, r2 <- shoplift.
r0, r1, r2 = [[], [], []], [[], [], []], [[], [], []]

# NOTE: from here on `pm`, `drug` and `shoplift` hold the per-condition
# means returned by g(), no longer the DataFrames loaded in the first cell.
pm = g("pm")
drug = g("drug")
shoplift = g("shoplift")

for c in conditions:
    for triple, per_condition in ((r0, pm), (r1, drug), (r2, shoplift)):
        for j in range(3):
            triple[j].append(per_condition[c][j])

r = [[np.array(values) for values in triple] for triple in (r0, r1, r2)]

for triple in (r0, r1, r2):
    print(triple)

fig, ax = plt.subplots(layout='constrained', figsize=(16, 6))

width = 0.28  # the width of the bars
multiplier = 0

colors = ["#6D668A", "#5AA09C", "#1392AB"]
colors_light = ["#CECCD4", "#CAD8D7", '#BCD5DA']

# One group of stacked bars per source, shifted sideways by `width`:
# the 'false' means (index 2) at the bottom, the 'unsure' means (index 1)
# stacked on top of them.
for i, x in enumerate([r0, r1, r2]):
    offset = width * multiplier
    bar_width = width - 0.015
    ax.bar(x_pos + offset, x[2], bar_width, color=colors[i])
    ax.bar(x_pos + offset, x[1], bar_width, bottom=x[2], color=colors_light[i])
    multiplier += 1

ax.set_xticks(x_pos + width, [''] * 5)
ax.set_ylim(0, 100)

plt.show()