# XAI Experiment Data Analysis - Journalist data

This notebook contains attempts at including the journalist data in the analysis, using Scheirer-Ray-Hare tests to compare groups and to check for interaction effects.

In [None]:
import json
import pandas as pd
import requests
import io
from urllib.request import urlopen
import json
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas.io.json import json_normalize
import pingouin as pg
import scikit_posthocs as sp

In [None]:
# Load the cleaned export and apply the three inclusion filters in sequence.
df = (
    pd.read_csv('data/data_cleaned.csv')
    # Qualification: rows need exactly 2 points; rows without a qualification
    # score (journalists, per the original note) are kept.
    .loc[lambda d: d["POINTS.qualification"].isna() | d["POINTS.qualification"].eq(2)]
    # Main task: rows need at least 5 points; rows without a main score
    # (journalists, per the original note) are kept.
    .loc[lambda d: d["POINTS.main"].isna() | d["POINTS.main"].ge(5)]
    # Merged task: require at least 7 points. Intended to remove journalists
    # with too few points; note that it is applied to every remaining row and
    # also drops rows whose POINTS value is missing.
    .loc[lambda d: d["POINTS"].ge(7)]
)

In [None]:
# Combined label "<FEATURE> journalist" / "<FEATURE> lay"; this column is the
# grouping variable of the interaction post-hoc test further down.
df["FEATURE:JOURNALIST"] = (
    df["FEATURE"]
    + " "
    + np.where(df["JOURNALIST"], "journalist", "lay")
)

In [None]:
# Number of rows per combined FEATURE / journalist-or-lay label after filtering.
df["FEATURE:JOURNALIST"].value_counts()

## Define functions

In [None]:
from scipy import stats

# from https://github.com/jpinzonc/Scheirer-Ray-Hare-Test
def _rank_residual_ss(ranks: np.ndarray, *indicator_blocks: np.ndarray) -> float:
    """Residual sum of squares of a least-squares fit of ``ranks`` on an intercept plus the given indicator columns."""
    design = np.column_stack([np.ones(len(ranks)), *indicator_blocks])
    # The design may be rank-deficient (intercept + full indicator coding);
    # lstsq still returns a least-squares solution, so the residuals are exact.
    coef = np.linalg.lstsq(design, ranks, rcond=None)[0]
    residuals = ranks - design @ coef
    return float(residuals @ residuals)


def srh(data: pd.DataFrame, dv: str, between1: str, between2: str) -> pd.DataFrame:
    """Scheirer-Ray-Hare test: a rank-based two-way ANOVA including the interaction.

    The dependent variable is replaced by its ranks (ties receive the average
    rank). Sums of squares are computed on the ranks, and every effect is
    tested with ``H = SS / MS_total`` against a chi-square distribution.

    The sums of squares come from comparing nested least-squares fits
    (Type II: each main effect is adjusted for the other main effect, the
    interaction is adjusted for both). For balanced designs this gives exactly
    the classical values; unlike formulas based on mean group sizes and the
    smallest cell count, it remains valid when the cells have different sizes.
    With unequal cell sizes the effect SS therefore need not add up to the
    total SS.

    Parameters
    ----------
    data : pd.DataFrame
        Input observations; the frame itself is not modified.
    dv : str
        Name of the dependent-variable column.
    between1, between2 : str
        Names of the two between-subject factor columns.

    Returns
    -------
    pd.DataFrame
        One row each for ``between1``, ``between2``, their interaction,
        ``Within`` and ``Total`` with the columns ``Source``, ``SS``, ``df``,
        ``MS``, ``H`` and ``p``.
    """
    # Work on complete observations only, so that the ranks, the sums of
    # squares and the degrees of freedom all refer to the same rows.
    data = data.dropna(subset=[dv, between1, between2]).copy()
    data["rank"] = data[dv].rank()

    ranks = data["rank"].to_numpy(dtype=float)
    n_obs = len(ranks)

    # One 0/1 column per factor level.
    levels1 = pd.get_dummies(data[between1]).to_numpy(dtype=float)
    levels2 = pd.get_dummies(data[between2]).to_numpy(dtype=float)
    # One 0/1 column per (level1, level2) cell; empty cells yield all-zero
    # columns, which do not affect the fit.
    cells = (levels1[:, :, None] * levels2[:, None, :]).reshape(n_obs, -1)
    n_cells = int((cells.sum(axis=0) > 0).sum())

    rss_only1 = _rank_residual_ss(ranks, levels1)
    rss_only2 = _rank_residual_ss(ranks, levels2)
    rss_additive = _rank_residual_ss(ranks, levels1, levels2)
    rss_cells = _rank_residual_ss(ranks, cells)

    # Differences of nested fits are non-negative in exact arithmetic;
    # clip tiny negative values caused by floating-point round-off.
    rows_ss = max(rss_only2 - rss_additive, 0.0)
    columns_ss = max(rss_only1 - rss_additive, 0.0)
    inter_ss = max(rss_additive - rss_cells, 0.0)
    within_ss = rss_cells

    # Total mean square = sample variance of the ranks (this accounts for ties).
    ms = np.float64(data["rank"].var())
    total_ss = ms * (n_obs - 1)

    h_rows = np.float64(rows_ss) / ms
    h_cols = np.float64(columns_ss) / ms
    h_int = np.float64(inter_ss) / ms

    df_rows = levels1.shape[1] - 1
    df_cols = levels2.shape[1] - 1
    # Equals df_rows * df_cols whenever every cell contains observations.
    df_int = n_cells - levels1.shape[1] - levels2.shape[1] + 1
    df_total = n_obs - 1
    df_within = n_obs - n_cells

    # Survival function instead of 1 - cdf: same value, better precision for small p.
    p_rows = stats.chi2.sf(h_rows, df_rows)
    p_cols = stats.chi2.sf(h_cols, df_cols)
    p_inter = stats.chi2.sf(h_int, df_int)

    # put the results in a dataframe
    results = pd.DataFrame({'Source': [between1, between2, f'{between1}:{between2}', 'Within', 'Total'],
                            'SS': [rows_ss, columns_ss, inter_ss, within_ss, total_ss],
                            'df': [df_rows, df_cols, df_int, df_within, df_total],
                            'MS': [np.nan, np.nan, np.nan, np.nan, total_ss / df_total],
                            'H': [h_rows, h_cols, h_int, np.nan, np.nan],
                            'p': [p_rows, p_cols, p_inter, np.nan, np.nan]})

    return results

In [None]:
def perform_srh_with_posthoc(df, col, print_results=True, highlight_p=True, between1="FEATURE", between2="JOURNALIST"):
    """Run the Scheirer-Ray-Hare test on ``col`` plus Dunn post-hoc tests.

    Dunn tests (Holm-adjusted) are run for ``between1``, for ``between2`` and
    for the combined ``"{between1}:{between2}"`` grouping column.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing ``col``, ``between1`` and ``between2``.
    col : str
        Dependent-variable column.
    print_results : bool
        When true, print the test tables and per-group mean/std summaries.
    highlight_p : bool
        Currently unused; kept so existing calls keep working.
    between1, between2 : str
        Factor columns. If ``df`` has no ``"{between1}:{between2}"`` column,
        one is built on a copy by joining the string forms of both factors
        with a space, so any factor pair can be analysed.

    Returns
    -------
    tuple
        ``(srh_test, posthoc1, posthoc2, posthoc_interaction)``.
    """
    interaction_col = f"{between1}:{between2}"
    if interaction_col not in df.columns:
        # assign() returns a new frame, so the caller's data stays untouched.
        df = df.assign(**{interaction_col: df[between1].astype(str) + " " + df[between2].astype(str)})

    srh_test = srh(df, dv=col, between1=between1, between2=between2)

    posthoc1 = sp.posthoc_dunn(df, val_col=col, group_col=between1, p_adjust="holm")
    posthoc2 = sp.posthoc_dunn(df, val_col=col, group_col=between2, p_adjust="holm")
    posthoc_interaction = sp.posthoc_dunn(df, val_col=col, group_col=interaction_col, p_adjust="holm")

    if print_results:
        print("Column name:", col)
        # Test tables first, then mean/std per grouping as descriptive context.
        for table in (srh_test, posthoc1, posthoc2, posthoc_interaction):
            print(table, "\n")
        for grouping in (between1, between2, [between1, between2]):
            print(df.groupby(grouping)[col].describe()[["mean", "std"]], "\n")

    return srh_test, posthoc1, posthoc2, posthoc_interaction

### Compare Human-AI performance among groups and layperson vs journalists (including interaction effects)

In [None]:
# Scheirer-Ray-Hare test and Dunn post-hoc tests for the human-AI accuracy.
srh_test, posthoc1, posthoc2, posthoc_i = perform_srh_with_posthoc(
    df,
    col="human_ai_accuracy",
    print_results=False,
)

In [None]:
# Scheirer-Ray-Hare table (SS, df, MS, H, p per source) for "human_ai_accuracy".
srh_test

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the FEATURE groups.
posthoc1

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the JOURNALIST groups.
posthoc2

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the combined FEATURE:JOURNALIST groups.
posthoc_i

### Compare change in performance among groups and layperson vs journalists (including interaction effects)

In [None]:
# Scheirer-Ray-Hare test and Dunn post-hoc tests for the change in accuracy.
srh_test, posthoc1, posthoc2, posthoc_i = perform_srh_with_posthoc(
    df,
    col="accuracy_change",
    print_results=False,
)

In [None]:
# Scheirer-Ray-Hare table (SS, df, MS, H, p per source) for "accuracy_change".
srh_test

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the FEATURE groups.
posthoc1

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the JOURNALIST groups.
posthoc2

In [None]:
# Descriptive statistics of the accuracy change, split by JOURNALIST.
df.groupby("JOURNALIST")["accuracy_change"].describe()

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the combined FEATURE:JOURNALIST groups.
posthoc_i

### Agreement percentage (v1 vs v2 vs v3) and layperson vs journalists (including interaction effects)

In [None]:
# Scheirer-Ray-Hare test and Dunn post-hoc tests for the agreement with the AI.
srh_test, posthoc1, posthoc2, posthoc_i = perform_srh_with_posthoc(
    df,
    col="agrees-with-ai",
    print_results=False,
)

In [None]:
# Scheirer-Ray-Hare table (SS, df, MS, H, p per source) for "agrees-with-ai".
srh_test

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the FEATURE groups.
posthoc1

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the JOURNALIST groups.
posthoc2

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the combined FEATURE:JOURNALIST groups.
posthoc_i

### Switch percentage (v1 vs v2 vs v3) and layperson vs journalists (including interaction effects)

In [None]:
# Scheirer-Ray-Hare test and Dunn post-hoc tests for the switch-to-AI measure.
srh_test, posthoc1, posthoc2, posthoc_i = perform_srh_with_posthoc(
    df,
    col="switched-to-ai",
    print_results=False,
)

In [None]:
# Scheirer-Ray-Hare table (SS, df, MS, H, p per source) for "switched-to-ai".
srh_test

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the FEATURE groups.
posthoc1

In [None]:
# Descriptive statistics of the switch-to-AI measure, split by FEATURE.
df.groupby("FEATURE")["switched-to-ai"].describe()

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the JOURNALIST groups.
posthoc2

In [None]:
# Descriptive statistics of the switch-to-AI measure, split by JOURNALIST.
df.groupby("JOURNALIST")["switched-to-ai"].describe()

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the combined FEATURE:JOURNALIST groups.
posthoc_i

### AI-preference (v1 vs v2 vs v3) and layperson vs journalists (including interaction effects)

In [None]:
# Scheirer-Ray-Hare test and Dunn post-hoc tests for the AI-preference measure.
srh_test, posthoc1, posthoc2, posthoc_i = perform_srh_with_posthoc(
    df,
    col="ai-preference",
    print_results=False,
)

# Show the Scheirer-Ray-Hare table as this cell's output.
srh_test

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the FEATURE groups.
posthoc1

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the JOURNALIST groups.
posthoc2

In [None]:
# Holm-adjusted Dunn post-hoc comparison between the combined FEATURE:JOURNALIST groups.
posthoc_i