# XAI Experiment Data Analysis

## Data Analysis todo list

- ✅ Import and preprocess data
- ✅ Demographics general overview 
- ✅ Performance (accuracy etc.)
  - ✅ AI vs Human vs Human-AI
  - ✅ Compare Human-AI performance among groups
  - ✅ Compare change in performance among groups
- ✅ Willingness to adjust judgments to match the AI system (Agreement percentage, switch percentage and AI preference)
  - ✅ Compare agreement percentage among groups
  - ✅ Compare switch percentage among groups
  - ✅ Compare AI preference among groups
- ⚙️ Effects of mistakes of the system (FP and FN)
  - ⚙️ Performance and willingness to adjust at FP and FN compared to other news items
  - ⚙️ Local evaluation metrics of FP and FN compared to other news items
- [ ] Analysis of open questions
  - [ ] What criteria do you usually use to judge whether a news/article is reliable?
  - [ ] What other information would you like to obtain to better assess the truthfulness of an article?
  - [ ] What functionality would be a good addition?

### Bonus analyses todo list
- [ ] Include journalists and interaction effects
  - [ ] Compare Human-AI performance 
  - [ ] Compare change in performance
  - [ ] Compare agreement percentage
  - [ ] Compare switch percentage
  - [ ] Compare AI preference

In [None]:
import json
import pandas as pd
import requests
import io
from urllib.request import urlopen
import json
import logging
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas.io.json import json_normalize
import pingouin as pg
import scikit_posthocs as sp

In [None]:
%reload_ext autoreload
%autoreload 2

## Import and preprocess data

### Read questionnaire data

In [None]:
df = pd.read_csv('data/data_cleaned.csv')

### Read news items

In [None]:
newsitems = pd.read_csv('../preprocessing/news-items.csv')

In [None]:
newsitems["id"] = newsitems.index + 1
newsitems.index = newsitems.index + 1

In [None]:
# only keep the relevant columns
newsitems = newsitems[["id", "label", "group",
                       "title", "subtitle", "content",
                       "source", "publishing_date", "category",
                       "highlighted_content", "truthfulness", "readability","natural_language_explanation",
                       "is_qualification"]]

In [None]:
# only keep newsitems from group 2 and newsitems without a group (FP and TN), 
# since only these are used in the experiment
newsitems = newsitems[(newsitems["group"] == 2) | newsitems["group"].isna()]
newsitems = newsitems.drop(columns=["group"])

In [None]:
# create a new column, "is_fake_news", based on if "label" is "FAKE" or "TRUE"
newsitems["is_fake_news"] = newsitems["label"].apply(lambda x: True if x == "FAKE" else False)
newsitems = newsitems.drop(columns=["label"])

In [None]:
# remove percentage sign from truthfulness and convert to int
newsitems["truthfulness"] = newsitems["truthfulness"].apply(lambda x: int(x.replace("%", "")))

In [None]:
newsitems.head()

### Filtering out assumed cheaters

We assume that participants who answered at least 5 of the 6 control questions correctly were paying attention and gave valid answers.

In [None]:
# participants who did not get 2 points in the qualification shouldn't be considered
# journalists don't have a qualification score, so they should be kept
df = df[(df["POINTS.qualification"].isna()) | (df['POINTS.qualification'] == 2)]

In [None]:
# journalists don't have a main score, so they should be kept
df = df[(df["POINTS.main"].isna()) | (df["POINTS.main"] >= 5)]

In [None]:
# filter out journalists who didn't reach enough points in the merged task
df = df[df["POINTS"] >= 7]

In [None]:
def lay(data):
    """Return the rows of ``data`` that belong to lay (non-journalist) participants.

    A row is kept when its ``JOURNALIST`` value compares equal to ``False``.
    """
    is_lay = data["JOURNALIST"].eq(False)
    return data.loc[is_lay]

In [None]:
# only lay participants are considered
df = lay(df)

## Demographics

In [None]:
# get columns which start with "demographics"
demographics_cols = [col for col in df.columns if col.startswith('demographics')]
demographics_cols
# country and nationality are not relevant, since the study was conducted in the US

In [None]:
df["demographics.age"].value_counts(normalize=True).sort_index() * 100

In [None]:
df["demographics.education"].value_counts(normalize=True) * 100
# university degree is the most common education level, this is surprising

In [None]:
df["demographics.employment"].value_counts(normalize=True) * 100

In [None]:
# Percentage of participants per income bracket, printed in ascending bracket
# order (value_counts alone would order the brackets by frequency).
income_vc = df["demographics.income"].value_counts(normalize=True)

income_brackets = [
    "less-than-20000-usd",
    "20000-34999-usd",
    "35000-49999-usd",
    "50000-74999-usd",
    "75000-99999-usd",
    "over-100000-usd",
    "no-answer",
]

for bracket in income_brackets:
    # .get() reports 0 instead of raising a KeyError when nobody chose a bracket
    print(bracket, income_vc.get(bracket, 0.0) * 100)

In [None]:
df["demographics.gender"].value_counts(normalize=True) * 100

## Define Kruskal-Wallis test function

In [None]:
def highlight(data, highlight_any=False, alpha=0.05):
    """Underline the numeric cells of ``data`` whose value is below ``alpha``.

    Args:
        data: DataFrame to style (for example a statistics result table).
        highlight_any: When False (default), only cells in the p-value columns
            "p-val", "p-unc", "p-corr", "p" and "pval" are considered.
            When True, cells in every column are considered.
        alpha: Threshold; a cell is underlined when its value is strictly
            smaller than this.

    Returns:
        The Styler returned by ``data.style.apply``.
    """
    p_value_columns = ["p-val", "p-unc", "p-corr", "p", "pval"]
    eligible_columns = data.columns if highlight_any else p_value_columns

    def _is_below_alpha(value):
        # bool is a subclass of int, so without this guard `False < alpha`
        # would underline every False flag when highlight_any=True.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False
        return value < alpha

    def _row_styles(row):
        return [
            "text-decoration: underline"
            if _is_below_alpha(value) and column in eligible_columns
            else ""
            for column, value in zip(row.index, row)
        ]

    return data.style.apply(_row_styles, axis=1)

In [None]:
def perform_kruskal_with_posthoc(df, col, print_results=True, between="FEATURE"):
    """Run a Kruskal-Wallis test on ``col`` across groups, plus pairwise post-hoc tests.

    Args:
        df: DataFrame holding the dependent variable and the grouping column.
        col: Name of the dependent-variable column.
        print_results: When True, print the Kruskal-Wallis p-value, the
            post-hoc table and the per-group mean/std.
        between: Name of the grouping column.

    Returns:
        A tuple ``(kruskal_test, posthoc, group_stats)`` where ``group_stats``
        holds the per-group count, mean and std of ``col``.
    """

    def _group_summary(columns):
        return df.groupby(between)[col].describe()[columns]

    omnibus = pg.kruskal(df, dv=col, between=between, detailed=True)
    omnibus_p = omnibus["p-unc"].values[0]

    # Non-parametric pairwise comparisons (Mann–Whitney U under the hood) with
    # Holm correction. Dunn's test (sp.posthoc_dunn) would be an alternative.
    pairwise = pg.pairwise_tests(
        df, dv=col, between=between, parametric=False, padjust="holm"
    )

    if print_results:
        print("Column name:", col)
        print("kruskal_pval", omnibus_p, "\n")
        print(pairwise, "\n")
        print(_group_summary(["mean", "std"]), "\n")

    return omnibus, pairwise, _group_summary(["count", "mean", "std"])

## Performance

### AI vs Human vs Human-AI

How does the AI system perform compared to humans and the combination of humans and AI? 

The task is to judge whether a news item is fake news or not, thus this is a binary classification problem. Since both the AI and participants are asked to rate the news items on a 0-100 scale, we use a threshold of 50, where ratings of <50 are treated as predicted fake news and ratings >=50 are treated as predicted truthful news.

The metrics for the AI are calculated over the 8 news items. Metrics are calculated for each participant separately over the presented news items (e.g. a participant correctly judged 6 out of 8 news items, thus the accuracy for the participant is 6/8 = 0.75). Human-AI performance is measured after presenting the AI rating to the participants, this is also calculated for each participant separately.

### Calculate performance metrics of the AI system

In [None]:
RATING_THRESHOLD = 50

newsitems["is_fake_news_pred"] = newsitems["truthfulness"] < RATING_THRESHOLD

In [None]:
newsitems["is_fake_news"].value_counts()

Since the dataset is unbalanced (5 fake news items and 3 truthful news items), we also use metrics that are more robust to unbalanced datasets than accuracy, such as precision, recall, F1-score and ROC AUC. Furthermore, a Brier score is calculated to measure the accuracy of the predicted probabilities.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, brier_score_loss, log_loss

ai_accuracy = accuracy_score(newsitems["is_fake_news"], newsitems["is_fake_news_pred"])
ai_precision = precision_score(newsitems["is_fake_news"], newsitems["is_fake_news_pred"])
ai_recall = recall_score(newsitems["is_fake_news"], newsitems["is_fake_news_pred"])
ai_f1 = f1_score(newsitems["is_fake_news"], newsitems["is_fake_news_pred"])
ai_roc_auc = roc_auc_score(newsitems["is_fake_news"], 1 - (newsitems["truthfulness"] / 100))
ai_brier_score = brier_score_loss(
    newsitems["is_fake_news"],
    newsitems["truthfulness"] / 100, 
    pos_label=0
)

ai_performance_df = pd.DataFrame({
  "accuracy": [ai_accuracy],
  "precision": [ai_precision],
  "recall": [ai_recall],
  "f1": [ai_f1],
  "roc_auc_score": [ai_roc_auc],
  "brier_score": [ai_brier_score]
})

ai_performance_df

### Calculate performance metrics of the participants and Human-AI teams

In [None]:
def _score_ratings(y_true, ratings):
    """Score one set of 0-100 truthfulness ratings against the true labels.

    Ratings below ``RATING_THRESHOLD`` count as a "fake news" prediction.

    Returns:
        A tuple ``(predicted_fake, metrics)``: the boolean predictions and a
        dict with accuracy, precision, recall, f1, roc_auc_score and brier_score.
    """
    ratings = np.asarray(ratings, dtype=float)
    predicted_fake = ratings < RATING_THRESHOLD
    truth_probability = ratings / 100

    metrics = {
        "accuracy": accuracy_score(y_true, predicted_fake),
        # zero_division=0 keeps undefined scores at 0 without emitting warnings
        "precision": precision_score(y_true, predicted_fake, zero_division=0),
        "recall": recall_score(y_true, predicted_fake, zero_division=0),
        "f1": f1_score(y_true, predicted_fake, zero_division=0),
        # "fake" is the positive class, so rank by 1 - truthfulness
        "roc_auc_score": roc_auc_score(y_true, 1 - truth_probability),
        # the ratings express truthfulness, hence pos_label=0 (not fake)
        "brier_score": brier_score_loss(y_true, truth_probability, pos_label=0),
    }
    return predicted_fake, metrics


def calculate_human_metrics(row):
    """Add per-participant human and Human-AI performance metrics to ``row``.

    Reads ``newsitem.{id}.rating-before-xai`` and ``newsitem.{id}.rating-after-xai``
    for every id in the index of the module-level ``newsitems`` frame, and
    compares them with ``newsitems["is_fake_news"]``.

    Args:
        row: One participant's row (as passed by ``DataFrame.apply(axis=1)``).

    Returns:
        The same row, extended with
        ``newsitem.{id}.rating-before-correct`` / ``rating-after-correct`` per
        news item, followed by the ``human_*`` and ``human_ai_*`` metrics.
    """
    y_true = newsitems["is_fake_news"]
    item_ids = list(y_true.index)

    ratings_before = [row[f"newsitem.{item_id}.rating-before-xai"] for item_id in item_ids]
    ratings_after = [row[f"newsitem.{item_id}.rating-after-xai"] for item_id in item_ids]

    predicted_before, human_metrics = _score_ratings(y_true, ratings_before)
    predicted_after, human_ai_metrics = _score_ratings(y_true, ratings_after)

    for position, item_id in enumerate(item_ids):
        # will be used later for comparing news items
        row[f"newsitem.{item_id}.rating-before-correct"] = y_true[item_id] == predicted_before[position]
        row[f"newsitem.{item_id}.rating-after-correct"] = y_true[item_id] == predicted_after[position]

    for metric_name, metric_value in human_metrics.items():
        row[f"human_{metric_name}"] = metric_value

    for metric_name, metric_value in human_ai_metrics.items():
        row[f"human_ai_{metric_name}"] = metric_value

    return row

df = df.apply(calculate_human_metrics, axis=1)

#### Compare AI vs Human vs Human-AI (without journalists)

To compare the performance of the AI system to the performance of the participants and Human-AI teams, we average the metrics across the participants.

In [None]:
# show mean human scores, mean human_ai scores and ai scores in one dataframe
human_scores = df[[
    "human_accuracy", 
    "human_precision", 
    "human_recall", 
    "human_f1", 
    "human_roc_auc_score",
    "human_brier_score",
]].copy()

human_ai_scores = df[[
    "human_ai_accuracy", 
    "human_ai_precision", 
    "human_ai_recall", 
    "human_ai_f1", 
    "human_ai_roc_auc_score",
    "human_ai_brier_score",
]].copy()

human_scores = human_scores.rename(columns=lambda x: x.replace("human_", ""))
human_ai_scores = human_ai_scores.rename(columns=lambda x: x.replace("human_ai_", ""))

human_scores = human_scores.mean()
human_ai_scores = human_ai_scores.mean()


In [None]:
scores_df = pd.DataFrame({
  "ai": ai_performance_df.iloc[0],
  "human": human_scores,
  "human_ai": human_ai_scores,
})

scores_df

In [None]:
scores_plot_df = pd.melt(
  scores_df.drop(["brier_score"]).reset_index(), 
  id_vars=["index"], 
  value_vars=["ai", "human", "human_ai"]
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

c = sns.catplot(x="variable", y="value", hue="index", data=scores_plot_df,  palette="muted", kind="bar")
plt.title("Performances of AI vs Humans vs Human-AI teams")
plt.ylabel("Score")
plt.xlabel("Model")
c._legend.set_title("Metric")

#### Human vs Human-AI

We use a Wilcoxon signed-rank test to determine if there is a significant difference in participants' performance before and after seeing the AI scores (and explanations).

In [None]:
wilcoxon_test = pg.wilcoxon(df["human_accuracy"], df["human_ai_accuracy"])
highlight(wilcoxon_test)

In [None]:
df[["human_accuracy", "human_ai_accuracy"]].describe().loc[["mean", "std"]]

There is a significant difference in accuracy before and after seeing the AI score (p < 0.01). The average human accuracy before is 0.730911. The average human accuracy after (= Human-AI accuracy) is 0.818350. (AI accuracy is 0.750000)

#### AI vs Human-AI

We use a Wilcoxon signed-rank test to determine if there is a significant difference between AI performance and Human-AI performance.

In [None]:
ai_accuracy_df = pd.DataFrame({"accuracy": [ai_performance_df["accuracy"][0]] * len(df)})


In [None]:
wilcoxon_test = pg.wilcoxon(ai_accuracy_df["accuracy"], df["human_ai_accuracy"])
highlight(wilcoxon_test)

In [None]:
print(f"Mean Human-AI accuracy: {df[['human_ai_accuracy']].mean().values[0]}")
print(f"Mean AI accuracy: {ai_performance_df[['accuracy']].mean().values[0]}")

The performance of Human-AI teams is significantly higher than that of the AI system alone (p < 0.01). The average AI accuracy is 0.750000. The average Human-AI accuracy is 0.818350.

### Compare Human-AI performance among groups (v1 vs v2 vs v3)

We use a Kruskal-Wallis H-test to determine if there is a significant difference in the accuracy of the Human-AI teams among the groups.

In [None]:
kruskal, pairwise, stats = perform_kruskal_with_posthoc(df, "human_ai_accuracy", print_results=False);

In [None]:
highlight(kruskal)

In [None]:
stats

A high p-value (0.851942) indicates that there is no significant difference in performance among the groups.

In [None]:
col = "human_ai_accuracy"

accuracy_data = df.groupby("FEATURE")[col].describe()[["mean", "std"]].reset_index()
accuracy_data["FEATURE"] = pd.Categorical(accuracy_data["FEATURE"], categories=["basic", "salient", "explanations"], ordered=True)
accuracy_data = accuracy_data.sort_values("FEATURE")

fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(x="FEATURE", y="mean", data=accuracy_data, ax=ax, palette="muted", width=.4)
ax.set_ylabel("Mean Human-AI accuracy")
ax.set_xlabel("XAI System Version")
ax.set_ylim(0, 1)
ax.errorbar(x=accuracy_data["FEATURE"], y=accuracy_data["mean"], yerr=accuracy_data["std"], fmt='none', c='black', capsize=5)
ax.set_xticklabels(["Version 1", "Version 2", "Version 3"])
ax.set_axisbelow(True)
ax.yaxis.grid(True, which='major', color='grey', alpha=.25)

### Compare the change in performance among groups (v1 vs v2 vs v3)

In [None]:
df["accuracy_change"] = df["human_ai_accuracy"] - df["human_accuracy"]

In [None]:
df["accuracy_change"].describe().loc[["count", "mean", "std"]]

On average, the accuracy of the participants increased by 0.087438 after seeing the AI score. We use a Kruskal-Wallis H-test to determine if there is a significant difference in the change in performance of the participants among the groups.

In [None]:
kruskal, pairwise, stats = perform_kruskal_with_posthoc(df, "accuracy_change", print_results=False);

In [None]:
highlight(kruskal)

In [None]:
stats

There is no significant difference in change in performance among the groups.

## Willingness to adjust judgments to match the AI system

We use three metrics to measure the willingness of the participants to adjust their judgments to match the AI system:
- Agreement percentage: percentage of news items in which the participant’s final prediction agreed with the AI’s prediction (adapted from [1] and [2])
- Switch percentage: percentage of news items in which the participant revised their predictions to match the model’s predictions (adapted from [1] and [2])
- AI-preference: a number between 0 and 1 that indicates the degree to which the participant prefers the AI’s prediction over their own original prediction. This is defined for individual news item ratings of one participant. It is defined as:

$$
ai\_preference = 
\begin{cases}
1 & \text{if } rating_{before} = ai\_rating \text{ and } ai\_rating = rating_{after} \\
0 & \text{if } rating_{before} = ai\_rating \text{ and } ai\_rating \neq rating_{after} \\
\min(1, \max(0, \frac{rating_{after} - rating_{before}}{ai\_rating - rating_{before}})) & \text{otherwise}
\end{cases}
$$

[1]: https://dl.acm.org/doi/10.1145/3290605.3300509 
[2]: https://dl.acm.org/doi/10.1145/3351095.3372852

### Calculate willingness to adjust metrics

In [None]:
def agrees_with_ai(row, newsitem_id):
    """Return whether the participant's final judgment matches the AI's prediction.

    Both the participant's rating after seeing the AI output and the AI's
    truthfulness score are binarised with ``RATING_THRESHOLD`` (below the
    threshold means "fake"), and the two binary judgments are compared.

    Args:
        row: One participant's row, holding ``newsitem.{id}.rating-after-xai``.
        newsitem_id: Id of the news item, looked up in ``newsitems["id"]``.
    """
    ai_rating = newsitems.loc[newsitems["id"] == newsitem_id, "truthfulness"].iloc[0]
    final_rating = row[f"newsitem.{newsitem_id}.rating-after-xai"]
    return (final_rating < RATING_THRESHOLD) == (ai_rating < RATING_THRESHOLD)

def switched_ratings(row, newsitem_id):
    """Return whether the participant revised their judgment to match the AI.

    True when the binarised rating (below ``RATING_THRESHOLD`` means "fake")
    changed between the before-XAI and after-XAI rating, and the after-XAI
    judgment equals the AI's binarised prediction.

    Args:
        row: One participant's row, holding ``newsitem.{id}.rating-before-xai``
            and ``newsitem.{id}.rating-after-xai``.
        newsitem_id: Id of the news item, looked up in ``newsitems["id"]``.
    """
    ai_rating = newsitems.loc[newsitems["id"] == newsitem_id, "truthfulness"].iloc[0]
    rating_before = row[f"newsitem.{newsitem_id}.rating-before-xai"]
    rating_after = row[f"newsitem.{newsitem_id}.rating-after-xai"]

    judgment_changed = (rating_before < RATING_THRESHOLD) != (rating_after < RATING_THRESHOLD)
    matches_ai = (rating_after < RATING_THRESHOLD) == (ai_rating < RATING_THRESHOLD)

    return judgment_changed and matches_ai

def calculate_ai_preference(row, newsitem_id):
    """Return how far the participant moved from their own rating towards the AI's.

    The result lies between 0 and 1:
    - 1 if the rating before already equalled the AI rating and stayed there,
    - 0 if the rating before equalled the AI rating but was changed afterwards,
    - otherwise (after - before) / (ai - before), clipped to the range 0..1.

    Args:
        row: One participant's row, holding ``newsitem.{id}.rating-before-xai``
            and ``newsitem.{id}.rating-after-xai``.
        newsitem_id: Id of the news item, looked up in ``newsitems["id"]``.
    """
    # Build the column names from the parameter; relying on a global loop
    # variable would silently read the wrong news item for other callers.
    rating_before = row[f"newsitem.{newsitem_id}.rating-before-xai"]
    rating_after = row[f"newsitem.{newsitem_id}.rating-after-xai"]
    ai_rating = newsitems[newsitems["id"] == newsitem_id]["truthfulness"].values[0]

    if rating_before == ai_rating:
        # the ratio below would divide by zero, so this case is defined separately
        return 1 if ai_rating == rating_after else 0

    relative_shift = (rating_after - rating_before) / (ai_rating - rating_before)
    return min(1, max(0, relative_shift))
    
for i in range(7, 15):
    df[f"newsitem.{i}.agrees-with-ai"] = df.apply(lambda row: agrees_with_ai(row, i), axis=1)
    df[f"newsitem.{i}.switched-to-ai"] = df.apply(lambda row: switched_ratings(row, i), axis=1)
    df[f"newsitem.{i}.ai-preference"] = df.apply(lambda row: calculate_ai_preference(row, i), axis=1)

In [None]:
agrees_with_ai_cols = [col for col in df.columns if col.startswith('newsitem') and col.endswith('agrees-with-ai')]
switched_to_ai_cols = [col for col in df.columns if col.startswith('newsitem') and col.endswith('switched-to-ai')]
ai_preference_cols = [col for col in df.columns if col.startswith('newsitem') and col.endswith('ai-preference')]

# calculate average AI preference over all newsitems
df["agrees-with-ai"] = df[agrees_with_ai_cols].mean(axis=1)
df["switched-to-ai"] = df[switched_to_ai_cols].mean(axis=1)
df["ai-preference"] = df[ai_preference_cols].mean(axis=1)

We use Kruskal-Wallis H-tests to determine if there is a significant difference in the willingness of the participants to adjust their judgments among the groups (based on the three metrics). When the test shows a significant difference, we use Mann-Whitney U tests with a Holm correction to determine which groups differ significantly.

### Agreement percentage (v1 vs v2 vs v3)

In [None]:
kruskal, posthoc, stats = perform_kruskal_with_posthoc(df, "agrees-with-ai", print_results=False);

In [None]:
highlight(kruskal)

In [None]:
highlight(posthoc)

In [None]:
stats

There is a significant difference in **agreement percentage** among the groups (Kruskal-Wallis H-test, p=0.005217), the posthoc tests show that the agreement percentage is significantly higher in v3 than in v1 (Mann-Whitney U test, p=0.013883) or v2 (Mann-Whitney U test, p=0.010407). There is no significant difference between the agreement percentage of groups v1 and v2 (Mann-Whitney U test, p=0.864163).

### Switch percentage (v1 vs v2 vs v3)

In [None]:
kruskal, posthoc, stats = perform_kruskal_with_posthoc(df, "switched-to-ai", print_results=False);

In [None]:
highlight(kruskal)

In [None]:
highlight(posthoc)

In [None]:
stats

There is a significant difference in **switch percentage** among the groups (Kruskal-Wallis H-test, p=0.006382), the posthoc tests show that switch percentage is significantly higher in v3 than in v2 (Mann-Whitney U test, p=0.004753). There is no significant difference between the switch percentage of groups v1 and v2 (Mann-Whitney U test, p=0.198357) or v1 and v3 (Mann-Whitney U test, p=0.198357).

### AI-preference (v1 vs v2 vs v3)

In [None]:
kruskal, posthoc, stats = perform_kruskal_with_posthoc(df, "ai-preference", print_results=False);

In [None]:
highlight(kruskal)

In [None]:
highlight(posthoc)

In [None]:
stats

There is a significant difference in **AI-preference** among the groups (Kruskal-Wallis H-test, p=0.00012), the posthoc tests show that AI-preference is significantly higher in v3 than in v1 (Mann-Whitney U test, p=0.011090) or v2 (Mann-Whitney U test, p=0.000113). There is no significant difference between the AI-preference of groups v1 and v2 (Mann-Whitney U test, p=0.125494).

In [None]:
pg.cronbach_alpha(df[["agrees-with-ai", "switched-to-ai", "ai-preference"]])

As the three metrics aim to measure similar concepts, we calculate Cronbach's alpha to determine if the metrics are consistent. The Cronbach's alpha is 0.818663, which indicates that the metrics are consistent.

In [None]:
ai_pref_data = df.groupby("FEATURE")["ai-preference"].describe()[["mean", "std"]].reset_index()
ai_pref_data["FEATURE"] = pd.Categorical(ai_pref_data["FEATURE"], categories=["basic", "salient", "explanations"], ordered=True)
ai_pref_data = ai_pref_data.sort_values("FEATURE")

fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(x="FEATURE", y="mean", data=ai_pref_data, ax=ax, palette="muted", width=.4)
ax.set_ylabel("Mean AI preference")
ax.set_xlabel("XAI Explanation type")
ax.set_ylim(0, 1)
ax.errorbar(x=ai_pref_data["FEATURE"], y=ai_pref_data["mean"], yerr=ai_pref_data["std"], fmt='none', c='black', capsize=5)
ax.set_xticklabels(["Version 1", "Version 2", "Version 3"])
ax.set_axisbelow(True)
ax.yaxis.grid(True, which='major', color='grey', alpha=.25)

## Effects of mistakes of the system

The experiment was designed to intentionally include mistakes of the AI system. We want to investigate if the participants are able to detect these mistakes, if they are willing to adjust their judgments to match the AI system and how they rate the AI-system when looking at the mistakes.

Two types of mistakes are included in the experiment:
- **False positive**: the AI system predicts a news item to be fake, while it is actually truthful (newsitem 14)
- **False negative**: the AI system predicts a news item to be truthful, while it is actually fake (newsitem 13)

In [None]:
newsitems["is_mistake"] = newsitems["is_fake_news"] != newsitems["is_fake_news_pred"]
newsitems["is_false_positive"] = (newsitems["is_fake_news"] == False) & (newsitems["is_fake_news_pred"] == True)
newsitems["is_false_negative"] = (newsitems["is_fake_news"] == True) & (newsitems["is_fake_news_pred"] == False)
newsitems["mistake_type"] = newsitems.apply(lambda row: "FP" if row["is_false_positive"] else "FN" if row["is_false_negative"] else "TP" if row["is_fake_news"] else "TN", axis=1)

In [None]:
pd.set_option('display.max_colwidth', 200)

newsitems[(newsitems["id"] == 13) | (newsitems["id"] == 14)][["title", "subtitle", "content", "truthfulness", "is_fake_news", "is_fake_news_pred", "is_mistake", "is_false_positive", "is_false_negative", "mistake_type"]]

In [None]:
# Stack the per-newsitem columns into long format: one row per
# (participant, newsitem) pair, with the "newsitem.{i}." prefix stripped.
per_item_frames = []

for i in range(7, 15):
    newsitem_cols = [
        f"newsitem.{i}.system-evaluation.classified-correctly",
        f"newsitem.{i}.system-evaluation.explanations-comprehensible-and-help-assess",
        f"newsitem.{i}.system-evaluation.indications-useful",
        f"newsitem.{i}.system-evaluation.understand-what-system-does",
        f"newsitem.{i}.system-evaluation.xai-features-useful",
        f"newsitem.{i}.rating-before-correct",
        f"newsitem.{i}.rating-after-correct",
        f"newsitem.{i}.ai-preference",
        f"newsitem.{i}.switched-to-ai",
        "FEATURE",
    ]
    per_item_frames.append(
        df[newsitem_cols]
        .rename(columns={col: col.split(".")[-1] for col in newsitem_cols})
        .assign(newsitem=i)
    )

# A single concat avoids re-copying the growing frame on every iteration and
# the deprecated concatenation with an empty DataFrame.
newsitem_eval_df = pd.concat(per_item_frames)

# move the last two columns ("FEATURE" and "newsitem") to the front
cols = newsitem_eval_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
newsitem_eval_df = newsitem_eval_df[cols]

In [None]:
# join newsitem_eval_df with newsitems to get the "is_mistake" column
newsitem_eval_df = newsitem_eval_df.merge(newsitems[["id", "is_fake_news", "is_mistake", "is_false_positive", "is_false_negative", "mistake_type"]], left_on="newsitem", right_on="id").drop(columns=["id"])

measures:
- on newsitem level: accuracy, change in accuracy, agreement percentage, switch percentage
- on single rating level: ai-preference, local evaluation metrics

### Is rating the newsitems equally difficult?

We use a chi-squared test to determine if there is a significant difference in correct and incorrect ratings among the news items. We look at ratings before seeing the AI score. 

In [None]:
# perform chi2 test
expected, observed, stats = pg.chi2_independence(newsitem_eval_df, x='newsitem', y='rating-before-correct')

In [None]:
highlight(stats)

The low p values (<0.01) show us that the newsitems are unequally difficult to rate.

### Accuracy on newsitems before and after seeing the AI score

In [None]:
# calculate average performance over newsitems
performance_df = newsitem_eval_df.groupby(["newsitem"])[["rating-before-correct", "rating-after-correct"]].mean().reset_index()

In [None]:
performance_df["accuracy-diff"] = performance_df["rating-after-correct"] - performance_df["rating-before-correct"]

In [None]:
# join newsitems with performance_df to add the rating-before-correct and rating-after-correct columns
newsitems = newsitems.merge(performance_df, left_on="id", right_on="newsitem").drop(columns=["newsitem"])

In [None]:
# visualize performance

newsitems = newsitems.sort_values("rating-before-correct")

# transform data to long format
data = newsitems[["id", "rating-before-correct", "rating-after-correct"]].melt(id_vars=["id"], var_name="measure", value_name="rating")
sorted_ids = data[data["measure"] == "rating-before-correct"].sort_values("rating", ascending=False)["id"].unique()

fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(x="id", y="rating", 
  hue="measure", 
  data=data, 
  ax=ax, 
  palette="muted", 
  order=sorted_ids)

# rename legend labels
handles, labels = ax.get_legend_handles_labels()
# put legend outside of plot
ax.legend(handles=handles, labels=["Before XAI", "After XAI"], title="Accuracy", bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)


ax.set_ylabel("Accuracy before and after XAI")
ax.set_xlabel("Newsitem")

# set the ticks belonging to ids 13 and 14 to red
for tick in ax.get_xticklabels():
    if int(tick.get_text()) in [13, 14]:
        tick.set_color("red")

ticks = [f"{_id}\n{newsitems[newsitems['id'] == _id].mistake_type.values[0]}" for _id in sorted_ids]
ax.set_xticklabels(ticks)
ax.set_axisbelow(True)
ax.yaxis.grid(True, which='major', color='grey', alpha=.25)


The graph shows that, for every news item, the accuracy of the participants increased after seeing the AI score, except for the two items on which the system made a mistake, where it decreased.

Next we look at the change in accuracy.

In [None]:
# Bar colours, also reused by later plots.
blue = "#4C72B0"
red = "#DB5F57"

# Sort once so that the bar order, tick labels and tick colours stay in sync.
items_by_diff = newsitems.sort_values("accuracy-diff", ascending=False)

fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(x="id", y="accuracy-diff", data=newsitems, ax=ax, palette="muted", width=.5, order=items_by_diff["id"])
ax.set_ylabel("Accuracy difference after and before XAI")
ax.set_xlabel("Newsitem")

ticks = [f"{item['id']}\n{item['mistake_type']}" for _, item in items_by_diff.iterrows()]
ax.set_xticklabels(ticks)

# set the colors of the bars based on their value
for bar in ax.patches:
    bar.set_color(blue if bar.get_height() > 0 else red)

# Mark the AI mistakes in red via is_mistake rather than assuming they are
# always the last two bars in the sort order.
for tick, is_mistake in zip(ax.get_xticklabels(), items_by_diff["is_mistake"]):
    if is_mistake:
        tick.set_color("red")

ax.set_axisbelow(True)
ax.yaxis.grid(True, which='major', color='grey', alpha=.25)


In [None]:
newsitems.groupby("is_mistake")["accuracy-diff"].describe()

In [None]:
mwu_test = pg.mwu(
    newsitems[newsitems["is_mistake"] == True]["accuracy-diff"],
    newsitems[newsitems["is_mistake"] == False]["accuracy-diff"]
)

highlight(mwu_test)

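The Mann-Whitney U test shows no significant difference in the change in accuracy between news items where the AI made a mistake and news items where it did not (p=0.071429). **This test might not be meaningful because of the low number of items (2 mistakes, 6 no mistakes): with groups of 2 and 6 items the smallest attainable exact two-sided p-value is 2/28 ≈ 0.0714, so it cannot reach significance at the 0.05 level.**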

We perform a chi-squared test to determine whether the proportion of correct ratings after seeing the AI score differs between news items where the AI made a mistake and news items where it did not.

In [None]:
# perform chi2 test
expected, observed, stats = pg.chi2_independence(newsitem_eval_df, x='is_mistake', y='rating-after-correct')

In [None]:
highlight(stats)

In [None]:
newsitem_eval_df.groupby("is_mistake")["rating-after-correct"].mean()

### Define MWU

In [None]:
def perform_mwu(df, dv, between):
    """Run a Mann-Whitney U test on ``dv`` between the two groups of ``between``.

    Args:
        df: DataFrame holding both columns.
        dv: Name of the numeric dependent-variable column.
        between: Name of the grouping column; must have exactly two unique values.

    Returns:
        A tuple ``(mwu_test, stats)``: the pingouin result table and the
        per-group count, mean and std of ``dv``.

    Raises:
        ValueError: If ``between`` does not have exactly two unique values, or
            if the first value of ``dv`` is not numeric.
    """
    # get both possible values of the between variable
    groups = df[between].unique()

    if len(groups) != 2:
        raise ValueError("The between variable should have exactly two possible values")

    # Check that dv is numeric. np.number is required in addition to the
    # builtin types: NumPy integer scalars (e.g. from an int64 column) are not
    # instances of int and would otherwise be rejected.
    first_value = df[dv].values[0]
    if not isinstance(first_value, (int, float, complex, np.number)):
        raise ValueError("The dependent variable should be numeric")

    first_group = df[df[between] == groups[0]][dv]
    second_group = df[df[between] == groups[1]][dv]
    mwu_test = pg.mwu(first_group, second_group)

    stats = df.groupby(between)[dv].describe()[["count", "mean", "std"]]

    return mwu_test, stats

### AI-preference on newsitems

In [None]:
ai_preference_means = newsitem_eval_df.groupby("newsitem")["ai-preference"].mean().reset_index()

In [None]:
# Bar chart of the mean AI-preference per news item, highest first.
sorted_ids = ai_preference_means.sort_values("ai-preference", ascending=False)["newsitem"].unique()

fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(
  x="newsitem", 
  y="ai-preference", 
  data=ai_preference_means, 
  ax=ax, 
  palette="muted", 
  width=.5, 
  order=sorted_ids
)

ticks = [f"{_id}\n{newsitems[newsitems['id'] == _id].mistake_type.values[0]}" for _id in sorted_ids]
ax.set_xticklabels(ticks)

ax.set_ylabel("AI-preference")
ax.set_xlabel("Newsitem")

# set the colors of the bars based on their value
for bar in ax.patches:
    bar.set_color(blue if bar.get_height() > 0 else red)

# Mark the AI mistakes in red via is_mistake rather than assuming they have
# the two lowest AI-preference values.
mistake_ids = set(newsitems.loc[newsitems["is_mistake"], "id"])
for tick, _id in zip(ax.get_xticklabels(), sorted_ids):
    if _id in mistake_ids:
        tick.set_color("red")

ax.set_axisbelow(True)
ax.yaxis.grid(True, which='major', color='grey', alpha=.25)

In [None]:
mwu, stats = perform_mwu(newsitem_eval_df, dv="ai-preference", between="is_mistake")

In [None]:
highlight(mwu)

In [None]:
stats

### Local evaluation of newsitems

In [None]:
local_eval_cols = [
    'classified-correctly', # The AI-System classified the news items correctly
    'explanations-comprehensible-and-help-assess', # The presented explanations are comprehensible and help me with assessing the news articles
    'indications-useful', # The indications given by the AI-System are useful to assess the truthfulness of the news article
    'understand-what-system-does', # I understand what the AI-System does
    'xai-features-useful' # The explainability features presented are useful to assess the truthfulness of the news article
]

In [None]:
local_eval_means = newsitem_eval_df.groupby('newsitem')[local_eval_cols].mean().reset_index()

In [None]:
chart_data = local_eval_means.melt(id_vars=["newsitem"], var_name="measure", value_name="rating")

In [None]:
sorted_ids = chart_data[chart_data["measure"] == "classified-correctly"].sort_values("rating", ascending=False)["newsitem"].unique()

fig, ax = plt.subplots()
sns.barplot(
  x="newsitem", 
  y="rating", 
  hue="measure", 
  data=chart_data, 
  ax=ax, 
  palette="muted",
  order=sorted_ids
  )

# put legend outside of the plot
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

ax.set_ylabel("Mean local evaluation metrics")
ax.set_xlabel("Newsitem")

# set y scale to 1-7
ax.set_ylim(1, 7)

# set the ticks belonging to ids 13 and 14 to red
for tick in ax.get_xticklabels():
    if int(tick.get_text()) in [13, 14]:
        tick.set_color("red")

ticks = [f"{_id}\n{newsitems[newsitems['id'] == _id].mistake_type.values[0]}" for _id in sorted_ids]
ax.set_xticklabels(ticks)

ax.set_axisbelow(True)
ax.yaxis.grid(True, which='major', color='grey', alpha=.25)


In [None]:
summary_data = []

for col in local_eval_cols:
    mwu, stats = perform_mwu(newsitem_eval_df, dv=col, between="is_mistake")
    summary_data.append({
        "measure": col,
        "mwu": mwu["U-val"].values[0],
        "mwu-p": mwu["p-val"].values[0],
        "mean-no-mistake": stats.loc[False, "mean"],
        "std-no-mistake": stats.loc[False, "std"],
        "mean-mistake": stats.loc[True, "mean"],
        "std-mistake": stats.loc[True, "std"],
    })

summary_df = pd.DataFrame(summary_data)

In [None]:
highlight(summary_df, highlight_any=True)

## Analysis of open questions

In [None]:
# TODO