# Experiment plots

Executing this notebook requires reading `.png` files that illustrate the summaries of the experiment's trials. Each trial's summary is given in `experiments/experiment1/trials/{trial_number}/summary.png`, and the joint summary of each pair of twin trials is given in `experiments/experiment1/trials/{trial_number}/summary_diff.png`, where `{trial_number}` is the odd trial number of the pair. For example, `experiments/experiment1/trials/1/summary_diff.png` contains a joint summary of trials 1 and 2. 

The generated plots can be found in the `figures` directory.

In [1]:
import os
# Change the working directory to `../../code`, relative to wherever the kernel was started.
# NOTE(review): presumably this is what lets `from python import utils` resolve in the next cell — confirm.
# The path is relative, so re-running this cell moves the working directory again.
os.chdir("../../code")

In [2]:
import numpy as np
import warnings
# Suppress FutureWarning messages; the filter is installed before the remaining libraries are imported.
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns
import glob
import json
import scipy.stats as stats
# Global seaborn theme used by every figure generated in this notebook.
sns.set_theme(context='paper', style='ticks', font_scale=1)

# Project-local helpers; this notebook calls utils.latexify() and utils.get_fig_dim().
from python import utils

In [3]:
# Experiment identifier; interpolated into the path of the trials CSV.
name="experiment1"
# Width passed to utils.get_fig_dim() when sizing figures.
# NOTE(review): presumably the LaTeX text width in points — confirm against utils.
width_pt = 469
# Seed of the random generator below, which draws the horizontal jitter in the twin-trial plots.
seed = 42
rng = np.random.default_rng(seed=seed)
# Colors used for human-related and AI-related plot elements, respectively.
humancol = "#762c6c"
aicol = "#ffa000"
# Five-color husl palette; individual entries are picked for the point and violin plots.
palette = sns.color_palette('husl', 5)

The following block reads the responsibility judgment data collected from the experiment's participants.

In [4]:
# The paths used from here on (`./data/...`, `code/...`, `figures/...`) are relative to the
# parent of `code/`. Only step up while the working directory is still `code/`: an
# unconditional relative `os.chdir("../")` would climb one level higher every time this
# cell is re-run and silently break every later read and write.
if os.path.basename(os.getcwd()) == "code":
    os.chdir("../")

file = glob.glob("./data/{name}/counterfactual_sequential_exp1-trials.csv".format(name=name))
if not file:
    # Fail with an actionable message instead of an IndexError on `file[0]`.
    raise FileNotFoundError(
        "No trials CSV found under ./data/{name}/ (working directory: {cwd})".format(name=name, cwd=os.getcwd())
    )
df = pd.read_csv(file[0])

### Difference of human responsibility per pair of twin trials (Figure 2)

The following blocks generate plots that illustrate the difference in responsibility judgments about the human in each pair of twin trials.

In [28]:
# Odd trial numbers 1, 3, ..., 15; each one is paired with the even trial that follows it.
same_cf_trials = range(1, 16, 2)

# Human judgments only, without trial 0, ordered by participant and trial.
diff_df = (
    df.loc[df['trial'] != 0, ['workerid', 'trial', 'human']]
    .sort_values(by=['workerid', 'trial'])
    .copy()
)

is_odd_trial = diff_df['trial'].isin(list(same_cf_trials))
# 'CF' is 1 for the odd trial of a pair and 0 for the even one.
diff_df['CF'] = is_odd_trial.astype('int64')
# Label both trials of a pair with the odd trial number (even trials get the previous number).
diff_df['trial'] = diff_df['trial'].where(is_odd_trial, diff_df['trial'] - 1)

In [29]:
trial_ids = diff_df['trial'].unique()

# Create the output directory up front so that `fig.savefig` below cannot fail on a fresh checkout.
os.makedirs('figures/experiment1', exist_ok=True)

for trial in trial_ids:
    # keep only the judgments that belong to the current pair of twin trials
    temp_df = diff_df[diff_df['trial'] == trial].copy()
    # add horizontal jitter around x=0 / x=1 so that individual judgments do not overlap
    temp_df['CF_jitter'] = (temp_df['CF']).astype(float) + rng.uniform(-0.1, 0.1, len(temp_df))

    utils.latexify() # Computer Modern, with TeX
    fig_width, fig_height = utils.get_fig_dim(width_pt, fraction=0.6)
    # two stacked panels: a square one for the summary image and a shorter one for the judgments
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(fig_width, 0.5*fig_height+fig_width), gridspec_kw={'height_ratios': [fig_width, 0.5*fig_height]})

    plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=0.985, wspace=0.0, hspace=0.0)

    # draw summary image axis
    img = imread('./code/experiments/experiment1/trials/{trial}/summary_diff.png'.format(trial=trial))
    axes[0].imshow(img)
    axes[0].axis('off')

    # shrink the lower panel horizontally and keep it centered
    left, bottom, width, height = axes[1].get_position().bounds
    padding = 0.2
    new_left = left + padding / 2
    new_width = width - padding
    axes[1].set_position([new_left, bottom, new_width, height])

    # draw plot axis: one light gray line per participant connecting their two judgments
    sns.lineplot(data=temp_df, x='CF_jitter', y='human', hue='workerid', marker='o',
                  palette=['lightgray'] * len(temp_df['workerid'].unique()), alpha=.4, linewidth=0.9, legend=False, ax=axes[1], zorder=1)
    sns.despine(ax=axes[1])

    # push the individual lines behind the mean markers drawn next
    plt.setp(axes[1].lines, zorder=0)
    # add mean points (with seaborn's default error bars) for each counterfactual outcome
    sns.pointplot(data=temp_df, x='CF', y='human', join=False, capsize=0.1,
                  palette=[palette[0], palette[2]], markers=['o', 'o'], ax=axes[1], scale = 1.3)

    axes[1].set_ylim([-5,105])
    axes[1].set_xlim([-0.4, 1.4])
    # only the figure of trial 7 carries the y-axis label
    if trial == 7:
        axes[1].set_ylabel("Human responsibility")
    else:
        axes[1].set_ylabel("")
    axes[1].set_xlabel("Counterfactual outcome")
    axes[1].set_xticks([0, 1])
    axes[1].set_xticklabels(['Failure', 'Success'])

    fig.savefig('figures/experiment1/human_diff_trial_{trial}.pdf'.format(trial=trial), dpi=300)
    # close this specific figure so that figures do not accumulate in memory across iterations
    plt.close(fig)

### Effect of the human's decision quality on responsibility judgments (Figure 3)

The following blocks generate a plot that illustrates the distribution of the difference in responsibility judgments $\Delta_H$ about the human in twin trials where the human made a right and wrong decision, respectively. This is Figure 3(b) in the paper.

In [30]:
# Collapse each (participant, pair) to a single row holding the first judgment minus the second.
# `diff()` yields second - first on the second row of every group (NaN on the first row),
# hence the negation; `dropna()` then discards the first row of every pair.
within_pair_change = diff_df.groupby(['workerid', 'trial'])['human'].diff()
diff_df = (
    diff_df.assign(diff=-within_pair_change)
    .dropna()
    .drop(columns=['human', 'CF'])
)
# Pairs 1, 5, 9 and 13 are labelled 'right'; every other pair is labelled 'wrong'.
diff_df['right'] = diff_df['trial'].isin([1, 5, 9, 13]).map({True: 'right', False: 'wrong'})

In [31]:
utils.latexify() # Computer Modern, with TeX
fig_width, fig_height = utils.get_fig_dim(width_pt, fraction=0.45)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(fig_width, fig_height))

# draw split violin of diff for right and wrong trials
trick_df = diff_df.copy()
# encode the decision quality numerically: right -> 0, wrong -> 1
# (`map` yields an integer column directly, without relying on `replace`'s implicit downcasting)
trick_df['decision'] = trick_df['right'].map({'right': 0, 'wrong': 1})
# constant y value so that both halves are drawn on a single horizontal violin
trick_df['dummy'] = 0
sns.violinplot(data=trick_df, x='diff', hue='decision', ax=ax, palette=[palette[1], palette[3]], inner=None, split=True, orient='h', y='dummy')

# select the two samples once and reuse them for the means, intervals and printout
diff_right = trick_df.loc[trick_df['decision'] == 0, 'diff']
diff_wrong = trick_df.loc[trick_df['decision'] == 1, 'diff']
mean_right = diff_right.mean()
mean_wrong = diff_wrong.mean()

# show the two means as vertical lines inside the violinplots
# (ymin/ymax are axes fractions: 'wrong' spans the lower half, 'right' the upper half)
ax.axvline(x=mean_wrong, ymin=0, ymax=0.5, color='#4c4c4c', linestyle='--')
ax.axvline(x=mean_right, ymin=0.5, ymax=1, color='#4c4c4c', linestyle='--')
# compute 95% confidence intervals for the two means (Student's t, n-1 degrees of freedom)
ci_right = stats.t.interval(0.95, len(diff_right)-1, loc=mean_right, scale=stats.sem(diff_right))
ci_wrong = stats.t.interval(0.95, len(diff_wrong)-1, loc=mean_wrong, scale=stats.sem(diff_wrong))
# print the two means and their confidence intervals
print("right: ", mean_right, ci_right)
print("wrong: ", mean_wrong, ci_wrong)
# add the 95% confidence intervals as ribbons, each restricted to its own half of the violin
ax.fill_betweenx([0, -0.5], ci_right[0], ci_right[1], color='#4c4c4c', alpha=0.3, linewidth=0.0)
ax.fill_betweenx([0, 0.5], ci_wrong[0], ci_wrong[1], color='#4c4c4c', alpha=0.3, linewidth=0.0)

sns.despine(ax=ax)

ax.set_xlabel(r"Difference in human responsibility, $\Delta_H$")
ax.set_ylabel("Switching decision")
# relabel the legend entries of hue levels 0 and 1
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ['Right', 'Wrong'], title=None, loc='upper right', bbox_to_anchor=(1, 1.1))
ax.set_yticks([])

fig.tight_layout()
fig.savefig('figures/experiment1/human_diff_good_bad.pdf', dpi=300)
plt.close(fig)

right:  2.81 (-1.52907883240101, 7.14907883240101)
wrong:  11.605 (7.242689165947417, 15.967310834052583)


The following blocks generate a plot that shows the average responsibility attributed to the human and the AI across trials where the human made a right and wrong decision, respectively. This is Figure 3(a) in the paper.

In [33]:
# Trials coded as decision 0; every other trial is coded as decision 1.
right_decision_trials = [1, 2, 5, 6, 9, 10, 13, 14]

# Human and AI judgments without trial 0, ordered by participant and trial.
judgments = (
    df.loc[df['trial'] != 0, ['workerid', 'trial', 'human', 'ai']]
    .sort_values(by=['workerid', 'trial'])
)

# Long format: one row per (participant, trial, agent) with the judgment in 'resp'.
good_df = (
    judgments
    .assign(decision=(~judgments['trial'].isin(right_decision_trials)).astype('int64'))
    .melt(id_vars=['workerid','trial','decision'], value_vars=['ai', 'human'], var_name='agent', value_name='resp')
)

In [34]:
utils.latexify() # Computer Modern, with TeX
fig_width, fig_height = utils.get_fig_dim(width_pt, fraction=0.45)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(fig_width, fig_height))

# mean AI and human judgments per decision type, with 95% confidence-interval error bars
bar_options = dict(x='decision', y='resp', hue='agent', palette=[aicol, humancol], errorbar=('ci', 95), alpha=0.7)
sns.barplot(data=good_df, ax=ax, **bar_options)
sns.despine(ax=ax)

# axis decoration
ax.set(xlabel="Switching decision", ylabel="Avg. responsibility")
ax.set_xticklabels(['Right', 'Wrong'])
ax.set_ylim([0, 115])

# two-column legend titled 'Agent', with the hue levels relabelled as 'AI' and 'Human'
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ['AI', 'Human'], title='Agent', bbox_to_anchor=(0.0, 1.1), loc='upper left', ncol=2)

fig.tight_layout()
fig.savefig('figures/experiment1/ai_human_good_bad.pdf', dpi=300)
plt.close()


### Evaluation of responsibility models (Figure 4)

The following block reads the logs of the Monte Carlo simulations performed in the grid of each trial and saves to a `.csv` the quantities required for the upcoming analysis.

**Requirement:** The code below assumes that the simulations have already been performed. They can be run using the scripts `generate_counterfactual_decisions.sh`, `generate_counterfactual_episodes.sh` and `count_factual_path_length.sh` found in `code/bash/`. The logs for each grid (`world`) are stored in `code/resources/episodes/{world}/`.

**Output:** For each combination of trial, scaler $\tau$, and scaler $\theta$, the `csv` contains the counterfactual probability of success without the AI (`prob_succ_no_ai`), the counterfactual probability of success with a different switching decision made by the human (`prob_succ_other_decision`), the probability of making the non-observed switching decision (`prob_other_decision`) and the step count that the human (`human_count`) and the AI (`ai_count`) drove respectively.

In [35]:
# read simulation reports
worlds=["world43", "world44", "world45", "world46", "world47", "world48", "world57", "world58", "world49", "world50", "world55", "world56", "world51", "world52", "world53", "world54"]
trials=["trial1", "trial2", "trial3", "trial4", "trial5", "trial6", "trial7", "trial8", "trial9", "trial10", "trial11", "trial12", "trial13", "trial14", "trial15", "trial16"]
simscalers=[0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]     # this is the policy temperature
softscalers=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]    # this is the switching decision temperature
# trials whose counterfactual decision report is the "no switch" one (`cfnoreport`);
# all other trials use the "switch" one (`cfyesreport`)
switch_episodes = [5,6,7,8,13,14,15,16]
# NOTE(review): presumably the trials in which the AI is the initial driver — confirm
ai_starts = [1,2,3,4,5,6,7,8]

# worlds[i] holds the simulation logs of trials[i]
assert len(worlds) == len(trials), "worlds and trials must be aligned one-to-one"


def _load_report(path):
    """Return the parsed JSON report stored at `path`.

    The file is opened in a `with` block so that its handle is closed right away;
    a missing report raises `FileNotFoundError` naming the offending path.
    """
    with open(path) as report_file:
        return json.load(report_file)


sim_results = []

for world, trial in zip(worlds, trials):
    trial_num = int(trial.split('trial')[1])
    switched = trial_num in switch_episodes
    ai_started = trial_num in ai_starts

    # read heuristic step count report (independent of both scalers, so read once per world)
    heur_report = _load_report('code/resources/episodes/{world}/heurreport:{world}.json'.format(world=world))

    # The remaining reports depend on the policy temperature only, so each one is read
    # once per (world, simscaler) instead of once per (world, softscaler, simscaler).
    decision_report_name = 'cfnoreport' if switched else 'cfyesreport'
    per_simscaler = {}
    for simscaler in simscalers:
        # counterfactual episode report
        cf_report = _load_report('code/resources/episodes/{world}/cfreport:{world}_humanscaler:{simscaler}_aiscaler:{simscaler}.json'.format(world=world, simscaler=simscaler))
        # counterfactual decision report (the decision that was not observed)
        decision_report = _load_report('code/resources/episodes/{world}/{report}:{world}_humanscaler:{simscaler}_aiscaler:{simscaler}.json'.format(world=world, report=decision_report_name, simscaler=simscaler))
        # human monte carlo probability estimates report
        probs_report = _load_report('code/resources/episodes/{world}/humanprobs:{world}_simscaler:{simscaler}_semimanualseed:42.json'.format(world=world, simscaler=simscaler))
        per_simscaler[simscaler] = (
            np.mean(cf_report['successes']),
            np.mean(decision_report['successes']),
            probs_report['human_score'],
            probs_report['ai_score'],
        )

    for softscaler in softscalers:
        for simscaler in simscalers:
            prob_succ_no_ai, prob_succ_other_decision, human_prob, ai_prob = per_simscaler[simscaler]

            # Two-way softmax over the theta-scaled scores. The numerator uses the AI score
            # when `switched` and `ai_started` agree and the human score when they differ.
            other_score = ai_prob if switched == ai_started else human_prob
            prob_other_decision = np.exp(softscaler*other_score) / (np.exp(softscaler*ai_prob) + np.exp(softscaler*human_prob))

            # key order defines the column order of the resulting DataFrame / CSV
            sim_results.append({
                'trial' : trial_num,
                'tau_scaler' : simscaler,
                'theta_scaler' : softscaler,
                'prob_succ_no_ai' : prob_succ_no_ai,
                'prob_succ_other_decision' : prob_succ_other_decision,
                'ai_count' : heur_report['ai_count'],
                'human_count' : heur_report['human_count'],
                'prob_other_decision' : prob_other_decision,
            })

sim_df = pd.DataFrame(sim_results)

In [36]:
# save the simulation results (one row per trial / tau scaler / theta scaler) as CSV, without the row index
# NOTE(review): presumably this file is the input of the R scripts described below — confirm
sim_df.to_csv('code/analysis/files/simulation_results.csv', index=False)

### Model fitting performed in R

Before proceeding to the next block, make sure that the file `code/analysis/files/trial_means.csv` exists. This contains the predictions of responsibility judgments for the human and the AI in each trial, made by (i) simple models relying on the actual contribution of each agent to the outcome, (ii) extensions of models in (i) that also take into account the difficulty of each trial and (iii) our proposed responsibility models.

To create the file `trial_means.csv`, one needs to execute the R script `code/analysis/files/model_eval.R` which uses the package `brms` to fit Bayesian Linear Mixed Effects models. It uses optimal values for the scaling parameters $\tau$ and $\theta$ selected via grid search (see `code/analysis/files/grid_search.R`).

In [38]:
# per-trial model predictions produced by the R scripts
trial_means = pd.read_csv('code/analysis/files/trial_means.csv')

# attach the predictions of the matching trial to every judgment (trial 0 excluded)
reg_df = df.loc[df['trial'] != 0].merge(trial_means, on='trial')

### Generate plots of Figure 4

In [39]:
# Per-trial summary: mean judgment and normal-approximation 95% CI half-width (1.96 * SEM)
# for the human and the AI, plus the mean 'heur_*' predictions.
trial_stats = (
    reg_df.groupby('trial')
    .agg(
        human_mean=('human', 'mean'),
        human_ci=('human', lambda x: stats.sem(x) * 1.96),
        ai_mean=('ai', 'mean'),
        ai_ci=('ai', lambda x: stats.sem(x) * 1.96),
        heur_human_mean=('heur_human', 'mean'),
        heur_ai_mean=('heur_ai', 'mean'),
    )
    .reset_index()
)

#### Human responsibility: actual contribution

In [40]:
utils.latexify() # Computer Modern, with TeX
fig_width, fig_height = utils.get_fig_dim(width_pt, fraction=0.4)
# square panel: the width is used for both dimensions
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(fig_width, fig_width))

predicted = trial_stats['heur_human_mean']
observed = trial_stats['human_mean']

# agreement between prediction and mean judgment: Pearson correlation and RMSE
r, p = stats.pearsonr(predicted, observed)
rmse = np.sqrt(np.mean((predicted - observed)**2))

# per-trial mean human judgment against the prediction, with CI error bars
sns.scatterplot(data=trial_stats, x='heur_human_mean', y='human_mean', color=humancol, ax=ax, s=50, alpha=0.5)
ax.errorbar(x=predicted, y=observed, yerr=trial_stats['human_ci'], fmt='none', color=humancol, capsize=0, alpha=0.3)
sns.despine(ax=ax)

# identity line from (0,0) to (100,100)
ax.plot([0, 100], [0, 100], color='#4c4c4c', linestyle='--')

# RMSE and correlation coefficient, stacked in the upper-left corner
for text, height in (("RMSE={rmse:.2f}".format(rmse=rmse), 0.9), ("r={r:.2f}".format(r=r), 0.8)):
    ax.annotate(text, xy=(0.05, height), xycoords='axes fraction', fontsize=11,
                bbox=dict(boxstyle='round', fc='white', alpha=0.5))

ax.set(xlabel="Actual contribution", xlim=[0, 100], ylim=[0, 100])
ax.set_ylabel("Human responsibility", color=humancol)

fig.tight_layout()
fig.savefig('figures/experiment1/reg_heur_human.pdf', dpi=300)
plt.close()


#### AI responsibility: actual contribution

In [41]:
utils.latexify() # Computer Modern, with TeX
fig_width, fig_height = utils.get_fig_dim(width_pt, fraction=0.4)
# square panel: the width is used for both dimensions
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(fig_width, fig_width))

predicted = trial_stats['heur_ai_mean']
observed = trial_stats['ai_mean']

# agreement between prediction and mean judgment: Pearson correlation and RMSE
r, p = stats.pearsonr(predicted, observed)
rmse = np.sqrt(np.mean((predicted - observed)**2))

# per-trial mean AI judgment against the prediction, with CI error bars
sns.scatterplot(data=trial_stats, x='heur_ai_mean', y='ai_mean', color=aicol, ax=ax, s=50, alpha=0.5)
ax.errorbar(x=predicted, y=observed, yerr=trial_stats['ai_ci'], fmt='none', color=aicol, capsize=0, alpha=0.3)
sns.despine(ax=ax)

# identity line from (0,0) to (100,100)
ax.plot([0, 100], [0, 100], color='#4c4c4c', linestyle='--')

# RMSE and correlation coefficient (typeset in bold), stacked in the upper-left corner
for text, height in ((r"\textbf{{RMSE={rmse:.2f}}}".format(rmse=rmse), 0.9), (r"\textbf{{r={r:.2f}}}".format(r=r), 0.8)):
    ax.annotate(text, xy=(0.05, height), xycoords='axes fraction', fontsize=11,
                bbox=dict(boxstyle='round', fc='white', alpha=0.5))

ax.set(xlabel="Actual contribution", xlim=[0, 100], ylim=[0, 100])
ax.set_ylabel("AI responsibility", color=aicol)

fig.tight_layout()
fig.savefig('figures/experiment1/reg_heur_ai.pdf', dpi=300)
plt.close()


In [42]:
# Per-trial summary: mean judgment and normal-approximation 95% CI half-width (1.96 * SEM)
# for the human and the AI, plus the mean 'adv_heur_*' predictions.
trial_stats = (
    reg_df.groupby('trial')
    .agg(
        human_mean=('human', 'mean'),
        human_ci=('human', lambda x: stats.sem(x) * 1.96),
        ai_mean=('ai', 'mean'),
        ai_ci=('ai', lambda x: stats.sem(x) * 1.96),
        adv_heur_human_mean=('adv_heur_human', 'mean'),
        adv_heur_ai_mean=('adv_heur_ai', 'mean'),
    )
    .reset_index()
)

#### Human responsibility: actual contribution + difficulty

In [43]:
utils.latexify() # Computer Modern, with TeX
fig_width, fig_height = utils.get_fig_dim(width_pt, fraction=0.4)
# square panel: the width is used for both dimensions
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(fig_width, fig_width))

predicted = trial_stats['adv_heur_human_mean']
observed = trial_stats['human_mean']

# agreement between prediction and mean judgment: Pearson correlation and RMSE
r, p = stats.pearsonr(predicted, observed)
rmse = np.sqrt(np.mean((predicted - observed)**2))

# per-trial mean human judgment against the prediction, with CI error bars
sns.scatterplot(data=trial_stats, x='adv_heur_human_mean', y='human_mean', color=humancol, ax=ax, s=50, alpha=0.5)
ax.errorbar(x=predicted, y=observed, yerr=trial_stats['human_ci'], fmt='none', color=humancol, capsize=0, alpha=0.3)
sns.despine(ax=ax)

# identity line from (0,0) to (100,100)
ax.plot([0, 100], [0, 100], color='#4c4c4c', linestyle='--')

# RMSE and correlation coefficient, stacked in the upper-left corner
for text, height in (("RMSE={rmse:.2f}".format(rmse=rmse), 0.9), ("r={r:.2f}".format(r=r), 0.8)):
    ax.annotate(text, xy=(0.05, height), xycoords='axes fraction', fontsize=11,
                bbox=dict(boxstyle='round', fc='white', alpha=0.5))

ax.set(xlabel="Actual contribution + difficulty", xlim=[0, 100], ylim=[0, 100])
ax.set_ylabel("Human responsibility", color=humancol)

fig.tight_layout()
fig.savefig('figures/experiment1/reg_adv_heur_human.pdf', dpi=300)
plt.close()


In [44]:
# Per-trial summary: mean judgment and normal-approximation 95% CI half-width (1.96 * SEM)
# for the human and the AI, plus the mean 'cf_full_*' predictions.
trial_stats = (
    reg_df.groupby('trial')
    .agg(
        human_mean=('human', 'mean'),
        human_ci=('human', lambda x: stats.sem(x) * 1.96),
        ai_mean=('ai', 'mean'),
        ai_ci=('ai', lambda x: stats.sem(x) * 1.96),
        cf_full_human_mean=('cf_full_human', 'mean'),
        cf_full_ai_mean=('cf_full_ai', 'mean'),
    )
    .reset_index()
)

#### Human responsibility: our model

In [45]:
utils.latexify() # Computer Modern, with TeX
fig_width, fig_height = utils.get_fig_dim(width_pt, fraction=0.4)
# square panel: the width is used for both dimensions
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(fig_width, fig_width))

predicted = trial_stats['cf_full_human_mean']
observed = trial_stats['human_mean']

# agreement between prediction and mean judgment: Pearson correlation and RMSE
r, p = stats.pearsonr(predicted, observed)
rmse = np.sqrt(np.mean((predicted - observed)**2))

# per-trial mean human judgment against the prediction, with CI error bars
sns.scatterplot(data=trial_stats, x='cf_full_human_mean', y='human_mean', color=humancol, ax=ax, s=50, alpha=0.5)
ax.errorbar(x=predicted, y=observed, yerr=trial_stats['human_ci'], fmt='none', color=humancol, capsize=0, alpha=0.3)
sns.despine(ax=ax)

# identity line from (0,0) to (100,100)
ax.plot([0, 100], [0, 100], color='#4c4c4c', linestyle='--')

# RMSE and correlation coefficient (typeset in bold), stacked in the upper-left corner
for text, height in ((r"\textbf{{RMSE={rmse:.2f}}}".format(rmse=rmse), 0.9), (r"\textbf{{r={r:.2f}}}".format(r=r), 0.8)):
    ax.annotate(text, xy=(0.05, height), xycoords='axes fraction', fontsize=11,
                bbox=dict(boxstyle='round', fc='white', alpha=0.5), weight='bold')

ax.set(xlabel="Our model", xlim=[0, 100], ylim=[0, 100])
ax.set_ylabel("Human responsibility", color=humancol)

fig.tight_layout()
fig.savefig('figures/experiment1/reg_cf_full_human.pdf', dpi=300)
plt.close()


#### AI responsibility: our model

In [46]:
utils.latexify() # Computer Modern, with TeX
fig_width, fig_height = utils.get_fig_dim(width_pt, fraction=0.4)
# square panel: the width is used for both dimensions
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(fig_width, fig_width))

predicted = trial_stats['cf_full_ai_mean']
observed = trial_stats['ai_mean']

# agreement between prediction and mean judgment: Pearson correlation and RMSE
r, p = stats.pearsonr(predicted, observed)
rmse = np.sqrt(np.mean((predicted - observed)**2))

# per-trial mean AI judgment against the prediction, with CI error bars
sns.scatterplot(data=trial_stats, x='cf_full_ai_mean', y='ai_mean', color=aicol, ax=ax, s=50, alpha=0.5)
ax.errorbar(x=predicted, y=observed, yerr=trial_stats['ai_ci'], fmt='none', color=aicol, capsize=0, alpha=0.3)
sns.despine(ax=ax)

# identity line from (0,0) to (100,100)
ax.plot([0, 100], [0, 100], color='#4c4c4c', linestyle='--')

# RMSE and correlation coefficient, stacked in the upper-left corner
for text, height in (("RMSE={rmse:.2f}".format(rmse=rmse), 0.9), ("r={r:.2f}".format(r=r), 0.8)):
    ax.annotate(text, xy=(0.05, height), xycoords='axes fraction', fontsize=11,
                bbox=dict(boxstyle='round', fc='white', alpha=0.5))

ax.set(xlabel="Our model", xlim=[0, 100], ylim=[0, 100])
ax.set_ylabel("AI responsibility", color=aicol)

fig.tight_layout()
fig.savefig('figures/experiment1/reg_cf_full_ai.pdf', dpi=300)
plt.close()