In [1]:
# Dataset identifier; it is prepended to the names of the file-path index CSVs
# that the notebook reads from the file_paths/ directory.
data_dir: str = 'test_data'

In [None]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy
from sklearn.metrics import mean_squared_error

# Figure typography, applied in one shot:
#   - 10 pt serif body text,
#   - matplotlib's built-in mathtext renderer (text.usetex stays off, so no
#     external LaTeX toolchain is invoked),
#   - the "cm" mathtext font set so math expressions get a LaTeX-like look.
plt.rcParams.update({
    "font.family": "serif",
    "font.size": 10,
    "text.usetex": False,
    "mathtext.fontset": "cm",
})

In [3]:
# One index table per model variant. Every CSV lives in file_paths/ and is
# named <data_dir><suffix>; the order below (full model first, then the three
# no-disparity variants) is the order used by all the parallel lists.
_index_csv_suffixes = [
    '_full_model_file_paths.csv',
    '_no_disparity1_file_paths.csv',
    '_no_disparity2_file_paths.csv',
    '_no_disparity3_file_paths.csv',
]
results_list = [
    pd.read_csv(os.path.join('file_paths', data_dir + suffix))
    for suffix in _index_csv_suffixes
]
results_full, results_1, results_2, results_3 = results_list

# One initially-empty cache dict per variant, in the same order as
# results_list; the individual names alias the list entries.
simulated_datas_list = [{} for _ in results_list]
(simulated_datas_full,
 simulated_datas_1,
 simulated_datas_2,
 simulated_datas_3) = simulated_datas_list

# Human-readable variant names, aligned with results_list.
labels_list = [
    'Full model',
    'No disparity 1',
    'No disparity 2',
    'No disparity 3',
]

In [4]:
# Populate each variant's cache dict with the unpickled object stored at every
# path listed in that variant's 'data_file' column, keyed by the path itself.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only list pickle files here that come from a trusted source.
for results, simulated_datas in zip(results_list, simulated_datas_list):
    for file_name in results['data_file'].tolist():
        # The context manager closes the handle even when unpickling raises.
        with open(file_name, 'rb') as file:
            simulated_datas[file_name] = pickle.load(file)

In [None]:
# Recovery check. For every model variant, each available simulation run
# contributes two scatter points: (mean true z, mean posterior-mean z) for the
# group flagged as advantaged (blue) and for the other group (red). A summary
# row with mean signed difference and Pearson correlation is printed per variant.
fig, axes = plt.subplots(1, len(results_list), squeeze=False)
axes = axes[0]  # single row -> 1-D array of Axes, even with only one panel

# NOTE(review): the header labels the columns "A=0"/"A=1" and "error", while
# the rows below report the advantaged/disadvantaged groups and the mean signed
# difference (posterior - true). Confirm which labelling is intended.
print('Model\t\t# runs\tA=0 error\tA=1 error\tA=0 Corr.\tA=1 Corr.')

for plt_i, (results, simulated_datas, label) in enumerate(
        zip(results_list, simulated_datas_list, labels_list)):
    advantaged_true_mean_z_vals = []
    disadvantaged_true_mean_z_vals = []
    advantaged_posterior_mean_z_vals = []
    disadvantaged_posterior_mean_z_vals = []
    total_num = 0  # number of runs whose sample file was found on disk

    for data_file, sample_file in zip(results['data_file'], results['stan_sample_file']):
        # Runs whose sample file does not exist are skipped and not counted.
        if not os.path.isfile(sample_file):
            continue
        total_num += 1

        simulation = simulated_datas[data_file]
        latent_params = simulation['latent_params']
        observed_data = simulation['observed_data']

        # presumably a Stan summary table: parameter names in the unnamed
        # first column, posterior means in 'mean' -- TODO confirm.
        stan_summary = pd.read_csv(sample_file)
        true_z_vals = np.array(latent_params['z'])
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        # NOTE(review): this matches any parameter name containing "z[", not
        # only names that start with it -- verify no other parameter qualifies.
        is_z_row = stan_summary['Unnamed: 0'].str.contains(r"z\[")
        posterior_z_vals = stan_summary.loc[is_z_row, 'mean'].to_numpy()

        # Row indices with a non-zero entry in column 0 / column 1 of
        # a_per_visit (presumably a one-hot group indicator -- TODO confirm).
        i_with_a0 = np.where(observed_data['a_per_visit'][:, 0])[0]
        i_with_a1 = np.where(observed_data['a_per_visit'][:, 1])[0]

        a0_posterior_mean = posterior_z_vals[i_with_a0]
        a1_posterior_mean = posterior_z_vals[i_with_a1]
        a0_true = true_z_vals[i_with_a0]
        a1_true = true_z_vals[i_with_a1]

        # Decide whether group a=0 is booked as the advantaged one. The
        # parameter inspected depends on the model variant; any other label
        # leaves a=0 as the disadvantaged group.
        if label in ('Full model', 'No disparity 2'):
            a0_adv = latent_params['mu_r'][1] > latent_params['mu_r'][0]
        elif label == 'No disparity 1':
            a0_adv = latent_params['mu_z0'][1] > latent_params['mu_z0'][0]
        elif label == 'No disparity 3':
            a0_adv = latent_params['beta_a'] < 0
        else:
            a0_adv = False

        if a0_adv:
            advantaged_true_mean_z_vals.append(a0_true.mean())
            disadvantaged_true_mean_z_vals.append(a1_true.mean())
            advantaged_posterior_mean_z_vals.append(a0_posterior_mean.mean())
            disadvantaged_posterior_mean_z_vals.append(a1_posterior_mean.mean())
        else:
            advantaged_true_mean_z_vals.append(a1_true.mean())
            disadvantaged_true_mean_z_vals.append(a0_true.mean())
            advantaged_posterior_mean_z_vals.append(a1_posterior_mean.mean())
            disadvantaged_posterior_mean_z_vals.append(a0_posterior_mean.mean())

    ax = axes[plt_i]
    ax.scatter(advantaged_true_mean_z_vals, advantaged_posterior_mean_z_vals, s=3, color='blue')
    ax.scatter(disadvantaged_true_mean_z_vals, disadvantaged_posterior_mean_z_vals, s=3, color='red')

    ax.set_title(label, fontsize=10)

    # Identity line: points on it have recovered mean equal to true mean.
    ax.axline((0, 0), slope=1, color='black', linestyle='--')
    ax.axis('square')

    ### DISPLAY RESULTS ###
    # Pearson correlation needs at least two points; report the variant
    # without statistics instead of aborting the whole cell.
    if total_num < 2:
        print(label, '\t', total_num, '\t', 'n/a (fewer than 2 runs available)')
        continue

    adv_true = np.asarray(advantaged_true_mean_z_vals)
    adv_post = np.asarray(advantaged_posterior_mean_z_vals)
    disadv_true = np.asarray(disadvantaged_true_mean_z_vals)
    disadv_post = np.asarray(disadvantaged_posterior_mean_z_vals)

    adv_pearson_r = scipy.stats.pearsonr(adv_true, adv_post)
    disadv_pearson_r = scipy.stats.pearsonr(disadv_true, disadv_post)

    # RMSE computed directly with numpy, so the cell does not depend on the
    # `squared=False` keyword of sklearn's mean_squared_error. These two
    # values are not printed in the table.
    adv_rmse = np.sqrt(np.mean((adv_post - adv_true) ** 2))
    disadv_rmse = np.sqrt(np.mean((disadv_post - disadv_true) ** 2))

    # Mean signed difference (posterior - true) across runs.
    adv_diff = np.mean(adv_post - adv_true)
    disadv_diff = np.mean(disadv_post - disadv_true)

    print(label, '\t', total_num, '\t', round(adv_diff, 2), '\t\t', round(disadv_diff, 2), '\t\t',
          round(adv_pearson_r.statistic, 2), '\t\t', round(disadv_pearson_r.statistic, 2))

# Shared axis captions, positioned in figure coordinates.
fig.text(0.5, 0.3, "Mean true $Z_t$", ha='center', fontsize=10)
fig.text(0, 0.37, "Mean recovered $Z_t$", ha='center', rotation='vertical', fontsize=10)

plt.tight_layout()
# Make sure the output directory exists before writing the PDF.
os.makedirs('figures', exist_ok=True)
plt.savefig(os.path.join('figures', 'biased_recovery.pdf'), bbox_inches='tight')
plt.show()