In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

INFILE = '../data/adoption.csv'

In [2]:
df = pd.read_csv(INFILE)
df.head()

Unnamed: 0,Test,WorkerId,Explanation,Adopt,WTP,OtherBid,WonAuction,SurveyCode,ID,EndTime,...,juv_fel_count,juv_misd_count,juv_other_count,priors_count,felony,black,married,output,y,FcastTimer
0,1.0,test,0.0,1.0,19.69,0.0,1.0,x0vRDP,1,2020-11-25 17:08:29.712628,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.271688,0.0,3.264869
1,1.0,test,0.0,1.0,19.69,0.0,1.0,x0vRDP,1,2020-11-25 17:08:29.712628,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.246545,0.0,4.695119
2,1.0,test,0.0,1.0,19.69,0.0,1.0,x0vRDP,1,2020-11-25 17:08:29.712628,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.631654,0.0,3.275935
3,1.0,test,0.0,1.0,19.69,0.0,1.0,x0vRDP,1,2020-11-25 17:08:29.712628,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226197,0.0,2.00072
4,1.0,test,1.0,1.0,20.83,19.69,1.0,O8L9sI,2,2020-11-25 17:09:49.355854,...,0.0,0.0,0.0,11.0,1.0,1.0,0.0,0.816362,1.0,3.29928


In [3]:
# select participants who passed comprehension checks
df = df[(df.WorkerId!='test')]
print('N total participants', len(pd.unique(df.ID)))
# remove participants who failed comprehension checks
df = df[(df.FcastComprehension==1) & (df.BonusComprehension==1)]
print('N passed comprehension check', len(pd.unique(df.ID)))
# remove participants who bid more than $10 (the high bid for the no-adoption condition)
df = df[df.WTP < 29.99]
print('N auction results matched adoption condition', len(pd.unique(df.ID)))
# select 'actual' forecasts (as opposed to practice forecasts)
df = df[df.Practice==0]
# scale forecasts to be between 0 and 1 (as opposed to 0 and 100)
df['Fcast'] = df['Fcast'] / 100.
# compute score and interaction variable
df['Exp_x_Adopt'] = df.Explanation * df.Adopt
df['Exp_x_black'] = df.Explanation * df.black
df['Score'] = (df.y - df.Fcast)**2 - (df.y - df.output)**2
df.values.shape

N total participants 34
N passed comprehension check 32
N auction results matched adoption condition 32


(64, 56)

In [4]:
# remove empty values from the score column
df = df.dropna(subset=['Score'])
df.values.shape

(64, 56)

In [5]:
# effect of explanation on performance if everyone adopted the model
# negative coef on interaction indicates the explanation improved performance
# positive coef on interaction indicates the explanation harmed performance
adopt_df = df[df.Adopt==1]
X = sm.add_constant(adopt_df.Explanation)
reg = sm.OLS(adopt_df.Score, X)
res = reg.fit().get_robustcov_results(cov_type='cluster', groups=adopt_df.ID)
res.summary()

0,1,2,3
Dep. Variable:,Score,R-squared:,0.048
Model:,OLS,Adj. R-squared:,0.018
Method:,Least Squares,F-statistic:,1.982
Date:,"Fri, 27 Nov 2020",Prob (F-statistic):,0.178
Time:,09:34:07,Log-Likelihood:,-16.776
No. Observations:,34,AIC:,37.55
Df Residuals:,32,BIC:,40.6
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1989,0.072,2.761,0.014,0.046,0.352
Explanation,-0.1779,0.126,-1.408,0.178,-0.446,0.090

0,1,2,3
Omnibus:,0.577,Durbin-Watson:,2.275
Prob(Omnibus):,0.749,Jarque-Bera (JB):,0.487
Skew:,-0.273,Prob(JB):,0.784
Kurtosis:,2.786,Cond. No.,2.55


In [6]:
# effect of explanation on fairness if everyone adopted the model
# negative coef on interaction indicates the explanation improved fairness
# positive coef on interaction indicates the explanation harmed fairness

# select observations where the offender did not recidivate and the model was adopted
no_recid_df = adopt_df[adopt_df.y == 0]
X = sm.add_constant(no_recid_df[['Explanation', 'black', 'Exp_x_black']])
reg = sm.OLS(no_recid_df.Score, X)
res = reg.fit().get_robustcov_results(cov_type='cluster', groups=no_recid_df.ID)
res.summary()

0,1,2,3
Dep. Variable:,Score,R-squared:,0.036
Model:,OLS,Adj. R-squared:,-0.144
Method:,Least Squares,F-statistic:,0.3572
Date:,"Fri, 27 Nov 2020",Prob (F-statistic):,0.785
Time:,09:34:07,Log-Likelihood:,-9.1514
No. Observations:,20,AIC:,26.3
Df Residuals:,16,BIC:,30.29
Df Model:,3,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2257,0.145,1.561,0.141,-0.084,0.536
Explanation,-0.1696,0.202,-0.839,0.415,-0.603,0.264
black,0.0039,0.260,0.015,0.988,-0.553,0.561
Exp_x_black,0.1029,0.367,0.281,0.783,-0.684,0.889

0,1,2,3
Omnibus:,0.211,Durbin-Watson:,2.139
Prob(Omnibus):,0.9,Jarque-Bera (JB):,0.41
Skew:,-0.088,Prob(JB):,0.815
Kurtosis:,2.321,Cond. No.,6.6


In [7]:
def simulate_market(df, price):
    # indicates the participant would have adopted the model in this simulation
    df = df[df.Adopt == (df.WTP > price)]
    compute_performance_effect(df, price)
    compute_fairness_effect(df, price)
    compute_adoption_effect(df, price)
    decompose(df, price)
    
def compute_performance_effect(df, price):
    X = sm.add_constant(df.Explanation)
    reg = sm.OLS(df.Score, X)
    res = reg.fit().get_robustcov_results(cov_type='cluster', groups=df.ID)
    print('\nNegative coefficient on explanation indicates that explanation improved performance')
    print('Positive coefficient on explanation indicates that explanation harmed performance\n')
    print(res.summary())
    
def compute_fairness_effect(df, price):
    df = df[df.y == 0]
    df['Exp_x_black'] = df.Explanation * df.black
    X = sm.add_constant(df[['Explanation', 'black', 'Exp_x_black']])
    reg = sm.OLS(df.Score, X)
    res = reg.fit().get_robustcov_results(cov_type='cluster', groups=df.ID)
    print('\nNegative coefficient on interaction indicates that explanation improved fairness')
    print('Positive coefficient on interaction indicates that explanation harmed fairness\n')
    print(res.summary())
    
def compute_adoption_effect(df, price):
    df = df.drop_duplicates(subset=['ID'])
    X = sm.add_constant(df.Explanation)
    reg = sm.OLS(df.Adopt, X)
    res = reg.fit().get_robustcov_results(cov_type='HC0')
    print('\nPositive coefficient on explanation indicates the explanation increased adoption')
    print('Negative coefficient on explanation indicates that explanation decreased adoption\n')
    print(res.summary())
    
def decompose(df, price):
    X = sm.add_constant(df[['Explanation', 'Adopt', 'Exp_x_Adopt']])
    reg = sm.OLS(df.Score, X)
    res = reg.fit().get_robustcov_results(cov_type='cluster', groups=df.ID)
    print(res.summary())
    
median_bid = df.drop_duplicates(subset='ID').WTP.median()
simulate_market(df, median_bid)


Negative coefficient on explanation indicates that explanation improved performance
Positive coefficient on explanation indicates that explanation harmed performance

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.031
Method:                 Least Squares   F-statistic:                  0.004813
Date:                Fri, 27 Nov 2020   Prob (F-statistic):              0.946
Time:                        09:34:07   Log-Likelihood:                -15.769
No. Observations:                  34   AIC:                             35.54
Df Residuals:                      32   BIC:                             38.59
Df Model:                           1                                         
Covariance Type:              cluster                                         
                  coef    std err         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "anyway, n=%i" % int(n))
  "anyway, n=%i" % int(n))


In [8]:
df.groupby(['Explanation', 'y']).Fcast.mean()

Explanation  y  
0.0          0.0    0.501429
             1.0    0.483333
1.0          0.0    0.378333
             1.0    0.585714
Name: Fcast, dtype: float64

In [9]:
df.groupby(['Explanation', 'black', 'y']).Fcast.mean()

Explanation  black  y  
0.0          0.0    0.0    0.525714
                    1.0    0.626667
             1.0    0.0    0.477143
                    1.0    0.411667
1.0          0.0    0.0    0.338000
                    1.0    0.647143
             1.0    0.0    0.428750
                    1.0    0.524286
Name: Fcast, dtype: float64

In [10]:
df.groupby(['black', 'y']).output.mean()

black  y  
0.0    0.0    0.327136
       1.0    0.619252
1.0    0.0    0.347933
       1.0    0.634622
Name: output, dtype: float64