In [1]:
import numpy as np
import pandas as pd

import random

random.seed(0)
np.random.seed(0)

INFILE = '../data/comparison.csv'

In [2]:
df = pd.read_csv(INFILE)
df.head()

Unnamed: 0,ParticipantID,CompletionCode,ID,EndTime,StartTime,Status,DemographicsTime,Gender,GenderSpecify,AgeBins,...,juv_other_count,priors_count,felony,black,married,output,y,FcastTimer,MostImportantFeature,next
0,test,yxI0jg,1,2020-11-19 23:43:51.969826,2020-11-19 23:32:59.155429,Completed,42.387854,Male,,25-29,...,1.0,1.0,0.0,1.0,0.0,0.766923,1.0,29.173377,,
1,test,yxI0jg,1,2020-11-19 23:43:51.969826,2020-11-19 23:32:59.155429,Completed,42.387854,Male,,25-29,...,0.0,0.0,0.0,0.0,0.0,0.08892,0.0,145.471718,,
2,test,yxI0jg,1,2020-11-19 23:43:51.969826,2020-11-19 23:32:59.155429,Completed,42.387854,Male,,25-29,...,0.0,15.0,0.0,1.0,0.0,0.736685,1.0,8.672931,,
3,test,yxI0jg,1,2020-11-19 23:43:51.969826,2020-11-19 23:32:59.155429,Completed,42.387854,Male,,25-29,...,0.0,1.0,1.0,1.0,0.0,0.443584,0.0,10.088882,,
4,test,yxI0jg,1,2020-11-19 23:43:51.969826,2020-11-19 23:32:59.155429,Completed,42.387854,Male,,25-29,...,0.0,2.0,1.0,0.0,0.0,0.363658,0.0,17.196649,,


In [3]:
# select participants who passed comprehension checks
df = df[(df.ParticipantID!='test')]
print('N total participants', len(pd.unique(df.ParticipantID)))
df = df[(df.FcastComprehension==1) & (df.BonusComprehension==1)]
print('N passed comprehension check', len(pd.unique(df.ParticipantID)))
# select 'actual' forecasts (as opposed to practice forecasts)
df = df[df.Practice==0]
# scale forecasts to be between 0 and 1 (as opposed to 0 and 100)
df['Fcast'] = df['Fcast'] / 100.
df = df.dropna(subset=['Fcast'])
df.values.shape

N total participants 100
N passed comprehension check 78


(762, 49)

In [15]:
X = df[[
    'priors_count', 
    'age', 
    'felony', 
    'black', 
    'male', 
    'juv_fel_count', 
    'juv_misd_count', 
    'juv_other_count', 
    'married'
]]
y = df.Fcast
X.head()

Unnamed: 0,priors_count,age,felony,black,male,juv_fel_count,juv_misd_count,juv_other_count,married
35,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
36,4.0,24.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
37,0.0,34.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
38,2.0,29.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
39,0.0,34.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [5]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from scipy.stats import expon, uniform

X['priors_age'] = X.priors_count * X.age
X['priors_felony'] = X.priors_count * X.felony
X['priors_male'] = X.priors_count * X.male
X['priors_black'] = X.priors_count * X.black
X['black_male'] = X.black * X.male

parameters = {
    'alpha': expon(0, 1),
    'l1_ratio': uniform(0, 1)
}

reg = RandomizedSearchCV(ElasticNet(), parameters, n_iter=2**9).fit(X, y)
res = cross_validate(reg.best_estimator_, X, y)
print(res)
res['test_score'].mean()

{'fit_time': array([0.00410986, 0.00315881, 0.003721  , 0.0041101 , 0.00370073]), 'score_time': array([0.00175571, 0.0016799 , 0.00170064, 0.00175333, 0.00167537]), 'test_score': array([0.20902595, 0.47458075, 0.19731418, 0.32443628, 0.36155486])}


0.31338240438208287

In [6]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from scipy.stats import expon, uniform

parameters = {
    'eta': uniform(0, 1),
    'gamma': expon(0, 1),
    'max_depth': list(range(1, 6)),
    'min_child_weight': expon(0, 1),
    'alpha': expon(0, 1)
}
reg = RandomizedSearchCV(xgb.XGBRegressor(), parameters, n_iter=2**9).fit(X, y)
res = cross_validate(reg.best_estimator_, X, y.values)
print(res)
res['test_score'].mean()

{'fit_time': array([0.02320218, 0.01919007, 0.0323205 , 0.01980782, 0.02047658]), 'score_time': array([0.00219393, 0.0026083 , 0.00254631, 0.00195956, 0.00209689]), 'test_score': array([0.27049232, 0.59179264, 0.31106243, 0.36666228, 0.44488822])}


0.396979575478253

In [13]:
import gshap
from gshap.datasets import load_recidivism
from gshap.intergroup import IntergroupDifference

columns = X.columns
recidivism = load_recidivism()
X, y = recidivism.data[columns], recidivism.target
# X = X.drop(columns='high_supervision')
y_black = y[X['black']==1]
y_white = y[X['black']==0]

KeyError: "['high_supervision'] not found in axis"

In [14]:
def fp_diff(output_white, output_black):
    output_white = output_white[y_white==0]
    output_black = output_black[y_black==0]
    return output_black.mean() / output_white.mean() - 1

g = IntergroupDifference(group=X['black'], distance=fp_diff)
explainer = gshap.KernelExplainer(reg.predict, X, g)
gshap_values = explainer.gshap_values(X, nsamples=32)
df = pd.DataFrame({'Variables': X.columns, 'Importance': 100*gshap_values})
df = df.sort_values('Importance', ascending=False)
fig = sns.barplot(y='Variables', x='Importance', data=df)
# fig.figure.savefig('figures/original_model_gshap.png', bbox_inches = "tight")
print(gshap_values, gshap_values.sum())

ValueError: feature_names mismatch: ['priors_count', 'age', 'felony', 'black', 'male', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'married', 'priors_age', 'priors_felony', 'priors_male', 'priors_black', 'black_male'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8']
expected juv_misd_count, juv_fel_count, priors_black, priors_male, male, black_male, juv_other_count, priors_count, felony, priors_age, priors_felony, married, black, age in input data
training data did not have the following fields: f8, f4, f7, f1, f2, f6, f3, f5, f0