# Process Data

## Load data

We use the COMPAS dataset here, which is a subset of all the data files.

In [None]:
# One line per import
import pandas as pd
import numpy as np

In [None]:
# Root directory of a local copy of the data. The cells below read CSVs from
# its `individuals/` and `surveys/` subdirectories, and the final cell writes
# the processed CSV into it. This placeholder has no value yet and must be
# filled in before the notebook can run.
datadir = # TODO PATH TO DATA FROM https://github.com/stanford-policylab/recidivism-predictions

# Tables describing the individuals being rated
individuals_dir = f'{datadir}/individuals'
broward_clean_df = pd.read_csv(f'{individuals_dir}/broward_clean_fixed.csv')
compas_scores_df = pd.read_csv(f'{individuals_dir}/compas_scores.csv')
compas_vignettes_df = pd.read_csv(f'{individuals_dir}/compas_vignettes.csv')

# Tables describing the survey responses and the survey users
surveys_dir = f'{datadir}/surveys'
response_df = pd.read_csv(f'{surveys_dir}/df_response.csv')
user_df = pd.read_csv(f'{surveys_dir}/df_user.csv')

## Clean Data

Note that "users" correspond to Mechanical Turk participants, and "individuals" correspond to the defendants

In [None]:
# Exclude the placeholder "individuals": the ids 'dummy1' and 'dummy2' are
# dummy indicators rather than real individuals.
# (Counts per id can be inspected with response_df[['individual_id']].value_counts())
bad_ids = ['dummy1', 'dummy2']
is_dummy = response_df['individual_id'].isin(bad_ids)
response_df = response_df.loc[~is_dummy]

# Cast the id columns of both tables to integers
response_df = response_df.assign(
    individual_id=response_df['individual_id'].astype(int))
broward_clean_df = broward_clean_df.assign(
    id=broward_clean_df['id'].astype(int))

In [None]:
# Mean of `two_year_recid` over the Broward rows whose id appears in the responses
in_responses = broward_clean_df['id'].isin(response_df['individual_id'])
broward_clean_df.loc[in_responses, 'two_year_recid'].mean()

## Filter to cases of interest

We are only interested in the cases where the user group is `vignette` (indicating that no feedback was given) as opposed to `feedback_vignette`, where feedback is given after each prediction

In [None]:
# (Group sizes can be inspected with response_df.user_group.value_counts())
# Keep every response outside the 'feedback_vignette' group, then discard the
# group column since it is no longer needed.
not_feedback = response_df['user_group'] != 'feedback_vignette'
response_df = response_df.loc[not_feedback].drop(columns='user_group')

We also want to filter out any users who did not complete the exercise, or who received feedback during the process.

In [None]:
# Keep only users outside the 'feedback_vignette' group whose exit status is
# 'submitted'. (`and` is used instead of `&`; inside DataFrame.query both have
# the same precedence, so the condition is unchanged.)
user_df = user_df.query("user_group != 'feedback_vignette' and exit_status == 'submitted'")

# DataFrame.drop returns a new frame, so the result has to be assigned back;
# otherwise the two filter columns are never actually removed.
user_df = user_df.drop(['user_group', 'exit_status'], axis=1)

There are three duplicate users (who completed 2 assignments each), so we confirm that their information is stable across assignments, and remove the duplicates

In [None]:
# Columns describing the USERS (not the defendants) that we keep, mapped from
# their original names to the names used in the output
user_feat_remap = dict(
    user_id='user_id',
    age='user_age',
    gender='user_gender',
    degree='user_degree',
)

# Output names, in the order declared above
user_features = list(user_feat_remap.values())

# Rename, keep only the selected columns, and collapse identical rows
user_df = (
    user_df
    .rename(columns=user_feat_remap)
    .loc[:, user_features]
    .drop_duplicates()
)

## Merge Features and Predictions

Here we use the features from the `broward_clean_df`, which are the same as those in the compas dataset

In [None]:
# COMPAS risk factors, grouped by theme
defendant_cols = ['id', 'race', 'sex', 'age']
criminal_history_cols = ['juv_fel_count', 'juv_misd_count', 'priors_count']
charge_cols = ['charge_id', 'charge_degree (misd/fel)']
broward_features = defendant_cols + criminal_history_cols + charge_cols

# Columns of the user responses that we keep
response_features = [
    'user_id',
    'individual_id',
    'predicted_decision',
    'leave_time',
    'enter_time',
]
response_df = response_df.loc[:, response_features]

# Attach the risk factors to each response (matching individual_id to id),
# then discard the now-redundant "id" column
response_with_features_df = pd.merge(
    response_df,
    broward_clean_df[broward_features],
    left_on='individual_id',
    right_on='id',
).drop(columns='id')

## Merge User information

We also pull in the user information for downstream analysis. Note that because we restrict to users that have `exit_status == 'submitted'`, this results in a small reduction in the number of samples.

In [None]:
# Attach the user columns to every response; rows without a matching user_id
# in user_df are dropped by the default inner join
response_with_features_df = pd.merge(
    response_with_features_df,
    user_df,
    on='user_id',
)

# Construct Features

Our framework is designed to work with binary decisions, rather than predicted probabilities, so we threshold the probabilities provided by Mechanical Turk participants into binary decisions

In [None]:
# Take the raw prediction off the frame and replace it with a binary label:
# 1 when the prediction is strictly greater than 50, otherwise 0
raw_prediction = response_with_features_df.pop('predicted_decision')
response_with_features_df['outcome'] = np.where(raw_prediction > 50, 1, 0)

We also construct a feature for the time spent in the session

In [None]:
# Replace the two timestamps with their difference (leave minus enter)
leave_time = response_with_features_df.pop('leave_time')
enter_time = response_with_features_df.pop('enter_time')
response_with_features_df['time_deciding'] = leave_time - enter_time

Then we construct one-hot encodings of categorical features

In [None]:
# How the race mapping is derived:
# take one example individual for each distinct value of `race`
race_categories_df = (
    response_with_features_df
    .loc[:, ['individual_id', 'race']]
    .drop_duplicates(subset='race')
)

# Look the example individuals up in the compas scores. Both frames have a
# `race` column, so the merge yields `race_x` (ours) and `race_y` (compas scores).
race_categories_df = pd.merge(
    race_categories_df,
    compas_scores_df[['id', 'race']],
    left_on='individual_id',
    right_on='id',
)
race_x = race_categories_df['race_x'].values
race_y = race_categories_df['race_y'].values

# Map each dummy-style column name to a lower-case, underscore-separated name
race_dict = {
    f'race_{ours}': f"race_{theirs.lower().replace(' ', '_')}"
    for ours, theirs in zip(race_x, race_y)
}

In [None]:
# Categorical columns to one-hot encode with pandas.
# 'charge_id' is deliberately left out: it is encoded manually further below.
cat_feats = [
    'user_gender',
    'user_degree',
    'race',
]

In [None]:
# One-hot encode the categorical columns listed in cat_feats.
# dtype=int is requested explicitly: pandas >= 2.0 produces boolean dummies by
# default, which would be written to the CSV as True/False and would not match
# the 0/1 charge indicators built with np.where later in this notebook.
response_with_features_df = pd.get_dummies(
    response_with_features_df, columns=cat_feats, dtype=int)

In [None]:
# Readable names for the user gender dummy columns
gender_renames = {
    'user_gender_f': 'user_gender_female',
    'user_gender_m': 'user_gender_male',
    'user_gender_o': 'user_gender_other',
}

# Readable names for the user degree dummy columns
# NOTE(review): 'user_degree_master_degree' keeps a '_degree' suffix that the
# other degree names do not have -- confirm this is intended.
degree_renames = {
    'user_degree_Associate degree': 'user_degree_associate',
    'user_degree_Bachelor degree': 'user_degree_bachelor',
    'user_degree_Master degree': 'user_degree_master_degree',
    'user_degree_Doctoral degree': 'user_degree_doctoral',
    'user_degree_High school': 'user_degree_high_school',
    'user_degree_Middle school': 'user_degree_middle_school',
}

# Combined lookup: original dummy column name -> output column name
rename_dict = {**gender_renames, **degree_renames}

In [None]:
# Fold the race column renames into the lookup; race entries win on any clash
rename_dict = {**rename_dict, **race_dict}

In [None]:
# Apply the column renames; names missing from rename_dict are left untouched
response_with_features_df = response_with_features_df.rename(columns=rename_dict)

In [None]:
# Step 1: pick one example individual for every distinct charge_id
charge_categories_df = (
    response_with_features_df
    .loc[:, ['individual_id', 'charge_id']]
    .drop_duplicates(subset='charge_id')
)

# Step 2: look up the charge name of each example in the compas vignettes
charge_categories_df = pd.merge(
    charge_categories_df,
    compas_vignettes_df[['id', 'charge_name']],
    left_on='individual_id',
    right_on='id',
)
charge_ids = charge_categories_df['charge_id'].values
charge_names = charge_categories_df['charge_name'].values

# Step 3: charge_id -> charge_name lookup
charge_dict = dict(zip(charge_ids, charge_names))

# Step 4: reformatted column name -> set of charge_ids sharing that name
# (several ids can map to the same name, hence the set)
charge_dict_rev = {}
for charge_id, charge_name in charge_dict.items():
    charge_col_name = 'charge_' + charge_name.lower().replace(' ', '_')
    charge_dict_rev.setdefault(charge_col_name, set()).add(charge_id)

# Step 5: build one 0/1 indicator column per charge name by hand
for charge_col_name, matching_ids in charge_dict_rev.items():
    has_charge = response_with_features_df['charge_id'].isin(matching_ids)
    response_with_features_df[charge_col_name] = np.where(has_charge, 1, 0)

# The raw id is no longer needed once the indicators exist
response_with_features_df.drop(columns='charge_id', inplace=True)

In [None]:
# Write the processed table into the data directory, without the index column
output_path = f'{datadir}/compas_no_feedback_data.csv'
response_with_features_df.to_csv(output_path, index=False)