**Copyright 2020 Google LLC.**

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

In [0]:
from __future__ import division
import pandas as pd
import numpy as np
import json
import os,sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

## Overview

### Pre-processes COMPAS dataset:

Download the COMPAS dataset from:
https://github.com/propublica/compas-analysis/blob/master/compas-scores-two-years.csv
and save it in the `./group_agnostic_fairness/data/compas` folder.

Input: ./group_agnostic_fairness/data/compas/compas-scores-two-years.csv

Outputs: train.csv, test.csv, mean_std.json, vocabulary.json, IPS_exampleweights_with_label.json, IPS_exampleweights_without_label.json

In [0]:
pd.options.display.float_format = '{:,.2f}'.format
dataset_base_dir = './group_agnostic_fairness/data/compas/'
dataset_file_name = 'compas-scores-two-years.csv'

### Processing original dataset

In [0]:
file_path = os.path.join(dataset_base_dir,dataset_file_name)
with open(file_path, "r") as file_name:
  temp_df = pd.read_csv(file_name)

# Columns of interest
columns = ['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count',
                'age', 
                'c_charge_degree', 
                'c_charge_desc',
                'age_cat',
                'sex', 'race',  'is_recid']
target_variable = 'is_recid'
target_value = 'Yes'

# Drop duplicates
temp_df = temp_df[['id']+columns].drop_duplicates()
df = temp_df[columns].copy()

# Convert columns of type ``object`` to ``category`` 
df = pd.concat([
        df.select_dtypes(include=[], exclude=['object']),
        df.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex_axis(df.columns, axis=1)

# Binarize target_variable
df['is_recid'] = df.apply(lambda x: 'Yes' if x['is_recid']==1.0 else 'No', axis=1).astype('category')

# Process protected-column values
race_dict = {'African-American':'Black','Caucasian':'White'}
df['race'] = df.apply(lambda x: race_dict[x['race']] if x['race'] in race_dict.keys() else 'Other', axis=1).astype('category')

In [0]:
df.head()

Unnamed: 0,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age,c_charge_degree,c_charge_desc,age_cat,sex,race,is_recid
0,0,0,0,0,69,F,Aggravated Assault w/Firearm,Greater than 45,Male,Other,No
1,0,0,0,0,34,F,Felony Battery w/Prior Convict,25 - 45,Male,Black,Yes
2,0,0,1,4,24,F,Possession of Cocaine,Less than 25,Male,Black,Yes
3,0,1,0,1,23,F,Possession of Cannabis,Less than 25,Male,Black,No
4,0,0,0,2,43,F,arrest case no charge,25 - 45,Male,Other,No


### Shuffle and Split into Train (70%) and Test set (30%)

In [0]:
train_df, test_df = train_test_split(df, test_size=0.30, random_state=42)

output_file_path = os.path.join(dataset_base_dir,'train.csv')
with open(output_file_path, mode="w") as output_file:
    train_df.to_csv(output_file,index=False,columns=columns,header=False)
    output_file.close()

output_file_path = os.path.join(dataset_base_dir,'test.csv')
with open(output_file_path, mode="w") as output_file:
    test_df.to_csv(output_file,index=False,columns=columns,header=False)
    output_file.close()

### Computing Invese propensity weights for each subgroup, and writes to directory.

IPS_example_weights_with_label.json: json dictionary of the format
        {subgroup_id : inverse_propensity_score,...}. Used by IPS_reweighting_model approach.

In [0]:
IPS_example_weights_without_label = {
  0: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex != 'Female')])), # 00: White Male
  1: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex == 'Female')])), # 01: White Female
  2: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex != 'Female')])), # 10: Black Male
  3: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex == 'Female')]))  # 11: Black Female
}
  
output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_without_label.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(IPS_example_weights_without_label))
    output_file.close()

print(IPS_example_weights_without_label)

{0: 2.595886889460154, 1: 9.709615384615384, 2: 2.3974358974358974, 3: 10.56276150627615}


In [0]:
IPS_example_weights_with_label = {
0: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 000: Negative White Male
1: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 001: Negative White Female
2: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 010: Negative Black Male
3: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 011: Negative Black Female
4: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 100: Positive White Male
5: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 101: Positive White Female
6: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 110: Positive Black Male
7: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 111: Positive Black Female
}
  
output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_with_label.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(IPS_example_weights_with_label))
    output_file.close()

print(IPS_example_weights_with_label)

{0: 4.598360655737705, 1: 15.162162162162161, 2: 5.744027303754266, 3: 18.03214285714286, 4: 5.961038961038961, 5: 27.0, 6: 4.114914425427873, 7: 25.5}


### Construct vocabulary.json, and write to directory.

vocabulary.json: json dictionary of the format {feature_name:      [feature_vocabulary]}, containing vocabulary for categorical features.

In [0]:
cat_cols = train_df.select_dtypes(include='category').columns
vocab_dict = {}
for col in cat_cols:
  vocab_dict[col] = list(set(train_df[col].cat.categories))
  
output_file_path = os.path.join(dataset_base_dir,'vocabulary.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(vocab_dict))
    output_file.close()
print(vocab_dict)

{'c_charge_degree': ['M', 'F'], 'c_charge_desc': ['Assault', 'DUI/Property Damage/Persnl Inj', 'Poss of Firearm by Convic Felo', 'Cash Item w/Intent to Defraud', 'Trespass Struct/Conveyance', 'Agg Fleeing/Eluding High Speed', 'Poss Counterfeit Payment Inst', 'Del Morphine at/near Park', 'Poss of Vessel w/Altered ID NO', 'Opert With Susp DL 2nd Offens', 'Compulsory Attendance Violation', 'Depriv LEO of Protect/Communic', 'Carrying Concealed Firearm', 'Aggravated Assault w/Firearm', 'Possession Burglary Tools', 'Possession Of Alprazolam', 'Flee/Elude LEO-Agg Flee Unsafe', 'Aggravated Assault', 'Harm Public Servant Or Family', 'Battery', 'Money Launder 100K or More Dols', 'Tampering with a Victim', 'Unauth Poss ID Card or DL', 'Manslaughter W/Weapon/Firearm', 'Sell or Offer for Sale Counterfeit Goods', 'Fail Register Career Offender', 'Felony Petit Theft', 'Possession of Morphine', 'Burglary Unoccupied Dwelling', 'Burglary Dwelling Occupied', 'Simulation of Legal Process', 'Resist Officer

### Construct mean_std.json, and write to directory

mean_std.json: json dictionary of the format feature_name: [mean, std]},
containing mean and std for numerical features. 

In [0]:
temp_dict = train_df.describe().to_dict()
mean_std_dict = {}
for key, value in temp_dict.items():
  mean_std_dict[key] = [value['mean'],value['std']]

output_file_path = os.path.join(dataset_base_dir,'mean_std.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(mean_std_dict))
    output_file.close()
print(mean_std_dict)

{'juv_fel_count': [0.06971677559912855, 0.5014755752507987], 'juv_misd_count': [0.09368191721132897, 0.5172707491859172], 'juv_other_count': [0.10556545850663497, 0.4686557323312097], 'priors_count': [3.5094078035254506, 4.951584194970699], 'age': [34.93761140819964, 11.925808583868745]}
