In [2]:
import pandas as pd

In [50]:
def add_jail_length(compas_df):
    """Replace the jail check-in/check-out timestamps with a stay length in days.

    The frame is modified in place and is also returned, so the call can be
    used on the right-hand side of an assignment. ``c_jail_in`` and
    ``c_jail_out`` are removed and a ``jail_stay_length`` column is added.

    Parameters
    ----------
    compas_df : pandas.DataFrame
        Must contain ``c_jail_in`` and ``c_jail_out`` (any values accepted by
        ``pd.to_datetime``) and ``c_charge_desc``, which is used as the
        grouping key when filling missing stay lengths.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    jail_out = pd.to_datetime(compas_df['c_jail_out'])
    jail_in = pd.to_datetime(compas_df['c_jail_in'])

    # Whole days between the two timestamps. A negative difference (release
    # recorded before booking) is clamped to a zero-length stay; rows with a
    # missing timestamp stay NaN here and are filled below.
    compas_df['jail_stay_length'] = (jail_out - jail_in).dt.days.clip(lower=0)
    compas_df.drop(columns=['c_jail_in', 'c_jail_out'], inplace=True)

    # Fill missing stay lengths with the median of rows sharing the same
    # charge description, then fall back to the overall mean for anything
    # still missing (e.g. a charge whose rows are all NaN).
    # The filled Series is assigned back rather than calling
    # ``fillna(inplace=True)`` on the column selection: under pandas
    # copy-on-write that form updates a temporary and leaves the NaNs in place.
    charge_median = compas_df.groupby('c_charge_desc')['jail_stay_length'].transform('median')
    stay_length = compas_df['jail_stay_length'].fillna(charge_median)
    compas_df['jail_stay_length'] = stay_length.fillna(stay_length.mean())
    return compas_df
    


def preprocess_compas_data(filepath,
                    features_to_keep,
                    categorical_features,
                    protected_attribute_names=['race', 'sex'],
                    unprivileged_classes=['African-American', 'Male']
                    ):
    """Load the raw COMPAS CSV and turn it into an all-numeric feature table.

    Steps, in order:
      1. keep only ``features_to_keep`` and drop rows with any missing value;
      2. keep only rows whose ``race`` is 'African-American' or 'Caucasian';
      3. replace ``c_jail_in`` / ``c_jail_out`` with ``jail_stay_length``
         (see ``add_jail_length``);
      4. add one 0/1 column ``<attribute>_<class>`` per protected attribute,
         equal to 1 where the row belongs to the unprivileged class;
      5. one-hot encode every column in ``categorical_features``;
      6. add a 0/1 column ``c_charge_desc_<desc>`` for every charge
         description that occurs more than once;
      7. drop ``sex``, ``race``, ``c_charge_desc`` and the original
         categorical columns.

    Parameters
    ----------
    filepath : str or path-like
        Location of the raw CSV, passed to ``pd.read_csv``.
    features_to_keep : list of str
        Columns to retain. Must include ``race``, ``sex``, ``c_jail_in``,
        ``c_jail_out`` and ``c_charge_desc`` because later steps use them.
    categorical_features : sequence of str
        Columns to one-hot encode; each is dropped after encoding.
    protected_attribute_names : sequence of str
        Columns for which an unprivileged-class indicator is created.
    unprivileged_classes : sequence of str
        Unprivileged value for each protected attribute, paired by position
        with ``protected_attribute_names``.

    Returns
    -------
    pandas.DataFrame
        The processed frame; all indicator columns hold integer 0/1 values.
    """
    raw_df = pd.read_csv(filepath)

    ### dropping irrelevant columns and incomplete rows
    compas_df = raw_df[features_to_keep].dropna()

    ### Limit data to only two races
    # Explicit copy: the frame is modified in place below, which must not
    # happen on a view/selection of another frame.
    compas_df = compas_df[compas_df['race'].isin(['African-American', 'Caucasian'])].copy()

    ### Replace c_jail_in and c_jail_out with jail_length columns
    compas_df = add_jail_length(compas_df)

    ### protect columns
    for attribute, unprivileged_class in zip(protected_attribute_names, unprivileged_classes):
        compas_df[attribute + "_" + unprivileged_class] = (compas_df[attribute] == unprivileged_class) * 1

    ### one-hot encoding of categorical features
    # dtype=int keeps these columns 0/1 like every other indicator; the
    # get_dummies default is bool on pandas >= 2, which would write
    # True/False into the output next to 0/1 columns.
    encoded_frames = [
        pd.get_dummies(compas_df[feature], prefix=feature, dtype=int)
        for feature in categorical_features
    ]

    #### limiting charge description encodings
    # Only charges seen more than once get an indicator column. The columns
    # are built together and concatenated once instead of being inserted one
    # by one, which fragments the frame when there are hundreds of charges.
    value_counts = compas_df['c_charge_desc'].value_counts()
    common_charges = value_counts[value_counts > 1].index.tolist()
    charge_indicators = pd.DataFrame(
        {
            "c_charge_desc_" + charge_desc: (compas_df['c_charge_desc'] == charge_desc) * 1
            for charge_desc in common_charges
        },
        index=compas_df.index,
    )

    compas_df = pd.concat([compas_df] + encoded_frames + [charge_indicators], axis=1)

    ### dropping original categorical columns
    compas_df = compas_df.drop(columns=['sex', 'race', 'c_charge_desc'] + list(categorical_features))

    return compas_df

# Locations of the raw input and of the processed output CSV.
src_filepath = '../data/Compas Dataset/raw_compas.csv'
dest_filepath = '../data/Compas Dataset/processed_compas.csv'

# Columns retained from the raw file; c_jail_in / c_jail_out are converted to
# jail_stay_length and the categorical columns are one-hot encoded.
features_to_keep = ['race', 'sex', 'age_cat', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 
                    'priors_count', 'c_jail_in', 'c_jail_out', 'c_charge_degree', 'c_charge_desc', 
                    'is_recid', 'is_violent_recid']
categorical_features = ['age_cat', 'c_charge_degree']

# Read from src_filepath: the previous argument `filepath` is not defined in
# this notebook, so the cell only ran when a stale kernel variable existed.
compas_df = preprocess_compas_data(src_filepath, features_to_keep, categorical_features)
compas_df.to_csv(dest_filepath, index=False)
compas_df.head()

Unnamed: 0,juv_fel_count,juv_misd_count,juv_other_count,priors_count,is_recid,is_violent_recid,jail_stay_length,race_African-American,sex_Male,age_cat_25 - 45,...,c_charge_desc_Armed Kidnapping,c_charge_desc_Exhibition Weapon School Prop,c_charge_desc_Harm Public Servant Or Family,c_charge_desc_Leave Accd/Attend Veh/Less $50,c_charge_desc_Tresspass in Structure or Conveyance,c_charge_desc_Fighting/Baiting Animals,c_charge_desc_Trespass Other Struct/Conve,c_charge_desc_Poss of Firearm/Ammun/Dom Viol,c_charge_desc_Trespass,c_charge_desc_Misuse Of 911 Or E911 System
2,0,0,0,0,1,1,10,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,4,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,14,1,0,6,0,1,1,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10,0,0,1,0,0,0,33,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Show the first 20 column labels of the processed frame.
compas_df.columns[:20]

Index(['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count',
       'is_recid', 'is_violent_recid', 'jail_stay_length',
       'race_African-American', 'sex_Male', 'age_cat_25 - 45',
       'age_cat_Greater than 45', 'age_cat_Less than 25',
       'c_charge_degree_(CO3)', 'c_charge_degree_(F1)', 'c_charge_degree_(F2)',
       'c_charge_degree_(F3)', 'c_charge_degree_(F5)', 'c_charge_degree_(F6)',
       'c_charge_degree_(F7)', 'c_charge_degree_(M1)'],
      dtype='object')