In [215]:
from math import ceil
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

Assumptions made 
1. Use recidivism in favor of violent_recidivism 
2. Drop violent recidivismm statistics in prediction and training

In [216]:
import os
filename = "compas-scores.csv"
assert(filename in os.listdir())

original_columns =  ['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'num_r_cases',
       'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
       'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
       'is_violent_recid', 'num_vr_cases', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'type_of_assessment', 'decile_score.1',
       'score_text', 'screening_date']

date_cols = ["c_jail_in", "c_jail_out", "r_jail_in", "r_jail_out", "r_offense_date", "compas_screening_date"]

csv = pd.read_csv(filename, usecols=original_columns, parse_dates=date_cols)
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11757 entries, 0 to 11756
Data columns (total 46 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   id                       11757 non-null  int64         
 1   name                     11757 non-null  object        
 2   first                    11757 non-null  object        
 3   last                     11757 non-null  object        
 4   compas_screening_date    11757 non-null  datetime64[ns]
 5   sex                      11757 non-null  object        
 6   dob                      11757 non-null  object        
 7   age                      11757 non-null  int64         
 8   age_cat                  11757 non-null  object        
 9   race                     11757 non-null  object        
 10  juv_fel_count            11757 non-null  int64         
 11  decile_score             11757 non-null  int64         
 12  juv_misd_count           11757 n

In [217]:
np.random.seed(40)

csv["current_jailtime"] = (csv["c_jail_out"] - csv["c_jail_in"]).dt.total_seconds().div(86400).fillna(0).apply(lambda x: ceil(x))
csv["previous_jailtime"] = (csv["r_jail_out"] - csv["r_jail_in"]).dt.total_seconds().div(86400).fillna(0) .apply(lambda x: ceil(x))
csv["days_r"] = (csv["r_offense_date"] - csv["compas_screening_date"]).apply(lambda  x: x.days)

csv["race"] =  csv["race"].apply(lambda x: 1 if (x == "African-American") else 0)

filters = [ 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'current_jailtime', 'previous_jailtime', 
       'c_charge_degree', 'is_recid', 
       'r_charge_degree', 'decile_score.1', 'score_text']

# filter entries to ones that have valid predictions, and were screened 30 days between preditction, and have date of pred less than date of the crime
# https://mlr3fairness.mlr-org.com/reference/compas.html
csv = csv[
       ( csv["is_recid"] != -1) & (abs(csv["days_b_screening_arrest"] ) < 30)
       & (csv["c_charge_degree"] != "O")
       & (csv["decile_score"] != -1)
       & (csv["score_text"] != "NaN")
       & (csv["c_offense_date"] < csv["compas_screening_date"])] 

# remove all columns that are not used for training
csv = csv[filters]
csv.reset_index(drop=True, inplace=True)
csv.head()

Unnamed: 0,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,current_jailtime,previous_jailtime,c_charge_degree,is_recid,r_charge_degree,decile_score.1,score_text
0,Greater than 45,0,0,1,0,0,0,1,0,F,0,O,1,Low
1,25 - 45,1,0,3,0,0,0,11,0,F,1,F,3,Low
2,Less than 25,1,0,4,0,1,4,2,0,F,1,M,4,Low
3,25 - 45,0,0,6,0,0,14,7,18,F,1,F,6,Medium
4,25 - 45,0,0,1,0,0,0,3,0,M,0,O,1,Low


In [218]:

csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7048 entries, 0 to 7047
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age_cat            7048 non-null   object
 1   race               7048 non-null   int64 
 2   juv_fel_count      7048 non-null   int64 
 3   decile_score       7048 non-null   int64 
 4   juv_misd_count     7048 non-null   int64 
 5   juv_other_count    7048 non-null   int64 
 6   priors_count       7048 non-null   int64 
 7   current_jailtime   7048 non-null   int64 
 8   previous_jailtime  7048 non-null   int64 
 9   c_charge_degree    7048 non-null   object
 10  is_recid           7048 non-null   int64 
 11  r_charge_degree    7048 non-null   object
 12  decile_score.1     7048 non-null   int64 
 13  score_text         7048 non-null   object
dtypes: int64(10), object(4)
memory usage: 771.0+ KB


In [None]:
# ASSUMPTION: score_text scales linears for danger
csv["score_text"] = csv["score_text"].apply(lambda x: 0 if x == "Low" else 1 if x== "Medium" else 2 )

# convert categories to numerical representations
csv.fillna(0)
csv.head()

Unnamed: 0,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,current_jailtime,previous_jailtime,c_charge_degree,is_recid,r_charge_degree,decile_score.1,score_text
0,Greater than 45,0,0,1,0,0,0,1,0,F,0,O,1,0
1,25 - 45,1,0,3,0,0,0,11,0,F,1,F,3,0
2,Less than 25,1,0,4,0,1,4,2,0,F,1,M,4,0
3,25 - 45,0,0,6,0,0,14,7,18,F,1,F,6,1
4,25 - 45,0,0,1,0,0,0,3,0,M,0,O,1,0


In [220]:
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7048 entries, 0 to 7047
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age_cat            7048 non-null   object
 1   race               7048 non-null   int64 
 2   juv_fel_count      7048 non-null   int64 
 3   decile_score       7048 non-null   int64 
 4   juv_misd_count     7048 non-null   int64 
 5   juv_other_count    7048 non-null   int64 
 6   priors_count       7048 non-null   int64 
 7   current_jailtime   7048 non-null   int64 
 8   previous_jailtime  7048 non-null   int64 
 9   c_charge_degree    7048 non-null   object
 10  is_recid           7048 non-null   int64 
 11  r_charge_degree    7048 non-null   object
 12  decile_score.1     7048 non-null   int64 
 13  score_text         7048 non-null   int64 
dtypes: int64(11), object(3)
memory usage: 771.0+ KB


In [221]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

categories = ["age_cat", "c_charge_degree", "r_charge_degree"]
csv = pd.get_dummies(csv, columns=categories, drop_first=True)

scaler = StandardScaler()

standardized_df = pd.DataFrame(scaler.fit_transform(csv[csv.columns]))

standardized_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-0.978383,-0.128289,-1.140949,-0.176331,-0.208100,-0.627030,-0.278976,-0.205807,-0.718576,-1.140949,-0.768686,1.894345,-0.531840,-0.775066,-0.551779,0.718122
1,1.022094,-0.128289,-0.436088,-0.176331,-0.208100,-0.627030,-0.133310,-0.205807,1.391642,-0.436088,-0.768686,-0.527887,-0.531840,-0.775066,-0.551779,-1.392521
2,1.022094,-0.128289,-0.083657,-0.176331,1.923715,0.261988,-0.264410,-0.205807,1.391642,-0.083657,-0.768686,-0.527887,1.880264,-0.775066,1.812319,-1.392521
3,-0.978383,-0.128289,0.621204,-0.176331,-0.208100,2.484531,-0.191576,0.187420,1.391642,0.621204,0.535213,-0.527887,-0.531840,-0.775066,-0.551779,-1.392521
4,-0.978383,-0.128289,-1.140949,-0.176331,-0.208100,-0.627030,-0.249843,-0.205807,-0.718576,-1.140949,-0.768686,-0.527887,-0.531840,1.290213,-0.551779,0.718122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7043,-0.978383,-0.128289,-1.140949,-0.176331,-0.208100,-0.627030,-0.278976,-0.205807,-0.718576,-1.140949,-0.768686,-0.527887,-0.531840,1.290213,-0.551779,0.718122
7044,1.022094,-0.128289,-1.140949,-0.176331,-0.208100,0.484242,-0.278976,0.668031,1.391642,-1.140949,-0.768686,1.894345,-0.531840,1.290213,1.812319,-1.392521
7045,-0.978383,-0.128289,-0.436088,-0.176331,-0.208100,-0.404775,-0.264410,-0.205807,-0.718576,-0.436088,-0.768686,1.894345,-0.531840,-0.775066,-0.551779,0.718122
7046,-0.978383,-0.128289,0.973634,6.952425,10.450974,0.039733,-0.264410,-0.205807,1.391642,0.973634,0.535213,-0.527887,1.880264,1.290213,1.812319,-1.392521
