In [84]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

Assumptions made 
1. Use recidivism in favor of violent_recidivism 
2. Drop violent recidivismm statistics in prediction and training

In [85]:
import os
filename = "compas-scores.csv"
assert(filename in os.listdir())

original_columns =  ['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'num_r_cases',
       'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
       'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
       'is_violent_recid', 'num_vr_cases', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'type_of_assessment', 'decile_score.1',
       'score_text', 'screening_date']

date_cols = ["c_jail_in", "c_jail_out", "r_jail_in", "r_jail_out", "r_offense_date", "compas_screening_date"]

csv = pd.read_csv(filename, usecols=original_columns, parse_dates=date_cols)
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11757 entries, 0 to 11756
Data columns (total 46 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   id                       11757 non-null  int64         
 1   name                     11757 non-null  object        
 2   first                    11757 non-null  object        
 3   last                     11757 non-null  object        
 4   compas_screening_date    11757 non-null  datetime64[ns]
 5   sex                      11757 non-null  object        
 6   dob                      11757 non-null  object        
 7   age                      11757 non-null  int64         
 8   age_cat                  11757 non-null  object        
 9   race                     11757 non-null  object        
 10  juv_fel_count            11757 non-null  int64         
 11  decile_score             11757 non-null  int64         
 12  juv_misd_count           11757 n

In [None]:
np.random.seed(40)

csv["current_jailtime"] = (csv["c_jail_out"] - csv["c_jail_in"]).dt.total_seconds().div(86400).fillna(0) 
csv["previous_jailtime"] = (csv["r_jail_out"] - csv["r_jail_in"]).dt.total_seconds().div(86400).fillna(0) 
csv["days_r"] = (csv["r_offense_date"] - csv["compas_screening_date"]).apply(lambda  x: x.days)

filters = [ 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'c_days_from_compas',
       'c_charge_degree', 'is_recid', 
       'r_charge_degree', 'r_days_from_arrest',
       'decile_score.1', 'score_text']

# filter entries to ones that have valid predictions, and were screened 30 days between preditction, and have date of pred less than date of the crime
# https://mlr3fairness.mlr-org.com/reference/compas.html
csv = csv[
       ( csv["is_recid"] != -1) & (abs(csv["days_b_screening_arrest"] ) < 30)
       & (csv["c_charge_degree"] != "O")
       & (csv["decile_score"] != -1)
       & (csv["score_text"] != "NaN")
       & (csv["c_offense_date"] < csv["compas_screening_date"])] 

# remove all columns that are not used for training
csv = csv[filters]
csv.head()

Unnamed: 0,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,c_days_from_compas,c_charge_degree,is_recid,r_charge_degree,r_days_from_arrest,decile_score.1,score_text
0,Greater than 45,Other,0,1,0,0,0,1.0,F,0,O,,1,Low
2,25 - 45,African-American,0,3,0,0,0,1.0,F,1,F,,3,Low
3,Less than 25,African-American,0,4,0,1,4,1.0,F,1,M,0.0,4,Low
7,25 - 45,Caucasian,0,6,0,0,14,1.0,F,1,F,0.0,6,Medium
9,25 - 45,Caucasian,0,1,0,0,0,1.0,M,0,O,,1,Low


In [87]:

csv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7048 entries, 0 to 11756
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age_cat             7048 non-null   object 
 1   race                7048 non-null   object 
 2   juv_fel_count       7048 non-null   int64  
 3   decile_score        7048 non-null   int64  
 4   juv_misd_count      7048 non-null   int64  
 5   juv_other_count     7048 non-null   int64  
 6   priors_count        7048 non-null   int64  
 7   c_days_from_compas  7048 non-null   float64
 8   c_charge_degree     7048 non-null   object 
 9   is_recid            7048 non-null   int64  
 10  r_charge_degree     7048 non-null   object 
 11  r_days_from_arrest  1599 non-null   float64
 12  decile_score.1      7048 non-null   int64  
 13  score_text          7048 non-null   object 
dtypes: float64(2), int64(7), object(5)
memory usage: 825.9+ KB


In [88]:
# convert categories to numerical representations

categories = ["age_cat", "race", "c_charge_degree", "r_charge_degree", "score_text"]
csv = pd.get_dummies(csv, columns=categories)
csv.fillna(0)
csv.head()

Unnamed: 0,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,c_days_from_compas,is_recid,r_days_from_arrest,decile_score.1,age_cat_25 - 45,...,race_Native American,race_Other,c_charge_degree_F,c_charge_degree_M,r_charge_degree_F,r_charge_degree_M,r_charge_degree_O,score_text_High,score_text_Low,score_text_Medium
0,0,1,0,0,0,1.0,0,,1,False,...,False,True,True,False,False,False,True,False,True,False
2,0,3,0,0,0,1.0,1,,3,True,...,False,False,True,False,True,False,False,False,True,False
3,0,4,0,1,4,1.0,1,0.0,4,False,...,False,False,True,False,False,True,False,False,True,False
7,0,6,0,0,14,1.0,1,0.0,6,True,...,False,False,True,False,True,False,False,False,False,True
9,0,1,0,0,0,1.0,0,,1,True,...,False,False,False,True,False,False,True,False,True,False
