In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

Assumptions made 
1. Use recidivism in favor of violent_recidivism 

In [69]:
import os
filename = "compas-scores.csv"
assert(filename in os.listdir())

original_columns =  ['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'num_r_cases',
       'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
       'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
       'is_violent_recid', 'num_vr_cases', 'vr_case_number',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'type_of_assessment', 'decile_score.1',
       'score_text', 'screening_date']

date_cols = ["c_jail_in", "c_jail_out", "r_jail_in", "r_jail_out", "r_offense_date",  "v_screening_date", "screening_date" ]

columns = [ 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 
       'r_charge_degree', 'r_days_from_arrest',
       'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
       'is_violent_recid', 'num_vr_cases',
       'vr_charge_degree', 'vr_charge_desc',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'type_of_assessment', 'decile_score.1',
       'score_text', 'screening_date']

csv = pd.read_csv(filename, usecols=columns, parse_dates=date_cols)
csv.head()

Unnamed: 0,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,...,vr_charge_degree,vr_charge_desc,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,type_of_assessment,decile_score.1,score_text,screening_date
0,Greater than 45,Other,0,1,0,0,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,...,,,Risk of Violence,1,Low,2013-08-14,Risk of Recidivism,1,Low,2013-08-14
1,25 - 45,Caucasian,0,5,0,0,0,,NaT,NaT,...,,,Risk of Violence,2,Low,2014-12-31,Risk of Recidivism,5,Medium,2014-12-31
2,25 - 45,African-American,0,3,0,0,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,...,(F3),Felony Battery (Dom Strang),Risk of Violence,1,Low,2013-01-27,Risk of Recidivism,3,Low,2013-01-27
3,Less than 25,African-American,0,4,0,1,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,...,,,Risk of Violence,3,Low,2013-04-14,Risk of Recidivism,4,Low,2013-04-14
4,Less than 25,African-American,0,8,1,0,1,,NaT,NaT,...,,,Risk of Violence,6,Medium,2013-01-13,Risk of Recidivism,8,High,2013-01-13


In [63]:
np.random.seed(40)

csv["current_jailtime"] = (csv["c_jail_out"] - csv["c_jail_in"]).dt.total_seconds().div(84000).fillna(0) 
csv["previous_jailtime"] = (csv["r_jail_out"] - csv["r_jail_in"]).dt.total_seconds().div(84000).fillna(0) 
csv.drop(labels=["r_jail_in", "r_jail_out", "c_jail_in", "c_jail_out"], axis=1)

Unnamed: 0,id,compas_screening_date,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,...,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,type_of_assessment,decile_score.1,score_text,screening_date,current_jailtime,previous_jailtime
0,1,2013-08-14,Male,Greater than 45,Other,0,1,0,0,0,...,Risk of Violence,1,Low,2013-08-14,Risk of Recidivism,1,Low,2013-08-14,1.012595,0.0
1,2,2014-12-31,Male,25 - 45,Caucasian,0,5,0,0,0,...,Risk of Violence,2,Low,2014-12-31,Risk of Recidivism,5,Medium,2014-12-31,0.000000,0.0
2,3,2013-01-27,Male,25 - 45,African-American,0,3,0,0,0,...,Risk of Violence,1,Low,2013-01-27,Risk of Recidivism,3,Low,2013-01-27,10.365310,0.0
3,4,2013-04-14,Male,Less than 25,African-American,0,4,0,1,4,...,Risk of Violence,3,Low,2013-04-14,Risk of Recidivism,4,Low,2013-04-14,1.116786,0.0
4,5,2013-01-13,Male,Less than 25,African-American,0,8,1,0,1,...,Risk of Violence,6,Medium,2013-01-13,Risk of Recidivism,8,High,2013-01-13,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11752,11753,2013-09-22,Male,Greater than 45,Other,0,3,0,0,1,...,Risk of Violence,1,Low,2013-09-22,Risk of Recidivism,3,Low,2013-09-22,1.207298,0.0
11753,11754,2013-05-17,Male,Less than 25,Caucasian,0,7,3,5,3,...,Risk of Violence,5,Medium,2013-05-17,Risk of Recidivism,7,Medium,2013-05-17,2.014940,0.0
11754,11755,2014-10-08,Male,25 - 45,Other,0,4,0,0,0,...,Risk of Violence,3,Low,2014-10-08,Risk of Recidivism,4,Low,2014-10-08,0.000000,0.0
11755,11756,2013-12-03,Male,25 - 45,Caucasian,0,3,0,0,2,...,Risk of Violence,2,Low,2013-12-03,Risk of Recidivism,3,Low,2013-12-03,1.279548,0.0
