In [280]:
from math import ceil
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

Assumptions made 
1. Use recidivism in favor of violent_recidivism 
2. Drop violent recidivismm statistics in prediction and training

In [281]:
import os
filename = "compas-scores-two-years.csv"
assert(filename in os.listdir())

columns = ["age", "c_charge_degree", "race", "age_cat", "score_text", "sex", "priors_count", "days_b_screening_arrest", "decile_score", "is_recid", "two_year_recid", "c_jail_in", "c_jail_out"]

date_cols = ["c_jail_in", "c_jail_out"]

csv = pd.read_csv(filename,usecols=columns, parse_dates=date_cols)
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   sex                      7214 non-null   object        
 1   age                      7214 non-null   int64         
 2   age_cat                  7214 non-null   object        
 3   race                     7214 non-null   object        
 4   decile_score             7214 non-null   int64         
 5   priors_count             7214 non-null   int64         
 6   days_b_screening_arrest  6907 non-null   float64       
 7   c_jail_in                6907 non-null   datetime64[ns]
 8   c_jail_out               6907 non-null   datetime64[ns]
 9   c_charge_degree          7214 non-null   object        
 10  is_recid                 7214 non-null   int64         
 11  score_text               7214 non-null   object        
 12  two_year_recid           7214 non-

In [282]:
np.random.seed(40)

# filter entries to ones that have valid predictions, and were screened 30 days between preditction, and have date of pred less than date of the crime
# https://mlr3fairness.mlr-org.com/reference/compas.html
csv = csv[
       ( csv["is_recid"] != -1) 
       & (abs(csv["days_b_screening_arrest"] ) < 30)
       & (csv["c_charge_degree"] != "O")
       & (csv["decile_score"] != -1)
       & (csv["score_text"] != "NaN") ] 

csv.reset_index(drop=True, inplace=True)
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6159 entries, 0 to 6158
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   sex                      6159 non-null   object        
 1   age                      6159 non-null   int64         
 2   age_cat                  6159 non-null   object        
 3   race                     6159 non-null   object        
 4   decile_score             6159 non-null   int64         
 5   priors_count             6159 non-null   int64         
 6   days_b_screening_arrest  6159 non-null   float64       
 7   c_jail_in                6159 non-null   datetime64[ns]
 8   c_jail_out               6159 non-null   datetime64[ns]
 9   c_charge_degree          6159 non-null   object        
 10  is_recid                 6159 non-null   int64         
 11  score_text               6159 non-null   object        
 12  two_year_recid           6159 non-

In [283]:
csv["age_cat"].value_counts()

age_cat
25 - 45            3523
Less than 25       1346
Greater than 45    1290
Name: count, dtype: int64

In [284]:
csv["race"].value_counts()

race
African-American    3172
Caucasian           2095
Hispanic             508
Other                342
Asian                 31
Native American       11
Name: count, dtype: int64

In [None]:
csv["current_jailtime_days"] = (csv["c_jail_out"] - csv["c_jail_in"]).dt.total_seconds().div(86400).fillna(0).apply(lambda x: ceil(x))
csv["race"] =  csv["race"].apply(lambda x: 1 if (x == "African-American") else 0)

# ASSUMPTION: score_text scales linear for danger
csv["score_text"] = csv["score_text"].apply(lambda x: 0 if x == "Low" else 1 if x== "Medium" else 2 )

# convert categories to numerical representations
csv.fillna(0)
csv.head()

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_charge_degree,is_recid,score_text,two_year_recid,current_jailtime_days
0,Male,69,Greater than 45,0,1,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,F,0,0,0,1
1,Male,34,25 - 45,1,3,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,F,1,0,1,11
2,Male,24,Less than 25,1,4,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,F,1,0,1,2
3,Male,44,25 - 45,0,1,0,0.0,2013-11-30 04:50:18,2013-12-01 12:28:56,M,0,0,0,2
4,Male,41,25 - 45,0,6,14,-1.0,2014-02-18 05:08:24,2014-02-24 12:18:30,F,1,1,1,7


In [286]:
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6159 entries, 0 to 6158
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   sex                      6159 non-null   object        
 1   age                      6159 non-null   int64         
 2   age_cat                  6159 non-null   object        
 3   race                     6159 non-null   int64         
 4   decile_score             6159 non-null   int64         
 5   priors_count             6159 non-null   int64         
 6   days_b_screening_arrest  6159 non-null   float64       
 7   c_jail_in                6159 non-null   datetime64[ns]
 8   c_jail_out               6159 non-null   datetime64[ns]
 9   c_charge_degree          6159 non-null   object        
 10  is_recid                 6159 non-null   int64         
 11  score_text               6159 non-null   int64         
 12  two_year_recid           6159 non-

In [287]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

target = 'race'

# Define the categorical columns
categories = ["age_cat", "sex", "c_charge_degree", "r_charge_degree"]

# Create dummy variables for categorical columns
crime_dummies = pd.get_dummies(csv['c_charge_degree'], prefix='crime', drop_first=True)
age_dummies = pd.get_dummies(csv['age_cat'], prefix='age')
race_dummies = pd.get_dummies(csv['race'], prefix='race')
gender_dummies = pd.get_dummies(csv['sex'], prefix='gender', drop_first=True)
score_dummies = pd.get_dummies(csv['score_text'] != 'Low', prefix='score', drop_first=True)

# Concatenate dummy variables with the original DataFrame
csv = pd.concat([csv, crime_dummies, age_dummies, race_dummies, gender_dummies, score_dummies], axis=1)

# Drop the original categorical columns
csv.drop(['c_charge_degree', 'age_cat', 'race', 'sex', 'score_text'], axis=1, inplace=True)

# Handle missing values
csv.fillna(0, inplace=True)

# Define the target variable and features
target = 'is_recid'  # Assuming 'is_recid' is the target variable
X = csv.drop(target, axis=1)
y = csv[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display the first few rows of the processed data
pd.DataFrame(X_train).head()

DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>)