In [None]:
import statsmodels.formula.api as smf
import pandas as pd

In [None]:
from google.colab import drive
from google.colab import auth
from google.auth import default
import os

# Mount Google Drive (forcing a fresh mount) and make the project folder the
# working directory, so the relative file names used by later cells resolve
# inside it.
DRIVE_MOUNT_POINT = '/content/drive/'
PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/CPSC 381-581: Machine Learning/FinalProject'

drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(PROJECT_DIR)

Mounted at /content/drive/


In [None]:
# Load dataset and clean
df = pd.read_csv('Recidivism.csv')

# Convert Recidivism into a Numeric Value (True -> 1, False -> 0)
df['Recidivism_Within_3years'] = df['Recidivism_Within_3years'].map({True: 1, False: 0})

# Convert drug results into a Bool: True when the recorded value is at least 0.1.
# The comparison is vectorized; a missing value compares as False, the same
# result an element-wise `x >= 0.1` gives for NaN.
for drug_column in ('DrugTests_THC_Positive',
                    'DrugTests_Cocaine_Positive',
                    'DrugTests_Meth_Positive'):
    df[drug_column] = df[drug_column].ge(0.1)

# Convert Dependents to Yes or No: Has_Kid is 0 when Dependents is 0 and 1 for
# anything else. Comparing the string form covers the value being read as
# either the integer 0 or the text '0'.
# NOTE(review): rows with a missing Dependents value come out as 1 here —
# confirm that this is the intended treatment.
has_kid = df['Dependents'].astype(str).ne('0').astype(int)
df['Has_Kid'] = has_kid

In [None]:
# Create training set (80%) and testing set (20%)

# Draw the training rows at random with a fixed seed so the split is repeatable.
train = df.sample(frac=0.8, random_state=1)

# Every row whose index label was not drawn for training goes to the testing set.
test = df.loc[~df.index.isin(train.index)]

In [None]:
# Creating Logistic Model

# Formula for the regression: 3-year recidivism on the left-hand side, the
# predictors on the right. Residence_PUMA is wrapped in C() so that it is
# treated as a categorical variable.
recidivism_formula = """Recidivism_Within_3years ~ Gender + Race + Prison_Offense + Education_Level + Has_Kid + Prison_Years
          + Violations_ElectronicMonitoring + Violations_Instruction + Violations_FailToReport + Violations_MoveWithoutPermission
          + Delinquency_Reports + Avg_Days_per_DrugTest + DrugTests_THC_Positive + DrugTests_Cocaine_Positive + DrugTests_Meth_Positive
          + Percent_Days_Employed + Prior_Arrest_Episodes_Felony + C(Residence_PUMA) + Condition_MH_SA + Age_at_Release
          """

# Specify the logit model on the training set, then fit it.
logit_spec = smf.logit(formula=recidivism_formula, data=train)
model = logit_spec.fit()

# Displaying feature weights and statistical significance
model.summary()




Optimization terminated successfully.
         Current function value: 0.571792
         Iterations 6


0,1,2,3
Dep. Variable:,Recidivism_Within_3years,No. Observations:,13804.0
Model:,Logit,Df Residuals:,13737.0
Method:,MLE,Df Model:,66.0
Date:,"Wed, 08 May 2024",Pseudo R-squ.:,0.1594
Time:,01:56:11,Log-Likelihood:,-7893.0
converged:,True,LL-Null:,-9389.2
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.6033,0.326,-1.853,0.064,-1.241,0.035
Gender[T.M],0.5649,0.060,9.433,0.000,0.447,0.682
Race[T.WHITE],0.1179,0.046,2.548,0.011,0.027,0.209
Prison_Offense[T.Other],0.1806,0.069,2.634,0.008,0.046,0.315
Prison_Offense[T.Property],0.2718,0.053,5.122,0.000,0.168,0.376
Prison_Offense[T.Violent/Non-Sex],0.2866,0.062,4.641,0.000,0.166,0.408
Prison_Offense[T.Violent/Sex],-0.3651,0.123,-2.976,0.003,-0.606,-0.125
Education_Level[T.High School Diploma],0.1929,0.056,3.434,0.001,0.083,0.303
Education_Level[T.Less than HS diploma],-0.0011,0.058,-0.019,0.985,-0.116,0.113


In [None]:
# Check proportion

# The outcome is coded 1 = recidivated / 0 = did not, so its mean is the share
# of the training set that recidivated. Looking up label 0 in value_counts()
# reports the share that did NOT recidivate instead.
print(f"Original Proportion of Recidivism: {round(train['Recidivism_Within_3years'].mean(), 4)}")

# Training Set Accuracy

# Classify at a 0.5 probability cut-off. Rows for which predict() yields NaN
# are dropped first: NaN >= 0.5 evaluates to False, which would otherwise be
# scored as a predicted 0.
train_predictions = model.predict(train).dropna().ge(0.5).astype(int)

# Compare against the true outcome of exactly the rows that were scored.
train_correctness = train.loc[train_predictions.index, 'Recidivism_Within_3years'] == train_predictions

# The mean of a boolean Series is the fraction of True values, i.e. the
# accuracy. value_counts()[0] on a boolean Series has no integer label to
# match, so it is not a reliable way to pick out the True share.
print(f"Train Correctness: {round(train_correctness.mean(), 4)}")


# Testing Set Accuracy

test_predictions = model.predict(test).dropna().ge(0.5).astype(int)

test_correctness = test.loc[test_predictions.index, 'Recidivism_Within_3years'] == test_predictions

print(f"Test Correctness: {round(test_correctness.mean(), 4)}")



Original Proportion of Recidivism: 0.4215
Train Correctness: 0.6101
Test Correctness: 0.6123


In [None]:
# Accuracy by race.
# Each subset is taken from the held-out `test` frame so the figures are not
# inflated by rows the model was fitted on, and each variable holds the race
# its name says.
test_white = test[test['Race'] == "WHITE"]
test_black = test[test['Race'] == "BLACK"]

# Classify at a 0.5 probability cut-off. Rows for which predict() yields NaN
# are dropped first: NaN >= 0.5 evaluates to False, which would otherwise be
# scored as a predicted 0.
predictions_white = model.predict(test_white).dropna().ge(0.5).astype(int)
correctness_white = test_white.loc[predictions_white.index, 'Recidivism_Within_3years'] == predictions_white
# The mean of a boolean Series is the fraction of True values, i.e. the accuracy.
accuracy_white = correctness_white.mean()
print(f"Correctness White: {round(accuracy_white, 4)}")

predictions_black = model.predict(test_black).dropna().ge(0.5).astype(int)
correctness_black = test_black.loc[predictions_black.index, 'Recidivism_Within_3years'] == predictions_black
accuracy_black = correctness_black.mean()
print(f"Correctness Black: {round(accuracy_black, 4)}")

# Accuracy gap, white minus black.
print(f"Difference: {round(accuracy_white - accuracy_black, 4)}")





Correctness White: 0.6013
Correctness Black: 0.623
Difference: -0.0218
