In [None]:
# %%
import os

import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Notebook display settings: never truncate columns, print at most 100 rows
for option_name, option_value in {
    "display.max_columns": None,
    "display.max_rows": 100,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# %%
# Read the COMPAS two-year CSV (path is relative to the notebook's working directory)
compas_csv_path = "data/compas-scores-two-years.csv"
df = pd.read_csv(compas_csv_path)


In [None]:
# %%
# Data Cleaning

# Keep one row per 'id' (drop_duplicates retains the first occurrence by default)
df = df.drop_duplicates(subset=["id"])

# Standardize date formats: parse each date-like column; values that cannot be
# parsed become NaT because of errors="coerce"
date_cols = [
    "compas_screening_date",
    "dob",
    "c_offense_date",
    "c_arrest_date",
    "r_offense_date",
    "vr_offense_date",
    "screening_date",
    "v_screening_date",
    "c_jail_in",
    "c_jail_out",
    "r_jail_in",
    "r_jail_out",
]
df = df.assign(
    **{col: pd.to_datetime(df[col], errors="coerce") for col in date_cols}
)

# Correct inconsistent entries
# Label columns: first character upper-cased, the rest lower-cased. Note that
# str.capitalize() also lower-cases interior capitals of multi-word labels.
for col in ("sex", "race", "score_text", "v_score_text"):
    df[col] = df[col].str.capitalize()
# Charge-degree codes: fully upper-cased
for col in ("c_charge_degree", "r_charge_degree", "vr_charge_degree"):
    df[col] = df[col].str.upper()

# Remove irrelevant columns: personal names, case numbers and the ".1" columns
irrelevant_cols = [
    "name",
    "first",
    "last",
    "c_case_number",
    "r_case_number",
    "vr_case_number",
    "decile_score.1",
    "priors_count.1",
]
df = df.drop(irrelevant_cols, axis="columns")


In [None]:
# %%
# Handling Missing Values


def impute_missing(frame, columns):
    """Fill missing values in ``columns`` of ``frame`` in place; return ``frame``.

    Numeric and datetime columns are filled with their median; every other
    column is filled with its most frequent value. Columns that are absent
    from ``frame``, already complete, or entirely empty are skipped instead
    of raising, so the cell can be re-run safely.
    """
    for col in columns:
        if col not in frame.columns:
            continue
        series = frame[col]
        if not series.isna().any():
            continue
        if pd.api.types.is_numeric_dtype(
            series
        ) or pd.api.types.is_datetime64_any_dtype(series):
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # Nothing observed at all, so there is no mode to fill with
                continue
            fill_value = modes.iloc[0]
        frame[col] = series.fillna(fill_value)
    return frame


# Columns with no missing data (listed for reference only; nothing is imputed)
complete_cols = [
    "id",
    "sex",
    "dob",
    "age",
    "age_cat",
    "race",
    "juv_fel_count",
    "decile_score",
    "juv_misd_count",
    "juv_other_count",
    "priors_count",
    "c_charge_degree",
    "is_recid",
    "is_violent_recid",
    "type_of_assessment",
    "score_text",
    "screening_date",
    "v_type_of_assessment",
    "v_decile_score",
    "v_score_text",
    "v_screening_date",
    "start",
    "end",
    "event",
    "two_year_recid",
]

# The case-number columns (c_/r_/vr_case_number) are intentionally absent from
# the lists below: the data-cleaning step has already dropped them, so
# referencing them here raised KeyError.

# Columns with minimal missing data (<5%)
minimal_missing_cols = [
    "days_b_screening_arrest",
    "c_jail_in",
    "c_jail_out",
    "c_days_from_compas",
    "c_charge_desc",
    "in_custody",
    "out_custody",
]

# Columns with moderate missing data (5%-50%)
# NOTE(review): median/mode-filling the r_* columns gives every row a
# re-offense date and charge, including rows with none recorded -- confirm
# this is intended before using these columns as features.
moderate_missing_cols = [
    "c_offense_date",
    "r_charge_degree",
    "r_days_from_arrest",
    "r_offense_date",
    "r_charge_desc",
    "r_jail_in",
    "r_jail_out",
]

df = impute_missing(df, minimal_missing_cols + moderate_missing_cols)

# Columns with high missing data (>50%)
# c_arrest_date also falls in this group but is deliberately kept: the feature
# engineering step derives days_from_screening_to_arrest from it.
high_missing_cols = [
    "violent_recid",
    "vr_charge_degree",
    "vr_offense_date",
    "vr_charge_desc",
]
# errors="ignore" keeps the cell idempotent when it is re-run
df = df.drop(columns=high_missing_cols, errors="ignore")


In [None]:
# %%
# Feature Selection and Engineering

# Selection of relevant variables
# NOTE(review): this list is not consumed anywhere in the visible notebook.
selected_features = [
    "age",
    "age_cat",
    "sex",
    "race",
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
    "priors_count",
    "c_charge_degree",
    "c_charge_desc",
    "decile_score",
    "score_text",
    "v_decile_score",
    "v_score_text",
    "is_recid",
    "two_year_recid",
    "is_violent_recid",
    "in_custody",
    "out_custody",
    "compas_screening_date",
    "c_offense_date",
    "screening_date",
    "v_screening_date",
]

# Feature Engineering
# Whole 365-day years between date of birth and the COMPAS screening
df["age_at_screening"] = (df["compas_screening_date"] - df["dob"]).dt.days // 365
# Days between jail entry and release for the current charge
df["time_in_jail"] = (df["c_jail_out"] - df["c_jail_in"]).dt.days
# c_arrest_date is one of the sparsely populated columns and may already have
# been dropped upstream; only derive the feature when the column is available
# rather than failing with KeyError.
has_arrest_date = "c_arrest_date" in df.columns
if has_arrest_date:
    df["days_from_screening_to_arrest"] = (
        df["compas_screening_date"] - df["c_arrest_date"]
    ).dt.days
else:
    print("c_arrest_date not available; skipping days_from_screening_to_arrest")
df["recidivism_within_two_years"] = df["two_year_recid"]

# Normalization and Scaling
num_cols = [
    "age",
    "priors_count",
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
    "age_at_screening",
    "time_in_jail",
]
if has_arrest_date:
    num_cols.append("days_from_screening_to_arrest")

# Standardize the numeric columns in place (zero mean, unit variance).
# StandardScaler ignores NaN when fitting and leaves it untouched on transform.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Handling Categorical Variables
cat_cols = ["sex", "race", "c_charge_degree", "score_text", "v_score_text"]
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Using the default (sparse) output and densifying it works on
# every version that provides get_feature_names_out.
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[cat_cols]).toarray()
encoded_feature_names = encoder.get_feature_names_out(cat_cols)
df_encoded = pd.DataFrame(
    encoded_features, columns=encoded_feature_names, index=df.index
)

# Combine encoded categorical features with numerical features and the target
df_final = pd.concat([df[num_cols], df_encoded, df["two_year_recid"]], axis=1)


In [None]:
# %%
# Correlation Analysis
# Pairwise correlations between every column of the modelling frame
corr_matrix = df_final.corr()

# Heatmap of the full matrix on a fixed [-1, 1] colour scale centred at 0
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 16))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Matrix")
heatmap_fig.tight_layout()
plt.show()

# Focus on correlations with the target variable, strongest positive first
target_correlations = corr_matrix["two_year_recid"].sort_values(ascending=False)
print("Top correlations with two_year_recid:")
print(target_correlations)

# Bar chart of the same ranking, one bar per feature
bar_fig, bar_ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=target_correlations.index, y=target_correlations.values, ax=bar_ax)
bar_ax.set_title("Feature Correlations with two_year_recid")
plt.setp(bar_ax.get_xticklabels(), rotation=90)
bar_fig.tight_layout()
plt.show()


In [None]:
# %%
# Check performance of COMPAS model

# No earlier cell creates a binary "compas_is_recid" column, so reading it
# raised KeyError. Derive the prediction from the decile score instead.
# NOTE(review): the cut-off of 5 is assumed to match the Low (1-4) versus
# Medium/High (5-10) split used in ProPublica's analysis -- adjust if a
# different threshold is intended.
COMPAS_HIGH_RISK_MIN_DECILE = 5
if "compas_is_recid" not in df.columns:
    df["compas_is_recid"] = (
        df["decile_score"] >= COMPAS_HIGH_RISK_MIN_DECILE
    ).astype(int)

print(classification_report(y_true=df["two_year_recid"], y_pred=df["compas_is_recid"]))
