In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import f1_score

In [None]:
# Read the COMPAS CSV from its remote location and preview the first rows.
DATA_URL = "https://github.com/jnin/information-systems/raw/main/data/compas_ai2.csv"
df = pd.read_csv(DATA_URL)
df.head()

In [None]:
def DecileScore(x, threshold=None):
    """Tell whether a decile score lies strictly above a threshold.

    Parameters
    ----------
    x : number
        The decile score to classify.
    threshold : number, optional
        Cut-off to compare against. When omitted, the median of the
        module-level ``df["DecileScore"]`` column is used, so ``df`` must
        still contain that column at call time.

    Returns
    -------
    bool
        ``True`` if ``x`` is strictly greater than the threshold, else ``False``.
    """
    if threshold is None:
        # Fallback keeps the original behaviour; callers applying this to many
        # rows should pass `threshold` to avoid recomputing the median per call.
        threshold = np.median(df["DecileScore"])
    return bool(x > threshold)

# Binary target: True where the decile score is strictly above the column median.
# The median is computed once and compared vectorised, instead of being
# recomputed for every row through a row-wise apply.
df["Severity"] = df["DecileScore"] > np.median(df["DecileScore"])

# The raw score is removed so it cannot be used as a predictor of its own label.
# NOTE: `df` is rebound here, so re-running this cell without reloading the data
# fails because the DecileScore column is already gone.
df = df.drop(columns='DecileScore')

In [None]:
# Preview the frame again now that DecileScore has been replaced by Severity.
df.head(n=5)

In [None]:
# Separate the binary target from the predictor columns.
y = df['Severity']
X = df.drop(columns=['Severity'])

# Hold out 10% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Column groups by type.
categorical_features = [
    "Agency",
    "Gender",
    "Ethnic",
    "ScaleSet",
    "LegalStatus",
    "CustodyStatus",
    "MaritalStatus",
    "DisplayText",
]
numerical_features = ["YearOfBirth", "RecSupervisionLevel"]

In [None]:
# One-hot encode the categorical columns into a dense array.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the current keyword first and fall back to the legacy one on older installs.
# handle_unknown="ignore" encodes a category that was not seen during fit as all
# zeros instead of raising when a CV fold or the test split contains it.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")

preprocessing_steps = [('categorical_encoding', ohe, categorical_features)]

# Columns not listed in categorical_features are passed through unchanged.
transformer = ColumnTransformer(preprocessing_steps, remainder='passthrough')

In [None]:
# Model pipeline: encode the categoricals, standardise every resulting column,
# then fit a logistic-regression classifier.
# NOTE: the final step is keyed 'RF' although the estimator is a LogisticRegression.
pipeline_steps = [
    ('preprocess', transformer),
    ('scaler', StandardScaler()),
    ('RF', LogisticRegression()),
]
pipe = Pipeline(steps=pipeline_steps)

pipe.fit(X_train, y_train)

In [None]:
def cross_validation_compas(compas_pipe, X_train, y_train, cv=10, scoring='roc_auc'):
    """Return the mean cross-validated score of ``compas_pipe`` on the training data.

    Parameters
    ----------
    compas_pipe : estimator
        Estimator or pipeline handed to ``cross_val_score``.
    X_train, y_train
        Training features and target used for cross-validation.
    cv : optional
        Forwarded to ``cross_val_score``; defaults to 10, the value that was
        previously hard-coded.
    scoring : optional
        Forwarded to ``cross_val_score``; defaults to ``'roc_auc'``, the value
        that was previously hard-coded.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    fold_scores = cross_val_score(compas_pipe, X_train, y_train, cv=cv, scoring=scoring)
    return float(fold_scores.mean())

# Mean cross-validated score of the pipeline on the training split.
cross_validation_compas(compas_pipe=pipe, X_train=X_train, y_train=y_train)

In [None]:
def compute_cm(compas_pipe, X_train, y_train, X_test, y_test):
    """Fit the pipeline on the training split and return the test confusion matrix.

    ``compas_pipe`` is fitted in place, so the object passed in is trained on
    ``X_train``/``y_train`` as a side effect of this call.

    Returns the value of ``confusion_matrix(y_test, predictions)``, where
    ``predictions`` are the pipeline's predictions for ``X_test``.
    """
    compas_pipe.fit(X_train, y_train)
    predictions = compas_pipe.predict(X_test)
    return confusion_matrix(y_test, predictions)

In [None]:
# Confusion matrix of the pipeline on the held-out split (refits `pipe` on the training data).
compute_cm(
    compas_pipe=pipe,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Predictions for the held-out split; y_pred feeds the metrics printed afterwards.
y_pred = pipe.predict(X_test)
matrix_array = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Print accuracy, recall, precision and F1 for the held-out predictions, one per line.
for metric in (accuracy_score, recall_score, precision_score, f1_score):
    print(metric(y_test, y_pred))