In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
import sys
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the COMPAS dataset straight from the course repository.
df = pd.read_csv('https://github.com/jnin/information-systems/raw/main/data/compas_ai2.csv')

# Binary target: True when the decile score is strictly above the dataset median.
# The raw score is dropped afterwards so it cannot be used as a feature.
df['Severity'] = df['DecileScore'] > df['DecileScore'].median()
df = df.drop(columns=['DecileScore'])

X = df.drop(columns=["Severity"])
y = df['Severity']

numerical_features = ['YearOfBirth']
categorical_features = ['Agency','Gender','Ethnic','ScaleSet','LegalStatus','CustodyStatus','MaritalStatus','DisplayText']

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the new keyword first and fall back to the old one on older releases.
# handle_unknown='ignore' encodes a category that was not seen during fit as all
# zeros instead of raising, which matters when a rare category is missing from a
# cross-validation training fold.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# One-hot encode the categorical columns, bin the year of birth into ordinal
# buckets, and pass any remaining column through unchanged.
transformer = ColumnTransformer(
    [
        ('categorical_transformations', one_hot_encoder, categorical_features),
        ('numerical_transformations', KBinsDiscretizer(encode='ordinal'), numerical_features),
    ],
    remainder='passthrough',
)

In [None]:
# Modelling pipeline: column-wise preprocessing, standardisation, then XGBoost.
pipeline_steps = [
    ('preprocess', transformer),
    ('scaler', StandardScaler()),
    (
        'xgboost',
        XGBClassifier(use_label_encoder=False, disable_default_eval_metric=True),
    ),
]

pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    train_size=0.9,
    random_state=42,
)


def cross_validation_compas(compas_pipe, X_train, y_train):
    """Return the mean 5-fold cross-validated ROC AUC of ``compas_pipe``.

    Parameters
    ----------
    compas_pipe : estimator
        Estimator (here a scikit-learn pipeline) to evaluate.
    X_train, y_train
        Features and target handed to ``cross_val_score``.

    Returns
    -------
    The average of the per-fold ``roc_auc`` scores.
    """
    fold_scores = cross_val_score(
        compas_pipe,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=5,
    )
    return fold_scores.mean()

In [None]:
# Baseline: cross-validated score of the untuned pipeline on the training split.
cross_validation_compas(compas_pipe=pipe, X_train=X_train, y_train=y_train)

In [None]:
def grid_search_compas(compas_pipe, parameters, X_train, X_test, y_train, y_test):
    """Tune ``compas_pipe`` with a 5-fold grid search and score it on held-out data.

    Parameters
    ----------
    compas_pipe : estimator
        Pipeline whose hyper-parameters are searched.
    parameters : dict
        Parameter grid passed to ``GridSearchCV``.
    X_train, y_train
        Data used to fit the grid search.
    X_test, y_test
        Held-out data used for the returned score.

    Returns
    -------
    tuple
        ``(test_score, search)`` where ``test_score`` is the ROC AUC of the refit
        best estimator on the held-out data and ``search`` is the fitted
        ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        compas_pipe,
        parameters,
        cv=5,
        verbose=3,
        scoring='roc_auc',
        return_train_score=True,
    )
    search.fit(X_train, y_train)

    # Score through the search object so the held-out score uses the same
    # `roc_auc` metric that selected the model. `best_estimator_.score` would
    # instead report the estimator's default metric (accuracy for a classifier),
    # which is not comparable with the cross-validation scores.
    return search.score(X_test, y_test), search

In [None]:
# Grid over the XGBoost step only: 3 tree depths x 2 ensemble sizes = 6 candidates.
parameters = {
    'xgboost__max_depth': [2, 6, 20],
    'xgboost__n_estimators': [10, 100],
}

score, grid = grid_search_compas(
    compas_pipe=pipe,
    parameters=parameters,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Hard-coded results table, one row per (max_depth, n_estimators) combination.
# NOTE(review): presumably a saved reference run to compare with the live
# grid-search results; confirm its provenance.
pd.DataFrame(
    [
        (2, 10, 0.8550474223586788, 0.8549714902093231),
        (2, 100, 0.8873265879037542, 0.8862446860845479),
        (6, 10, 0.9002253881327207, 0.8968701337253802),
        (6, 100, 0.910929370639835, 0.8968171100368609),
        (20, 10, 0.9178935505438173, 0.8908078571115927),
        (20, 100, 0.9274086768755755, 0.8829054959715282),
    ],
    columns=['max_depth', 'n_estimators', 'mean_train_score', 'mean_test_score'],
)

In [None]:
# Tidy view of the search: one column per tuned parameter (expanded from the
# `params` dicts) next to the mean train and validation scores.
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = pd.concat(
    [
        cv_results['params'].apply(pd.Series),
        cv_results[['mean_train_score', 'mean_test_score']],
    ],
    axis=1,
)
cv_results

In [None]:
# Same XGBoost grid as before, now also searching the number of bins used by the
# numerical discretiser inside the `preprocess` step (2 x 3 x 2 = 12 candidates).
parameters = {
    'preprocess__numerical_transformations__n_bins': [2, 20],
    'xgboost__max_depth': [2, 6, 20],
    'xgboost__n_estimators': [10, 100],
}

score_pre, grid_pre = grid_search_compas(
    compas_pipe=pipe,
    parameters=parameters,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

