In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from catboost import Pool, CatBoostClassifier, cv
# Show every column when a DataFrame is displayed instead of truncating wide frames.
# (The bare `pd.options.display.max_columns` expression that used to precede this
# line only read the option and discarded it, so it has been removed.)
pd.set_option('display.max_columns', None)

# Data Prep

In [None]:
# Load the competition data straight from the public GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/mkleinbort/Kaggle-COMPAS/main'

X_train = pd.read_csv(f'{DATA_URL}/train/X_train.csv', index_col='id')
# `read_csv(..., squeeze=True)` was removed in pandas 2.0. Squeezing the loaded
# frame afterwards is the documented replacement and also works on pandas 1.x.
y_train = pd.read_csv(f'{DATA_URL}/train/y_train.csv').squeeze('columns')
# NOTE(review): unlike X_train, the test features are read without index_col='id' -- confirm this is intended.
X_test = pd.read_csv(f'{DATA_URL}/test/X_test.csv')
# The test labels are unknown: one NaN placeholder per test row.
y_test = np.full(len(X_test), np.nan)

In [None]:
# Stack train and test rows so that feature engineering is applied to both at once,
# and discard columns that carry no extra information.
redundant_cols = [
    'v_screening_date',      # duplicate of 'screening_date'
    'v_type_of_assessment',  # duplicate of 'type_of_assessment'
    'type_of_assessment',    # 0 variance
]
df = pd.concat([X_train, X_test]).drop(columns=redundant_cols)

# Targets are attached by row position (indices are ignored): the training labels
# first, followed by the NaN placeholders for the test rows.
df['target'] = [*y_train.to_list(), *y_test]

In [None]:
# Engineer numeric versions of the target.
#
# `.map` is used instead of `.replace`: replacing strings with ints on an object
# column relies on pandas' silent downcasting, which is deprecated (FutureWarning
# in pandas 2.2) and otherwise leaves an object-dtype column. `.map` yields a
# numeric column on every pandas version; rows without a label (NaN) stay NaN.
# NOTE: any label that is not a key of the mapping also becomes NaN.
TARGET_TO_INT = {'No-Recidivism': 0, 'Non-Violent': 1, 'Violent': 2}   # so we can do regression
TARGET_TO_BOOL = {'No-Recidivism': 0, 'Non-Violent': 1, 'Violent': 1}  # 0 - no-rec, 1 - rec

df['target_int'] = df['target'].map(TARGET_TO_INT)
df['target_bool'] = df['target'].map(TARGET_TO_BOOL)
target_cols = ['target', 'target_int', 'target_bool']

In [None]:
# Engineer new date based features

"""
Custody: when someone is kept in prison until they go to court.
Screening: involves using a brief instrument to quickly capture basic information
    about a person's risk to reoffend and is used to determine if a more comprehensive assessment
    is warranted. In a jail setting, everyone, regardless of legal status, should be screened
    at booking. Risk screening divides the jail population into high-, medium-, and low-risk
    categories, making it possible to direct intervention resources first to the highest-risk individuals.



- c_arrest_date and c_offense_date are mutually exclusive and one of them is 99.8% likely to be filled
- c_arrest_date and c_??? are mutually exclusive (TODO: the second column name was truncated in this note -- restore it)
- start is number of days between c_jail_in and c_jail_out, maybe worth engineering total hours/minutes between
- screening_date is always filled
"""



# NOTE(review): `days_cols` is never read in the visible part of the notebook -- confirm it is still needed.
days_cols = []

def days_between(d1, d2):
    """Return the absolute number of whole days between two date strings.

    Each argument is parsed as ``%Y-%m-%d`` and, if that fails, as
    ``%Y-%m-%d %H:%M:%S``.

    The absolute value is taken of the time difference itself, so the result
    does not depend on argument order. Taking ``abs`` of ``.days`` instead
    floors a negative difference first and is one day too large whenever a
    time-of-day is involved.

    Returns:
        int number of whole days, or None when either argument is not a
        string (missing values such as None/NaN make ``strptime`` raise
        TypeError).

    Raises:
        ValueError: if a string matches neither supported format.
    """
    def _parse(value):
        # Try the date-only format first, then fall back to date + time.
        try:
            return datetime.strptime(value, "%Y-%m-%d")
        except ValueError:
            return datetime.strptime(value, "%Y-%m-%d %H:%M:%S")

    try:
        start = _parse(d1)
        end = _parse(d2)
        return abs(end - start).days
    except TypeError:
        return None

# Reference date of the charge: the arrest date, falling back to the offense date,
# then to the custody start and finally to the screening date.
df['c_bad_date'] = (
    df['c_arrest_date']
    .fillna(df['c_offense_date'])
    .fillna(df['in_custody'])
    .fillna(df['screening_date'])
)

# new column -> (first, second) date columns handed to days_between
day_span_specs = {
    # how many days in custody
    'custody_days': ('out_custody', 'in_custody'),
    # how quickly after going into custody did the person get screened (might indicate something?)
    'in_custody_to_screening_days': ('in_custody', 'screening_date'),
    # questionable col because it is age related... Also corr=1 with age
    'birth_to_bad_days': ('date_of_birth', 'c_bad_date'),
    # how many days passed until recidivism occurred. Golden feature!
    'days_until_recitivism': ('c_jail_out', 'r_jail_in'),
}
for span_col, (first_col, second_col) in day_span_specs.items():
    df[span_col] = df.apply(lambda row: days_between(row[first_col], row[second_col]), axis=1)

# Date columns present in the raw data ...
raw_date_cols = [
    'c_arrest_date', 'c_offense_date', 'screening_date',
    'in_custody', 'out_custody', 'date_of_birth',
    'c_jail_in', 'c_jail_out', 'r_jail_in', 'r_jail_out',
]
# ... plus the engineered one.
date_cols = raw_date_cols + ['c_bad_date']

# Columns holding a number of days: one raw column followed by the engineered spans.
duration_cols = ['days_b_screening_arrest'] + [
    'custody_days',
    'in_custody_to_screening_days',
    'birth_to_bad_days',
    'days_until_recitivism',
]

df.head()

In [None]:
# Row-wise total of the four prior-count columns.
prior_count_cols = ['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
df['all_priors'] = df[prior_count_cols].sum(axis=1)

In [None]:
# Has recidivism occurred? Flag rows where a recidivism jail-in / jail-out date is recorded.
for jail_col in ('r_jail_in', 'r_jail_out'):
    df[f'has_{jail_col}'] = df[jail_col].notna()

In [None]:
# Give missing charge descriptions an explicit 'missing' label.
df = df.fillna({'c_charge_desc': 'missing'})

In [None]:
df.head()

In [None]:
##########################################
### Select features to use in modeling ###
##########################################

# Features fed to the model. The trailing notes record the observed effect of each
# feature on ROC-AUC and on the fairness metric.
cols = [
    'days_b_screening_arrest',       # + AUC, - fairness
    'start',                         # + AUC, + fairness
    'custody_days',                  # + AUC, + fairness
    'in_custody_to_screening_days',  # + AUC, + fairness
    'days_until_recitivism',         # golden feature!
    'all_priors',                    # the most racist/sexist/ageist feature, but without it it is even more unfair
]

# Columns deliberately left out of `cols`, with the reason:
#   name, first, last ............................ useless
#   sex ........................................... 0 importance + potential bias
#   age, age_group, race .......................... inappropriate bias
#   juv_fel_count, juv_misd_count,
#   juv_other_count, priors_count ................. (not selected individually)
#   c_charge_degree ............................... 0 importance
#   c_charge_desc ................................. (not selected)
#   date_of_birth, c_jail_in, c_jail_out,
#   c_offense_date, c_arrest_date, r_jail_in,
#   r_jail_out, screening_date, in_custody,
#   out_custody, c_bad_date ....................... raw dates
#   target, target_int, target_bool ............... targets
#   birth_to_bad_days ............................. corr=1 with age
#   has_r_jail_in ................................. + AUC, - fairness
#   has_r_jail_out ................................ corr=1 with r_jail_in

# Categorical features passed to CatBoost. Currently none: sex, age_group and race
# are excluded as inappropriate bias, c_charge_degree has 0 importance and
# c_charge_desc is not selected.
cat_cols = []

In [None]:
# Share of missing values per selected feature (0 = complete, 1 = entirely missing).
df[cols].isna().mean()

In [None]:
# Pairwise correlations between the selected features.
f, ax = plt.subplots(figsize=(16, 12))
corr = df[cols].corr()
# Hide the redundant upper triangle (diagonal included). The mask is built from the
# matrix *shape*: `np.triu(corr)` is a value-based mask that is falsy wherever a
# correlation is exactly 0, which would leave those upper-triangle cells visible.
mask = np.triu(np.ones_like(corr, dtype=bool))
# `corr` is numeric already, so no dtype filtering is needed before plotting.
sns.heatmap(corr, annot=True, center=0, mask=mask, ax=ax);

In [None]:
df[cols].head()

In [None]:
# Disabled experiments: fold selected race categories into 'Other' before counting.
# df['race'] = df['race'].replace({'Native American': 'Other'})
# df['race'] = df['race'].replace({'Native American': 'Other', 'Asian': 'Other'})
# df['race'] = df['race'].replace({'Native American': 'Other', 'Asian': 'Other', 'Hispanic': 'Other'})
# Number of rows per race category.
df['race'].value_counts()

In [None]:
# Sensitive-group keys, built by plain string concatenation (no separator) of the
# group labels. They are used for stratification and per-group metrics.
race_s, sex_s, age_group_s = df['race'], df['sex'], df['age_group']

df['age'] = age_group_s  # 'age' now holds the age bucket; we don't care about the actual age anyway
df['race_age'] = race_s + age_group_s
df['race_sex'] = race_s + sex_s
df['sex_age'] = sex_s + age_group_s
df['race_sex_age'] = race_s + sex_s + age_group_s
df['target_race_sex_age'] = df['target'] + race_s + sex_s + age_group_s

In [None]:
# Distinct values of each sensitive attribute, in order of first appearance.
races, sexes, age_groups = (df[group_col].unique() for group_col in ('race', 'sex', 'age_group'))

In [None]:
# CatBoost hyperparameters shared by every model trained in this notebook.
hyperparams = {
    'iterations': 100,
    'learning_rate': 1,
    'depth': 5,
    'loss_function': 'MultiClass',
}

In [None]:
# Rows with a known target are training data; rows whose target is NaN are the test set.
is_labelled = df['target'].notna()
df_train = df[is_labelled]
df_test = df[~is_labelled]

In [None]:
from itertools import chain, combinations

def powerset(iterable):
    """Return every non-empty subset of ``iterable`` as a list of tuples.

    Subsets are ordered by size and, within a size, in ``itertools.combinations``
    order. The empty subset is not included.

    powerset([1,2,3]) --> [(1,), (2,), (3,), (1,2), (1,3), (2,3), (1,2,3)]
    """
    items = list(iterable)
    subsets = []
    for size in range(1, len(items) + 1):
        subsets.extend(combinations(items, size))
    return subsets

powerset(['race', 'sex', 'age'])

# Cross Validation

In [None]:
from sklearn.metrics import roc_auc_score

def roc_auc_score_calm(y_true, y_score, *, average="macro", sample_weight=None,
                  max_fpr=None, multi_class="raise", labels=None):
    """Call ``roc_auc_score`` with the same arguments, returning None instead of raising ValueError.

    Every parameter is forwarded unchanged as a keyword argument. Any exception
    other than ValueError still propagates.
    """
    try:
        return roc_auc_score(
            y_true=y_true,
            y_score=y_score,
            average=average,
            sample_weight=sample_weight,
            max_fpr=max_fpr,
            multi_class=multi_class,
            labels=labels,
        )
    except ValueError:
        return None

# Oversampling

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
from numpy import where
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot

# --- SMOTE sanity check on a synthetic dataset ---
X, y = make_classification(n_samples=100000,
                           flip_y=0,
                           random_state=1,
                          )

# Summarise the class distribution before resampling.
counter = Counter(y)
print(counter)

# Oversample the synthetic data.
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

# --- Prepare the real data for oversampling ---
# NOTE: this overwrites `df` in place: every NaN becomes 0, including the NaN
# targets of the test rows.
df = df.fillna(0)
# SMOTE needs one label per row. The previous code assigned the 2-D one-hot matrix
# from `pd.get_dummies(...)` to this single column, which raises as soon as there
# is more than one group. Encode each target/race/sex/age group as an integer code
# instead (`astype(str)` keeps the 0-filled rows comparable with the string labels).
df['target_race_sex_age'] = pd.factorize(df['target_race_sex_age'].astype(str))[0]

In [None]:
# Oversample the real features so that every target/race/sex/age group is balanced.
df_x_oversampled, df_y_oversampled = oversample.fit_resample(df[cols].round(), df.target_race_sex_age)
x_df = pd.DataFrame(df_x_oversampled, columns=cols)
y_df = pd.DataFrame(df_y_oversampled)
dfs_x_y = [x_df, y_df]

# Put the labels *beside* the features (axis=1). The default axis=0 stacked the
# label rows underneath the feature rows, giving twice as many rows that were
# mostly NaN -- which the fillna(0) below then silently turned into zeros.
df_merged = pd.concat(dfs_x_y, axis=1)
df_merged = df_merged.fillna(0)


# Try predicting race/sex/age
Check whether the selected features can predict race, sex, or age group; ideally they should not be able to.
The feature importances will also tell us which features are the most racist/sexist/ageist.

In [None]:
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import collections
from fairlearn.metrics import MetricFrame

def simple_cross_val(target_col, n_splits=5, n_repeats=5, verbose=False):
    """Cross-validate a CatBoost classifier that predicts ``target_col`` from ``cols``.

    Prints the mean and standard deviation of the ROC-AUC over all folds and
    displays a bar chart of the mean feature importances.

    Args:
        target_col: column of ``df_train`` to predict; folds are stratified on it.
        n_splits: number of folds per repetition.
        n_repeats: number of repetitions of the K-fold split.
        verbose: if True, also print the ROC-AUC of every individual split.

    Returns:
        None.
    """
    print(f'Target col: {target_col}')
    kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    roc_aucs = []
    feature_importances = []

    # Folds are stratified on the column being predicted.
    for split_idx, (train, test) in enumerate(kf.split(df_train[cols], df_train[target_col])):
        # The fold indices are row positions within df_train, so they must be applied
        # to df_train. Indexing `df` only selects the same rows while the training
        # rows happen to come first, and `df` is modified after df_train is taken.
        df_train_cv = df_train.iloc[train].reset_index(drop=True)
        df_eval_cv = df_train.iloc[test].reset_index(drop=True)

        train_dataset = Pool(data=df_train_cv[cols],
                             label=df_train_cv[target_col],
                             cat_features=cat_cols)

        eval_dataset = Pool(data=df_eval_cv[cols],
                            label=df_eval_cv[target_col],
                            cat_features=cat_cols)

        # One-hot encode the true labels to match the shape of the predicted probabilities.
        y_eval = pd.get_dummies(df_eval_cv[target_col]).to_numpy()

        model = CatBoostClassifier(**hyperparams)
        model.fit(train_dataset, verbose=False)
        # Predicted probabilities for each class
        preds_proba = model.predict_proba(eval_dataset)

        roc_auc = roc_auc_score(y_eval, preds_proba, multi_class="ovo")
        roc_aucs.append(roc_auc)

        feature_importances.append(model.get_feature_importance())
        if verbose:
            print(f'Split {split_idx+1:2}: ROC-AUC: {roc_auc*100:.2f}%. ')

    # Evaluation

    print(f'ROC-AUC {np.mean(roc_aucs)*100:.2f}% (STD: {np.std(roc_aucs)*100:.2f}%)')

    # Feature importances, aggregated over all folds
    fi_df = pd.DataFrame({'feature': cols})
    fi_df['importance'] = np.mean(feature_importances, axis=0)
    fi_df['std'] = np.std(feature_importances, axis=0)
    fi_df['min'] = np.min(feature_importances, axis=0)
    fi_df['max'] = np.max(feature_importances, axis=0)
    fi_df = fi_df.sort_values('importance', ascending=False)
    display(fi_df[['feature', 'importance']].sort_values('importance', ascending=True).plot.barh(x='feature'))
    
simple_cross_val('race')
simple_cross_val('sex')
simple_cross_val('age_group')

# Predict target

In [None]:
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import collections
from fairlearn.metrics import MetricFrame

TARGET_COL = 'target'

N_SPLITS = 5
N_REPEATS = 10

# Stratify based on intersections of race, age_group, sex and target.
strat_df = pd.DataFrame(df_train[TARGET_COL].astype('str') + df_train['race_sex_age'], columns=['name'])

# If cross section is too small for stratification to work (N < N_SPLITS) we mark it as an outlier
strat_df.loc[strat_df.groupby('name').name.transform('count').lt(N_SPLITS), 'name'] = 'Outliers'

kf = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=42)

roc_aucs = []
fairnesses = []
feature_importances = []

# Folds are stratified on the target/race/sex/age intersection built above.
for split_idx, (train, test) in enumerate(kf.split(df_train[cols], strat_df['name'])):
    # The fold indices are row positions within df_train, so they must be applied to
    # df_train. Indexing `df` only selects the same rows while the training rows
    # happen to come first, and `df` is modified after df_train is taken -- the CV
    # would then score different feature values than the final model is trained on.
    df_train_cv = df_train.iloc[train].reset_index(drop=True)
    df_eval_cv = df_train.iloc[test].reset_index(drop=True)

    train_dataset = Pool(data=df_train_cv[cols],
                         label=df_train_cv[TARGET_COL],
                         cat_features=cat_cols)

    eval_dataset = Pool(data=df_eval_cv[cols],
                        label=df_eval_cv[TARGET_COL],
                        cat_features=cat_cols)

    # One-hot encode the true labels to match the shape of the predicted probabilities.
    y_eval = pd.get_dummies(df_eval_cv[TARGET_COL]).to_numpy()

    model = CatBoostClassifier(**hyperparams)
    model.fit(train_dataset, verbose=False)
    # Predicted probabilities for each class
    preds_proba = model.predict_proba(eval_dataset)

    roc_auc = roc_auc_score(y_eval, preds_proba, multi_class="ovo")
    roc_aucs.append(roc_auc)

    # Fairness: ROC-AUC per group for every combination of race / sex / age, then
    # 1 - sqrt(standard deviation of all those per-group scores).
    group_roc_aucs = []
    for s in powerset(['race', 'sex', 'age']):
        group_roc_aucs.append(MetricFrame(roc_auc_score_calm, y_eval, preds_proba, sensitive_features=df_eval_cv['_'.join(s)]).by_group)
    fairness = 1 - np.sqrt(pd.concat(group_roc_aucs).std())
    fairnesses.append(fairness)

    feature_importances.append(model.get_feature_importance())
    print(f'Split {split_idx+1:2}: ROC-AUC: {roc_auc*100:.2f}%. '
          f'Fairness: {fairness*100:.2f}%')

In [None]:
# Summarise the cross-validation: mean and spread of both metrics.
mean_roc_auc = np.mean(roc_aucs)
mean_fairness = np.mean(fairnesses)
print(f'ROC-AUC {mean_roc_auc*100:.2f}% (STD: {np.std(roc_aucs)*100:.2f}%)')
print(f'Fairness: {mean_fairness*100:.2f}% (STD: {np.std(fairnesses)*100:.2f}%)')

# The overall score weights ROC-AUC and fairness equally.
score = 0.5 * mean_roc_auc + 0.5 * mean_fairness
print(f'Score: {score*100:.2f}%')


# Full model

In [None]:
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import collections
from fairlearn.metrics import MetricFrame

TARGET_COL = 'target'

# Train on every labelled row ...
train_dataset = Pool(df_train[cols], label=df_train[TARGET_COL], cat_features=cat_cols)
# ... and predict the unlabelled test rows (no label is passed for them).
eval_dataset = Pool(df_test[cols], cat_features=cat_cols)

model = CatBoostClassifier(**hyperparams)
model.fit(train_dataset, verbose=True)

# Per-class probabilities and the corresponding hard class predictions.
preds_proba = model.predict_proba(eval_dataset)
preds_class = model.predict(eval_dataset)

# The full model's importances are appended to the list filled by the
# cross-validation loop; re-running this cell appends another entry.
feature_importances.append(model.get_feature_importance())

In [None]:
# Feature importances
fi_df = pd.DataFrame({'feature': cols})
fi_df['importance'] = np.mean(feature_importances, axis=0)
fi_df['std'] = np.std(feature_importances, axis=0)
fi_df['min'] = np.min(feature_importances, axis=0)
fi_df['max'] = np.max(feature_importances, axis=0)
fi_df = fi_df.sort_values('importance', ascending=False)
display(fi_df)
display(fi_df[['feature', 'importance']].sort_values('importance', ascending=True).plot.barh(x='feature'))

In [None]:
out_cols = ['No-Recidivism', 'Non-Violent', 'Violent']
out_df = pd.DataFrame(preds_proba, columns=out_cols)

# Sanity check: the column with the highest probability must be the class the model
# predicted, i.e. the probability columns are in the order given by `out_cols`.
top_positions = out_df[out_cols].to_numpy().argmax(axis=1)
out_df['class'] = [out_cols[position] for position in top_positions]
assert (out_df['class'] == preds_class.ravel()).all()

# Only the three probability columns are exported.
out_df = out_df[out_cols]
out_df.to_csv('y_test.csv', index=False)

import urllib.parse
upload_link = urllib.parse.quote('share.streamlit.io/mkleinbort/kaggle-compas/main/app.py')
y_test_link = urllib.parse.quote('y_test.csv')
print(f'Predictions were exported to "y_test.csv". Upload them at https://{upload_link}')

In [None]:
out_df

# Evaluations log
## 30/03/2021 22:18 JK
### Columns
['days_b_screening_arrest',
 'start',
 'custody_days',
 'in_custody_to_screening_days',
 'days_until_recitivism',
 'all_priors']
 
### Hyperparams
```
iterations=100,
learning_rate=1, (BEST)
depth=3, (BEST)
loss_function='MultiClass'
```

## Results
### CV
```
ROC-AUC 88.65% (STD: 1.68%)
Fairness: 73.54% (STD: 3.81%)
Score: 81.10%
```

### Online
```
You scored: 87.20% in accuracy and 68.96% in fairness.

Overall Score: 78.08%
```

# Bayesian Optimisation

### Optimising using Expected Improvement

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from skopt import gp_minimize

# Regress on the integer-encoded target.
TARGET_COL = 'target_int'

# Number of queries
N_QUERY = 5

# Training matrix (missing feature values replaced by 0) and target vector.
X = df_train[cols].fillna(0)
y = df_train[TARGET_COL]

# Gaussian process with a Matern kernel, fitted on the training data.
kernel = Matern(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=kernel)
gpr.fit(X, y)

# Score of the fitted regressor on the same data it was trained on.
gpr.score(X, y)
    

In [None]:
from skopt.plots import plot_convergence

# `gp_minimize()` was called without an objective or a search space, which raises
# TypeError. The search below tunes the two CatBoost hyperparameters marked as
# tuned in the evaluations log (learning_rate and depth) against a hold-out split.

def _negative_holdout_auc(params):
    """Objective for gp_minimize: negative hold-out ROC-AUC for (learning_rate, depth).

    gp_minimize minimises, so the ROC-AUC is negated. The split is seeded so that
    every candidate is scored on the same hold-out rows.
    """
    learning_rate, depth = params
    fit_part, holdout_part = train_test_split(
        df_train, test_size=0.2, stratify=df_train['target'], random_state=42)

    candidate = CatBoostClassifier(**{**hyperparams,
                                      'learning_rate': float(learning_rate),
                                      'depth': int(depth)})
    candidate.fit(Pool(data=fit_part[cols], label=fit_part['target'], cat_features=cat_cols),
                  verbose=False)

    proba = candidate.predict_proba(Pool(data=holdout_part[cols], cat_features=cat_cols))
    # Same metric as in the cross-validation: one-hot truth against class probabilities.
    y_holdout = pd.get_dummies(holdout_part['target']).to_numpy()
    return -roc_auc_score(y_holdout, proba, multi_class="ovo")


search_space = [
    (0.01, 1.0, 'log-uniform'),  # learning_rate
    (2, 8),                      # depth
]

res = gp_minimize(_negative_holdout_auc, search_space, n_calls=20, random_state=42)
print(f'Best learning_rate={res.x[0]:.3f}, depth={res.x[1]} '
      f'(hold-out ROC-AUC {-res.fun*100:.2f}%)')
plot_convergence(res);

In [None]:
from bayes_opt import BayesianOptimization










# import matplotlib.pyplot as plt

# with plt.style.context('seaborn-white'):
#     plt.figure(figsize=(10, 5))
#     plt.scatter(optimizer.X_training, optimizer.y_training, c='k', s=50, label='Queried')
#     plt.scatter(X_max, y_max, s=100, c='r', label='Current optimum')
#     plt.plot(X.ravel(), y, c='k', linewidth=2, label='Function')
#     plt.plot(X.ravel(), y_pred, label='GP regressor')
#     plt.fill_between(X.ravel(), y_pred - y_std, y_pred + y_std, alpha=0.5)
#     plt.title('First five queries of Bayesian optimization')
#     plt.legend()
#     plt.show()
