In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics

import src

In [2]:
def equalised_odds(y_true, y_pred, s):
    """Return the equalised-odds gap between the two groups defined by ``s``.

    The gap is the larger of two absolute between-group differences:

    * the false-positive rate, i.e. the mean of ``y_pred`` over instances
      whose ``y_true`` is false, and
    * the true-positive rate, i.e. the mean of ``y_pred`` over instances
      whose ``y_true`` is true.

    Parameters
    ----------
    y_true : numpy array, pandas Series or sequence
        Ground-truth labels; truthy means positive.
    y_pred : numpy array, pandas Series or sequence
        Predictions, averaged within each (group, label) cell.
    s : numpy array, pandas Series or sequence
        Group membership; truthy selects the first group, falsy the second.

    Returns
    -------
    float (numpy scalar)
        ``max(|FPR_s - FPR_not_s|, |TPR_s - TPR_not_s|)``. NaN when any of
        the four (group, label) cells is empty, because the corresponding
        rate is then undefined.
    """
    def _as_bool(values):
        # `~` on an integer 0/1 array is a *bitwise* NOT (yielding -1/-2), which
        # would turn the masks below into integer fancy indices. Casting to bool
        # first makes `~` a logical negation. `.astype` keeps a pandas Series a
        # Series, so index alignment between Series inputs is unchanged.
        if hasattr(values, 'astype'):
            return values.astype(bool)
        return np.asarray(values, dtype=bool)

    y_true, s = _as_bool(y_true), _as_bool(s)
    if not hasattr(y_pred, 'dtype'):
        # Plain sequences cannot be indexed with a boolean mask.
        y_pred = np.asarray(y_pred)

    fpr_diff = abs(y_pred[s & ~y_true].mean() - y_pred[~s & ~y_true].mean())
    tpr_diff = abs(y_pred[s & y_true].mean() - y_pred[~s & y_true].mean())
    # Built-in max() is order-dependent with NaN (max(nan, x) is nan, but
    # max(x, nan) is x); np.maximum propagates NaN from either side.
    return np.maximum(fpr_diff, tpr_diff)

In [14]:
def _fairness_records(model, y_true, y_pred, s):
    """Build the eight per-model result rows shown in the summary table.

    Rows are precision, recall and F1 for the ``s`` group (labelled
    'non-white') and its complement (labelled 'white'), followed by the
    demographic-parity gap and the equalised-odds gap.
    """
    not_s = ~s
    groups = (('non-white', s), ('white', not_s))
    scorers = (
        ('precision', sklearn.metrics.precision_score),
        ('recall', sklearn.metrics.recall_score),
        ('f1', sklearn.metrics.f1_score),
    )
    records = [
        {'model': model, 'measure': f'{metric} ({group})', 'value': scorer(y_true[mask], y_pred[mask])}
        for metric, scorer in scorers
        for group, mask in groups
    ]
    # Demographic parity: gap in positive-prediction rate between the groups.
    records.append({'model': model, 'measure': 'demographic parity', 'value': abs(y_pred[s].mean() - y_pred[not_s].mean())})
    records.append({'model': model, 'measure': 'equalised odds', 'value': equalised_odds(y_true, y_pred, s)})
    return records


def _load_cached_predictions(tag, n_expected, expected_bincount):
    """Load one model's cached arrays and binarise its scores.

    Returns ``(y_score, y_true, s, threshold, y_pred)``. The threshold is the
    score quantile at the fraction of negatives, so the number of predicted
    positives is meant to equal the number of true positives (asserted below).
    """
    # NOTE(review): score and label are both inverted on load, presumably
    # because the cache stores the opposite class -- confirm against the
    # notebook that writes these files.
    y_score = 1 - np.load(f'cache/y_score_{tag}.npy')
    y_true = ~np.load(f'cache/y_true_{tag}.npy')
    # Force a boolean mask: an integer 0/1 array would make `y_true[s]` a
    # fancy index and `~s` a bitwise NOT, silently corrupting every metric.
    s = np.load(f'cache/s_{tag}.npy').astype(bool)
    threshold = np.quantile(y_score, np.mean(~y_true))
    y_pred = y_score > threshold

    assert len(y_score) == n_expected
    assert len(y_true) == n_expected
    assert len(s) == n_expected
    assert (expected_bincount == np.bincount(y_true)).all()
    assert (expected_bincount == np.bincount(y_pred)).all()
    return y_score, y_true, s, threshold, y_pred


# --- Baseline: expert risk labels versus inspection outcomes ----------------
# NOTE: read_pickle / allow_pickle=True execute arbitrary code from the file;
# only load files from a trusted source.
portcalls = pd.read_pickle('data/portcalls_v3.pkl').astype({'risk': int})
ships = np.load('data/not_selected.npy', allow_pickle=True)

expert_labels = portcalls.groupby('ship')['risk'].max()
y_score = pd.Series({ship: expert_labels.get(ship) for ship in ships}) > 1 # 0 low, 1 medium, 2 high risk

inspections = src.get_inspections().groupby('IMO')['WasDetained'].any().replace({False: 1, True: 2})
y_true = pd.Series({ship: inspections.get(ship, default=0) for ship in ships}) > 0 # 0 compliant, 1 deficiency, 2 detention
y_true_bincount = np.bincount(y_true)

# Sensitive attribute from each ship's last recorded 'flag' value; the measure
# labels below call s == True "non-white" and s == False "white".
sensitive = portcalls.groupby('ship')['flag'].last().astype(int).astype(bool)
s = pd.Series({ship: sensitive.get(ship) for ship in ships})

# The three series must describe the same ships in the same order. (The
# previous check compared y_score.index with itself and could never fail.)
assert y_score.index.equals(y_true.index), 'y_score and y_true cover different ships'
assert y_score.index.equals(s.index), 'y_score and s cover different ships'
no_instances = len(y_score)
assert no_instances == len(y_true)
assert no_instances == len(s)

result = _fairness_records('baseline', y_true, y_score, s)

# --- Random forest -----------------------------------------------------------
y_score, y_true, s, threshold, y_pred = _load_cached_predictions('randomforest', no_instances, y_true_bincount)
result_randomforest = _fairness_records('random forest', y_true, y_pred, s)
result.extend(result_randomforest)

# --- Fair random forest ------------------------------------------------------
y_score, y_true, s, threshold, y_pred = _load_cached_predictions('fairrandomforest', no_instances, y_true_bincount)
result_fairrandomforest = _fairness_records('fair random forest', y_true, y_pred, s)
result.extend(result_fairrandomforest)

# --- Summary table: one row per measure, one column per model ---------------
rows = ['precision (non-white)', 'precision (white)', 'recall (non-white)', 'recall (white)', 'f1 (non-white)', 'f1 (white)', 'demographic parity', 'equalised odds']
cols = ['baseline', 'random forest', 'fair random forest']
pd.DataFrame(result).pivot(index='measure', columns='model', values='value').loc[rows, cols].round(3)

model,baseline,random forest,fair random forest
measure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
precision (non-white),0.971,0.898,0.89
precision (white),0.952,0.877,0.861
recall (non-white),0.423,0.755,0.825
recall (white),0.052,0.886,0.866
f1 (non-white),0.589,0.82,0.856
f1 (white),0.099,0.882,0.864
demographic parity,0.317,0.099,0.023
equalised odds,0.371,0.132,0.04
