In [1]:
from copy import deepcopy as copy
import datetime
import json
from math import ceil
import multiprocessing
import logging
import operator
import os
import random
import sys
import time
import typing
import warnings

import joblib
from joblib import delayed, Parallel
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from numpy.core.numeric import outer
import pandas as pd
from scipy.stats import mode, entropy
import seaborn as sns
import sklearn.feature_selection
import sklearn.metrics
from sklearn.metrics import roc_auc_score
import sklearn.model_selection
import typer
from tqdm.auto import tqdm

os.chdir(os.path.dirname(os.getcwd()))

import src

In [2]:
portcalls = joblib.load('data/portcalls_v3.pkl')
ships = np.load('data/not_selected.npy', allow_pickle=True)

expert_labels = portcalls.groupby('ship')['risk'].max()
y_score = pd.Series({ship: expert_labels.get(ship) for ship in ships})

inspections = src.get_inspections().groupby('IMO')['WasDetained'].any().replace({False: 1, True: 2})
y_true = pd.Series({ship: inspections.get(ship, default=0) for ship in ships})

sensitive = portcalls.groupby('ship')['flag'].last().astype(int).astype(bool)
s = pd.Series({ship: sensitive.get(ship) for ship in ships})

all_ships = portcalls.ship.unique()
y_true_all = pd.Series({ship: inspections.get(ship, default=0) for ship in all_ships})
y_score_all = pd.Series({ship: expert_labels.get(ship) for ship in all_ships})
s_all = pd.Series({ship: sensitive.get(ship) for ship in all_ships})

# Distribution targets on whole data

In [3]:
y_true_all.value_counts().sort_index()

0     6743
1    21088
2     1631
dtype: int64

In [4]:
y_true_all.value_counts(normalize=True).sort_index()

0    0.228871
1    0.715769
2    0.055359
dtype: float64

# Distribution sensitive flags on whole data

In [5]:
result = {'always_pos': 0, 'always_neg': 0, 'changed': 0}
for ship, group in portcalls.groupby('ship')['flag']:
    if group.all():
        result['always_pos'] += 1
    elif (~group).all():
        result['always_neg'] += 1
    else:
        result['changed'] += 1
result = pd.Series(result)

In [6]:
result

always_pos     1473
always_neg    27260
changed         729
dtype: int64

In [7]:
result / result.sum()

always_pos    0.049997
always_neg    0.925260
changed       0.024744
dtype: float64

In [8]:
y_true_all[s_all].value_counts(normalize=True).sort_index()

0    0.195664
1    0.554204
2    0.250132
dtype: float64

In [9]:
y_true_all[~s_all].value_counts(normalize=True).sort_index()

0    0.231149
1    0.726851
2    0.042001
dtype: float64

In [10]:
assert all(y_score.index == y_score.index)
pd.DataFrame(
    data=sklearn.metrics.confusion_matrix(y_true[s], y_score[s]),
    index=pd.Index(['compliant', 'minor deficiencies', 'detention'], name='inspection result'),
    columns=pd.Index(['low', 'medium', 'high'], name='Expert label')
)

Expert label,low,medium,high
inspection result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
compliant,1,261,17
minor deficiencies,0,620,297
detention,0,150,267


In [11]:
assert all(y_score.index == y_score.index)
pd.DataFrame(
    data=sklearn.metrics.confusion_matrix(y_true[~s], y_score[~s]),
    index=pd.Index(['compliant', 'minor deficiencies', 'detention'], name='inspection result'),
    columns=pd.Index(['low', 'medium', 'high'], name='Expert label')
)

Expert label,low,medium,high
inspection result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
compliant,319,4774,49
minor deficiencies,682,16370,732
detention,1,788,246


In [12]:
print(f'PPR_nonwhite = {(y_score <= 1)[s].mean():.2f}')
print(f'PPR_white = {(y_score <= 1)[~s].mean():.2f}')
print(f'FPR_nonwhite = {(y_score <= 1)[s & ~y_true].mean():.2f}')
print(f'FPR_white = {(y_score <= 1)[~s & ~y_true].mean():.2f}')
print(f'TPR_nonwhite = {(y_score <= 1)[s & y_true].mean():.2f}')
print(f'TPR_white = {(y_score <= 1)[~s & y_true].mean():.2f}')

PPR_nonwhite = 0.64
PPR_white = 0.96
FPR_nonwhite = 0.59
FPR_white = 0.95
TPR_nonwhite = 0.68
TPR_white = 0.96


# Distribution target

In [13]:
pd.Series(np.bincount(y_true) / len(y_true), index=['OK', 'deficiency', 'detention'])

OK            0.211973
deficiency    0.731250
detention     0.056776
dtype: float64

In [14]:
pd.Series(np.bincount(y_true), index=['OK', 'deficiency', 'detention'])

OK             5421
deficiency    18701
detention      1452
dtype: int64

# Distribution sensitive attribute

In [15]:
pd.Series(np.bincount(s) / len(s), index=['white flag', 'non-white flag'])

white flag        0.936928
non-white flag    0.063072
dtype: float64

# Distribution sensitive attribute in combination with target

In [16]:
pd.DataFrame(
    sklearn.metrics.confusion_matrix(y_true=y_true, y_pred=s),
    columns=['white flag', 'non-white flag', ''],
    index=['OK', 'deficiency', 'detention']
).drop(columns='')

Unnamed: 0,white flag,non-white flag
OK,5142,279
deficiency,17784,917
detention,1035,417


In [17]:
pd.DataFrame(
    sklearn.metrics.confusion_matrix(y_true=y_true, y_pred=s, normalize='true'),
    columns=['white flag', 'non-white flag', ''],
    index=['OK', 'deficiency', 'detention']
).drop(columns='')

Unnamed: 0,white flag,non-white flag
OK,0.948533,0.051467
deficiency,0.950965,0.049035
detention,0.71281,0.28719


# Disparate impact

In [18]:
def disparate_impact(y_score, s):
    p_low_given_black = (y_score)[s.astype(bool)].mean()
    p_low_given_white = (y_score)[~s.astype(bool)].mean()
    disparate_impact = p_low_given_black/p_low_given_white
    ε = 1-disparate_impact
    print(f"{p_low_given_black:.3f}/{p_low_given_white:.3f}={disparate_impact:.3f}")
    print(f"{ε=:.3f}")

$\frac{P[\text{low risk}|\text{black flag}]}{P[\text{low risk}|\text{white flag}]} \geq 1-\epsilon$

In [19]:
disparate_impact(y_score < 1, s.astype(bool))

0.001/0.042=0.015
ε=0.985


$\frac{P[\text{low or medium risk}|\text{black flag}]}{P[\text{low or medium risk}|\text{white flag}]}$

In [20]:
disparate_impact(y_score < 2, s.astype(bool))

0.640/0.957=0.668
ε=0.332


# Demographic parity

$\left|P[\text{low risk}|\text{black flag}] - P[\text{low risk}|\text{white flag}]\right|$

In [21]:
abs((y_score == 0)[s.astype(bool)].mean() - (y_score == 0)[~s.astype(bool)].mean())

0.04119799137330338

$\left|P[\text{low or medium risk}|\text{black flag}] - P[\text{low or medium risk}|\text{white flag}]\right|$

In [22]:
abs((y_score < 2)[s.astype(bool)].mean() - (y_score < 2)[~s.astype(bool)].mean())

0.31733707179105086

# Strategy A: OK+deficiency vs detention

In [23]:
y_true_a = (y_true>1)

confustion_matrix = pd.DataFrame(
    sklearn.metrics.confusion_matrix(y_true=y_true_a, y_pred=y_score), 
    columns=pd.Index(['low', 'med', 'high'], name='Expert label'), 
    index=pd.Index(['OK+def (neg.)', 'det (pos.)', ''], name='ground truth')
).loc[['OK+def (neg.)', 'det (pos.)']].round(3)
display(confustion_matrix)
confustion_matrix = pd.DataFrame(
    sklearn.metrics.confusion_matrix(y_true=y_true_a, y_pred=y_score, normalize='true'), 
    columns=pd.Index(['low', 'med', 'high'], name='Expert label'), 
    index=pd.Index(['OK+def (neg.)', 'det (pos.)', ''], name='ground truth')
).loc[['OK+def (neg.)', 'det (pos.)']].round(3)
display(confustion_matrix)
print(f'AUC_y: {sklearn.metrics.roc_auc_score(y_true=y_true_a, y_score=y_score):.3f}')
print(f'AUC_s: {sklearn.metrics.roc_auc_score(y_true=s, y_score=y_score):.3f}')

print(f'AP: {sklearn.metrics.average_precision_score(y_true=y_true_a, y_score=y_score):.2f}')
print(f'Baseline performance: {y_true_a.mean():.3f}')

Expert label,low,med,high
ground truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OK+def (neg.),1002,22025,1095
det (pos.),1,938,513


Expert label,low,med,high
ground truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OK+def (neg.),0.042,0.913,0.045
det (pos.),0.001,0.646,0.353


AUC_y: 0.667
AUC_s: 0.672
AP: 0.15
Baseline performance: 0.057


## Equalized odds

In [24]:
def equalized_odds(y_score, y_true, s):
    fpr_black = (y_score)[s & ~y_true].mean()
    fpr_white = (y_score)[~s & ~y_true].mean()
    print(f"|{fpr_black=:.4f} - {fpr_white=:.4f}| = {abs(fpr_black-fpr_white):.4f}")
    tpr_black = (y_score)[s & y_true].mean()
    tpr_white = (y_score)[~s & y_true].mean()
    print(f"|{tpr_black=:.4f} - {tpr_white=:.4f}| = {abs(tpr_black-tpr_white):.4f}")

$\left| P [\text{low risk} | \text{black flag, ship with deficiencies}] - P [\text{low risk} | \text{white flag, ship with deficiencies}] \right|$ = $\left| \text{FPR}_{\text{black}} - \text{FPR}_{\text{white}} \right|$

In [25]:
equalized_odds(y_score == 0, y_true <= 1, s)

|fpr_black=0.0000 - fpr_white=0.0010| = 0.0010
|tpr_black=0.0008 - tpr_white=0.0437| = 0.0428


$\left| P [\text{low or medium risk} | \text{black flag, ship with deficiencies}] - P [\text{low or medium risk} | \text{white flag, ship with deficiencies}] \right|$ = $\left| \text{FPR}_{\text{black}} - \text{FPR}_{\text{white}} \right|$

In [26]:
equalized_odds(y_score <= 1, y_true <= 1, s)

|fpr_black=0.3597 - fpr_white=0.7623| = 0.4026
|tpr_black=0.7375 - tpr_white=0.9659| = 0.2285


# Strategy B: OK vs def+det

In [27]:
y_true_b = (y_true>0)
y_score_b = expert_labels.loc[y_true.index]

assert (y_score.index == y_true_b.index).all()
confustion_matrix = pd.DataFrame(
    sklearn.metrics.confusion_matrix(y_true=y_true_b, y_pred=y_score_b), 
    columns=pd.Index(['low', 'med', 'high'], name='Expert label'), 
    index=pd.Index(['OK (negative)', 'def+det (positive)', ''], name='ground truth')
).loc[['OK (negative)', 'def+det (positive)']].round(3)
display(confustion_matrix)
confustion_matrix = pd.DataFrame(
    sklearn.metrics.confusion_matrix(y_true=y_true, y_pred=y_score_b, normalize='true'), 
    columns=pd.Index(['low', 'med', 'high'], name='Expert label'), 
    index=pd.Index(['OK (negative)', 'def+det (positive)', ''], name='ground truth')
).loc[['OK (negative)', 'def+det (positive)']].round(3)
display(confustion_matrix)
print(f'AUC_y: {sklearn.metrics.roc_auc_score(y_true=y_true_b, y_score=y_score_b):.3f}')
print(f'AUC_s: {sklearn.metrics.roc_auc_score(y_true=s, y_score=y_score_b):.3f}')

print(f'AP: {sklearn.metrics.average_precision_score(y_true=y_true_b, y_score=y_score_b):.2f}')
print(f'Baseline performance: {y_true.mean():.2f}')

Expert label,low,med,high
ground truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OK (negative),320,5035,66
def+det (positive),683,17928,1542


Expert label,low,med,high
ground truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OK (negative),0.059,0.929,0.012
def+det (positive),0.036,0.909,0.055


AUC_y: 0.543
AUC_s: 0.672
AP: 0.80
Baseline performance: 0.84


## Equalized odds

$\left| P [\text{low risk} | \text{black flag, not OK ship}] - P [\text{low risk} | \text{white flag, not OK ship}] \right|$ = $\left| \text{FPR}_{\text{black}} - \text{FPR}_{\text{white}} \right|$

In [28]:
equalized_odds(y_score == 0, y_true == 0, s)

|fpr_black=0.0000 - fpr_white=0.0363| = 0.0363
|tpr_black=0.0036 - tpr_white=0.0620| = 0.0585


$\left| P [\text{low or medium risk} | \text{black flag, not OK ship}] - P [\text{low or medium risk} | \text{white flag, not OK ship}] \right|$ = $\left| \text{FPR}_{\text{black}} - \text{FPR}_{\text{white}} \right|$

In [29]:
equalized_odds(y_score <= 1, y_true == 0, s)

|fpr_black=0.5772 - fpr_white=0.9480| = 0.3708
|tpr_black=0.9391 - tpr_white=0.9905| = 0.0514
