In [1]:
%load_ext autoreload
%autoreload 2

from copy import deepcopy as copy
import datetime
import json
from math import ceil
import multiprocessing
import logging
import operator
import os
from pathlib import Path
import random
import sys
import time
import typing
import warnings

import joblib
from joblib import delayed, Parallel
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from numpy.core.numeric import outer
import pandas as pd
from scipy.stats import mode, entropy
import seaborn as sns
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.metrics
from sklearn.metrics import roc_auc_score
import sklearn.model_selection
import typer
from tqdm.auto import tqdm

root = str(Path.cwd().parents[1])
sys.path.append(root)
os.chdir(root)
import src

# Scenario A

In [2]:
iterator = [(max_depth, fold) for max_depth in np.arange(1, 11) for fold in range(5)]

In [9]:
%%time
X, y, s = src.get_Xys()
y = y>1
ys = y.astype(str) + s.astype(int).astype(str)
skf = sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)

y_scores = list()
s_scores = list()

for train_index, test_index in tqdm(skf.split(X=X, y=ys), total=5):
    ships_train = X.iloc[train_index].index
    ships_test = X.iloc[test_index].index

    X_train, X_test = X.loc[ships_train], X.loc[ships_test]
    y_train, y_test = y.loc[ships_train], y.loc[ships_test]
    s_train, s_test = s.loc[ships_train], s.loc[ships_test]

    X_train = np.ascontiguousarray(X_train.values)
    y_train = np.ascontiguousarray(y_train.values.ravel())
    s_train = np.ascontiguousarray(s_train.values.ravel())
    X_test = np.ascontiguousarray(X_test.values)
    y_test = np.ascontiguousarray(y_test.values.ravel())
    s_test = np.ascontiguousarray(s_test.values.ravel())

    vt = sklearn.feature_selection.VarianceThreshold()
    vt.fit(X_train)
    X_train = vt.transform(X_train)
    X_test = vt.transform(X_test) 

    clf = src.FairRandomForestClassifier(
        max_depth=6, n_estimators=100, orthogonality=0, n_jobs=100
    )    
    clf.fit(X_train, y_train, s_train)
    y_score = clf.predict_proba(X_test)[:,1]
    y_scores.append(sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_score))
    s_scores.append(sklearn.metrics.roc_auc_score(y_true=s_test, y_score=y_score))

  0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 10.9 s, sys: 9.03 s, total: 19.9 s
Wall time: 4min 34s


In [15]:
print(f'y: {np.mean(y_scores):.3f} +- {np.std(y_scores):.3f}')
print(f's: {np.mean(s_scores):.3f} +- {np.std(s_scores):.3f}')

y: 0.669 +- 0.021
s: 0.658 +- 0.013


In [25]:
X, y, s = src.get_Xys()
y = y>1
ys = y.astype(str) + s.astype(int).astype(str)
skf = sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)

def fit_predict(X, y, s, train_indices, test_indices):
    ships_train = X.iloc[train_index].index
    ships_test = X.iloc[test_index].index

    X_train, X_test = X.loc[ships_train], X.loc[ships_test]
    y_train, y_test = y.loc[ships_train], y.loc[ships_test]
    s_train, s_test = s.loc[ships_train], s.loc[ships_test]

    X_train = np.ascontiguousarray(X_train.values)
    y_train = np.ascontiguousarray(y_train.values.ravel())
    s_train = np.ascontiguousarray(s_train.values.ravel())
    X_test = np.ascontiguousarray(X_test.values)
    y_test = np.ascontiguousarray(y_test.values.ravel())
    s_test = np.ascontiguousarray(s_test.values.ravel())

    vt = sklearn.feature_selection.VarianceThreshold()
    vt.fit(X_train)
    X_train = vt.transform(X_train)
    X_test = vt.transform(X_test) 

    clf = src.FairRandomForestClassifier(
        max_depth=6, n_estimators=100, orthogonality=0, n_jobs=100
    )    
    clf.fit(X_train, y_train, s_train)
    y_score = clf.predict_proba(X_test)[:,1]
    auc_y = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_score)
    auc_s = sklearn.metrics.roc_auc_score(y_true=s_test, y_score=y_score)   
    return auc_y, auc_s
    
train_indices, test_indices = sklearn.model_selection.train_test_split(np.arange(len(X)), stratify=ys)

In [28]:
%%time
fit_predict(X, y, s, train_index, test_indices)

CPU times: user 2.63 s, sys: 3.55 s, total: 6.18 s
Wall time: 1min 19s


(0.6486286241207754, 0.6671322027395556)

In [29]:
X, y, s = src.get_Xys()
y = y>1
ys = y.astype(str) + s.astype(int).astype(str)
skf = sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)

def fit_predict(X, y, s, train_indices, test_indices):
    ships_train = X.iloc[train_index].index
    ships_test = X.iloc[test_index].index

    X_train, X_test = X.loc[ships_train], X.loc[ships_test]
    y_train, y_test = y.loc[ships_train], y.loc[ships_test]
    s_train, s_test = s.loc[ships_train], s.loc[ships_test]

    X_train = np.ascontiguousarray(X_train.values)
    y_train = np.ascontiguousarray(y_train.values.ravel())
    s_train = np.ascontiguousarray(s_train.values.ravel())
    X_test = np.ascontiguousarray(X_test.values)
    y_test = np.ascontiguousarray(y_test.values.ravel())
    s_test = np.ascontiguousarray(s_test.values.ravel())

    vt = sklearn.feature_selection.VarianceThreshold()
    vt.fit(X_train)
    X_train = vt.transform(X_train)
    X_test = vt.transform(X_test) 

    clf = src.FairRandomForestClassifier(
        max_depth=6, n_estimators=100, orthogonality=0, n_jobs=10
    )    
    clf.fit(X_train, y_train, s_train)
    y_score = clf.predict_proba(X_test)[:,1]
    auc_y = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_score)
    auc_s = sklearn.metrics.roc_auc_score(y_true=s_test, y_score=y_score)   
    return auc_y, auc_s
    
train_indices, test_indices = sklearn.model_selection.train_test_split(np.arange(len(X)), stratify=ys)

In [30]:
%%time
fit_predict(X, y, s, train_indices, test_indices)

CPU times: user 1.32 s, sys: 576 ms, total: 1.9 s
Wall time: 3min 38s


(0.6486286241207754, 0.6671322027395556)

In [31]:
%%time
# Takes way too long >10m
src.ProgressParallel(n_jobs=5, total=5)(
    src.delayed(fit_predict)(X, y, s, train_index, test_indices)
    for train_indices, test_indices in skf.split(X=X, y=ys)
)

  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 