In [1]:
%load_ext autoreload
%autoreload 2

from copy import deepcopy as copy
import datetime
import json
from math import ceil
import multiprocessing
import logging
import operator
import os
from pathlib import Path
import random
import sys
import time
import typing
import warnings

import joblib
from joblib import delayed, Parallel
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from numpy.core.numeric import outer
import pandas as pd
from scipy.stats import mode, entropy
import seaborn as sns
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.metrics
from sklearn.metrics import roc_auc_score
import sklearn.model_selection
import typer
from tqdm.auto import tqdm

root = str(Path.cwd().parents[1])
sys.path.append(root)
os.chdir(root)
import src

# Scenario A

In [7]:
%%time

X, y, s = src.get_Xys()
ys = y.astype(str) + s.astype(int).astype(str)

X = np.ascontiguousarray(X.values)
y = np.ascontiguousarray(y.values.ravel())
s = np.ascontiguousarray(s.values.ravel())

y = y > 1
cv = sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)
fit_times = []

result = []
for trainval_indices, test_indices in tqdm(cv.split(X=X, y=ys), total=5):
    X_trainval = X[trainval_indices]
    y_trainval = y[trainval_indices]
    s_trainval = s[trainval_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    s_test = s[test_indices]
    
    result_outer_fold = {}    
    for max_depth in tqdm(np.arange(1, 21), leave=False):
        result_max_depth = list()
        for train_indices, val_indices in cv.split(X=X_trainval, y=y_trainval):                                                 
            X_train = X_trainval[train_indices]
            y_train = y_trainval[train_indices]
            s_train = s_trainval[train_indices]
            X_val = X_trainval[val_indices]
            y_val = y_trainval[val_indices]
            s_val = s_trainval[val_indices]
            
            vt = sklearn.feature_selection.VarianceThreshold()
            vt.fit(X_train)
            X_train = vt.transform(X_train)
            X_val = vt.transform(X_val) 
            
            start_fit = time.time()
            clf = sklearn.ensemble.RandomForestClassifier(
                random_state=42, n_jobs=100, max_depth=max_depth
            )
            fit_time = time.time() - start_fit
            fit_times.append({'max_depth': max_depth, 'fit_time': fit_time})
            clf.fit(X_train, y_train)
            y_score = clf.predict_proba(X_val)[:,1]
            auc_y = sklearn.metrics.roc_auc_score(y_val, y_score)
            auc_s = sklearn.metrics.roc_auc_score(s_val, y_score)
            result_max_depth.append(auc_y)
            
        result_outer_fold[max_depth] = np.mean(result_max_depth)
    best_max_depth, _ = max(result_outer_fold.items(), key=lambda x: x[1])
    
    vt = sklearn.feature_selection.VarianceThreshold()
    vt.fit(X_trainval)
    X_trainval = vt.transform(X_trainval)
    X_test = vt.transform(X_test) 
    
    clf = sklearn.ensemble.RandomForestClassifier(
        random_state=42, n_jobs=100, max_depth=best_max_depth
    )
    clf.fit(X_trainval, y_trainval)
    y_score = clf.predict_proba(X_test)[:,1]
    auc_y = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_score)
    auc_s = sklearn.metrics.roc_auc_score(y_true=s_test, y_score=y_score)
    result.append(
        {
            'max_depth': best_max_depth,
            'auc_y': auc_y,
            'auc_s': max(auc_s, 1-auc_s)
        }
    )
result = pd.DataFrame(result)
display(result)
display(result.agg({'auc_y': ['mean', 'std'], 'auc_s': ['mean', 'std']}))
display(pd.DataFrame(fit_times).groupby('max_depth').agg(['mean', 'std']))

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,max_depth,auc_y,auc_s
0,6,0.683485,0.62407
1,6,0.694266,0.616033
2,6,0.690103,0.635217
3,6,0.647148,0.63629
4,7,0.655993,0.615883


Unnamed: 0,auc_y,auc_s
mean,0.674199,0.625498
std,0.021243,0.009938


Unnamed: 0_level_0,fit_time,fit_time
Unnamed: 0_level_1,mean,std
max_depth,Unnamed: 1_level_2,Unnamed: 2_level_2
1,0.000577,0.00024
2,0.000488,0.000132
3,0.0006,0.000216
4,0.0007,0.000327
5,0.000674,0.000244
6,0.000718,0.000325
7,0.000612,0.000248
8,0.000684,0.000309
9,0.000612,0.000261
10,0.000658,0.000369


CPU times: user 1h 54min 45s, sys: 2min 54s, total: 1h 57min 39s
Wall time: 9min 57s


# Scenario B

In [8]:
%%time

X, y, s = src.get_Xys()
ys = y.astype(str) + s.astype(int).astype(str)

X = np.ascontiguousarray(X.values)
y = np.ascontiguousarray(y.values.ravel())
s = np.ascontiguousarray(s.values.ravel())

y = y > 0
cv = sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)
fit_times = []

result = []
for trainval_indices, test_indices in tqdm(cv.split(X=X, y=ys), total=5):
    X_trainval = X[trainval_indices]
    y_trainval = y[trainval_indices]
    s_trainval = s[trainval_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    s_test = s[test_indices]
    
    result_outer_fold = {}    
    for max_depth in tqdm(np.arange(1, 21), leave=False):
        result_max_depth = list()
        for train_indices, val_indices in cv.split(X=X_trainval, y=y_trainval):                                                 
            X_train = X_trainval[train_indices]
            y_train = y_trainval[train_indices]
            s_train = s_trainval[train_indices]
            X_val = X_trainval[val_indices]
            y_val = y_trainval[val_indices]
            s_val = s_trainval[val_indices]
            
            vt = sklearn.feature_selection.VarianceThreshold()
            vt.fit(X_train)
            X_train = vt.transform(X_train)
            X_val = vt.transform(X_val) 
            
            start_fit = time.time()
            clf = sklearn.ensemble.RandomForestClassifier(
                random_state=42, n_jobs=100, max_depth=max_depth
            )
            fit_time = time.time() - start_fit
            fit_times.append({'max_depth': max_depth, 'fit_time': fit_time})
            clf.fit(X_train, y_train)
            y_score = clf.predict_proba(X_val)[:,1]
            auc_y = sklearn.metrics.roc_auc_score(y_val, y_score)
            auc_s = sklearn.metrics.roc_auc_score(s_val, y_score)
            result_max_depth.append(auc_y)
            
        result_outer_fold[max_depth] = np.mean(result_max_depth)
    best_max_depth, _ = max(result_outer_fold.items(), key=lambda x: x[1])
    
    vt = sklearn.feature_selection.VarianceThreshold()
    vt.fit(X_trainval)
    X_trainval = vt.transform(X_trainval)
    X_test = vt.transform(X_test) 
    
    clf = sklearn.ensemble.RandomForestClassifier(
        random_state=42, n_jobs=100, max_depth=best_max_depth
    )
    clf.fit(X_trainval, y_trainval)
    y_score = clf.predict_proba(X_test)[:,1]
    auc_y = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_score)
    auc_s = sklearn.metrics.roc_auc_score(y_true=s_test, y_score=y_score)
    result.append(
        {
            'max_depth': best_max_depth,
            'auc_y': auc_y,
            'auc_s': max(auc_s, 1-auc_s)
        }
    )
result = pd.DataFrame(result)
display(result)
display(result.agg({'auc_y': ['mean', 'std'], 'auc_s': ['mean', 'std']}))
display(pd.DataFrame(fit_times).groupby('max_depth').agg(['mean', 'std']))

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,max_depth,auc_y,auc_s
0,17,0.827673,0.586433
1,13,0.826443,0.593345
2,14,0.832576,0.576154
3,16,0.837499,0.57521
4,16,0.830249,0.577883


Unnamed: 0,auc_y,auc_s
mean,0.830888,0.581805
std,0.004389,0.007833


Unnamed: 0_level_0,fit_time,fit_time
Unnamed: 0_level_1,mean,std
max_depth,Unnamed: 1_level_2,Unnamed: 2_level_2
1,0.000491,0.000209
2,0.000534,0.000197
3,0.00055,0.000206
4,0.000531,0.000226
5,0.000755,0.000899
6,0.000567,0.000236
7,0.000468,0.000162
8,0.000559,0.000249
9,0.000746,0.000672
10,0.000648,0.000292


CPU times: user 2h 9min 15s, sys: 2min 56s, total: 2h 12min 12s
Wall time: 10min 27s
