In [1]:
%load_ext autoreload
%autoreload 2

from copy import deepcopy as copy
import datetime
import json
from math import ceil
import multiprocessing
import logging
import operator
import os
from pathlib import Path
import random
import sys
import time
import typing
import warnings

import joblib
from joblib import delayed, Parallel
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from numpy.core.numeric import outer
import pandas as pd
from scipy.stats import mode, entropy
import seaborn as sns
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.metrics
from sklearn.metrics import roc_auc_score
import sklearn.model_selection
import typer
from tqdm.auto import tqdm

root = str(Path.cwd().parents[1])
sys.path.append(root)
os.chdir(root)
import src

X, y, s = src.get_Xys()

# Gridsearch over max_depth

## Scenario A

In [5]:
%%time
X, y, s = src.get_Xys()
y = y > 1
cv=sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)
ys = y.astype(str) + s.astype(int).astype(str)
splits = list(cv.split(X=X, y=ys))
iterations = [
    (X, y, s, splits, fold, max_depth)
    for fold in range(5)
    for max_depth in np.arange(1, 11)
]

def fit_score(X, y, s, train_indices, test_indices, fold, max_depth):
    clf = sklearn.ensemble.RandomForestClassifier(
        n_jobs=4, 
        random_state=42,
        max_depth=max_depth
    )
    
    ships_train = X.iloc[train_indices].index
    ships_test = X.iloc[test_indices].index

    X_train, X_test = X.loc[ships_train], X.loc[ships_test]
    y_train, y_test = y.loc[ships_train], y.loc[ships_test]
    s_train, s_test = s.loc[ships_train], s.loc[ships_test]

    X_train = np.ascontiguousarray(X_train.values)
    y_train = np.ascontiguousarray(y_train.values.ravel())
    s_train = np.ascontiguousarray(s_train.values.ravel())
    X_test = np.ascontiguousarray(X_test.values)
    y_test = np.ascontiguousarray(y_test.values.ravel())
    s_test = np.ascontiguousarray(s_test.values.ravel())
    
    vt = sklearn.feature_selection.VarianceThreshold()
    vt.fit(X_train)
    X_train = vt.transform(X_train)
    X_test = vt.transform(X_test) 
    
    start_time = time.time()
    clf.fit(X_train, y_train)
    fit_time = time.time() - start_time
    
    y_score = clf.predict_proba(X_test)[:,1]
    auc_y = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_score)
    auc_s = sklearn.metrics.roc_auc_score(y_true=s_test, y_score=y_score)
    
    return {
        'fold': fold,
        'max_depth': max_depth,
        'auc_y': auc_y,
        'auc_s': auc_s,
        'fit_time': fit_time
    }

results = src.ProgressParallel(n_jobs=50, total=50)(
    src.delayed(fit_score)(X, y, s, train_indices, test_indices, fold, max_depth)
    for fold, (train_indices, test_indices) in enumerate(splits)
    for max_depth in np.arange(1, 11)
)

  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 2.43 s, sys: 1.36 s, total: 3.79 s
Wall time: 22.1 s


In [11]:
data = (
    pd.DataFrame(results).groupby(['max_depth']).agg(
        auc_y_mean=('auc_y', 'mean'),
        auc_y_std=('auc_y', 'std'),
        auc_s_mean=('auc_s', 'mean'),
        auc_s_std=('auc_s', 'std'),
        fit_time_mean=('fit_time', 'mean')
    )
)
data

Unnamed: 0_level_0,auc_y_mean,auc_y_std,auc_s_mean,auc_s_std,fit_time_mean
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.651923,0.007665,0.59325,0.012724,2.109964
2,0.659505,0.00803,0.598051,0.011549,2.252647
3,0.66493,0.008123,0.608145,0.01301,3.539674
4,0.669452,0.009862,0.617056,0.011491,4.855431
5,0.672589,0.010452,0.623509,0.01456,5.666489
6,0.671843,0.008265,0.626936,0.014082,6.679214
7,0.672847,0.011747,0.628576,0.009584,7.147799
8,0.670411,0.011456,0.630841,0.011468,8.157577
9,0.666714,0.009347,0.629596,0.012255,8.238589
10,0.66245,0.012486,0.62712,0.015969,8.812917


In [14]:
data.sort_values('auc_y_mean', ascending=False)

Unnamed: 0_level_0,auc_y_mean,auc_y_std,auc_s_mean,auc_s_std,fit_time_mean
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,0.672847,0.011747,0.628576,0.009584,7.147799
5,0.672589,0.010452,0.623509,0.01456,5.666489
6,0.671843,0.008265,0.626936,0.014082,6.679214
8,0.670411,0.011456,0.630841,0.011468,8.157577
4,0.669452,0.009862,0.617056,0.011491,4.855431
9,0.666714,0.009347,0.629596,0.012255,8.238589
3,0.66493,0.008123,0.608145,0.01301,3.539674
10,0.66245,0.012486,0.62712,0.015969,8.812917
2,0.659505,0.00803,0.598051,0.011549,2.252647
1,0.651923,0.007665,0.59325,0.012724,2.109964


## Scenario B

In [15]:
%%time
X, y, s = src.get_Xys()
y = y > 0
cv=sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)
ys = y.astype(str) + s.astype(int).astype(str)
splits = list(cv.split(X=X, y=ys))
iterations = [
    (X, y, s, splits, fold, max_depth)
    for fold in range(5)
    for max_depth in np.arange(1, 11)
]

def fit_score(X, y, s, train_indices, test_indices, fold, max_depth):
    clf = sklearn.ensemble.RandomForestClassifier(
        n_jobs=4, 
        random_state=42,
        max_depth=max_depth
    )
    
    ships_train = X.iloc[train_indices].index
    ships_test = X.iloc[test_indices].index

    X_train, X_test = X.loc[ships_train], X.loc[ships_test]
    y_train, y_test = y.loc[ships_train], y.loc[ships_test]
    s_train, s_test = s.loc[ships_train], s.loc[ships_test]

    X_train = np.ascontiguousarray(X_train.values)
    y_train = np.ascontiguousarray(y_train.values.ravel())
    s_train = np.ascontiguousarray(s_train.values.ravel())
    X_test = np.ascontiguousarray(X_test.values)
    y_test = np.ascontiguousarray(y_test.values.ravel())
    s_test = np.ascontiguousarray(s_test.values.ravel())
    
    vt = sklearn.feature_selection.VarianceThreshold()
    vt.fit(X_train)
    X_train = vt.transform(X_train)
    X_test = vt.transform(X_test) 
    
    start_time = time.time()
    clf.fit(X_train, y_train)
    fit_time = time.time() - start_time
    
    y_score = clf.predict_proba(X_test)[:,1]
    auc_y = sklearn.metrics.roc_auc_score(y_true=y_test, y_score=y_score)
    auc_s = sklearn.metrics.roc_auc_score(y_true=s_test, y_score=y_score)
    
    return {
        'fold': fold,
        'max_depth': max_depth,
        'auc_y': auc_y,
        'auc_s': auc_s,
        'fit_time': fit_time
    }

results = src.ProgressParallel(n_jobs=50, total=50)(
    src.delayed(fit_score)(X, y, s, train_indices, test_indices, fold, max_depth)
    for fold, (train_indices, test_indices) in enumerate(splits)
    for max_depth in np.arange(1, 11)
)

  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 5.02 s, sys: 643 ms, total: 5.67 s
Wall time: 15.5 s


In [16]:
data = (
    pd.DataFrame(results).groupby(['max_depth']).agg(
        auc_y_mean=('auc_y', 'mean'),
        auc_y_std=('auc_y', 'std'),
        auc_s_mean=('auc_s', 'mean'),
        auc_s_std=('auc_s', 'std'),
        fit_time_mean=('fit_time', 'mean')
    )
)
data

Unnamed: 0_level_0,auc_y_mean,auc_y_std,auc_s_mean,auc_s_std,fit_time_mean
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.755859,0.007934,0.462161,0.019205,1.470736
2,0.768887,0.005656,0.442322,0.014437,2.229689
3,0.785064,0.005728,0.426061,0.01697,2.74483
4,0.798111,0.005474,0.41655,0.018067,3.734565
5,0.807339,0.004767,0.411186,0.01839,4.286266
6,0.813195,0.005199,0.407973,0.01953,4.983107
7,0.818262,0.006534,0.408113,0.021825,5.718615
8,0.821516,0.006712,0.409394,0.020908,6.056742
9,0.824836,0.006241,0.409821,0.02185,6.800837
10,0.826867,0.006093,0.410127,0.021445,7.306278


In [17]:
data.sort_values('auc_y_mean', ascending=False)

Unnamed: 0_level_0,auc_y_mean,auc_y_std,auc_s_mean,auc_s_std,fit_time_mean
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,0.826867,0.006093,0.410127,0.021445,7.306278
9,0.824836,0.006241,0.409821,0.02185,6.800837
8,0.821516,0.006712,0.409394,0.020908,6.056742
7,0.818262,0.006534,0.408113,0.021825,5.718615
6,0.813195,0.005199,0.407973,0.01953,4.983107
5,0.807339,0.004767,0.411186,0.01839,4.286266
4,0.798111,0.005474,0.41655,0.018067,3.734565
3,0.785064,0.005728,0.426061,0.01697,2.74483
2,0.768887,0.005656,0.442322,0.014437,2.229689
1,0.755859,0.007934,0.462161,0.019205,1.470736


# Gridsearch over max_depth and n_estimators

## Scenario A

In [4]:
%%time
clf = sklearn.model_selection.GridSearchCV(
    estimator=sklearn.ensemble.RandomForestClassifier(n_jobs=2, random_state=42),
    param_grid={'max_depth': np.linspace(1, 10, 10), 'n_estimators': [100, 500]},
    scoring='roc_auc',
    refit=True,
    verbose=2,
    cv=sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42),
    n_jobs=5*20,
    return_train_score=True
)
clf.fit(X, y > 1)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: user 1min 15s, sys: 2.71 s, total: 1min 17s
Wall time: 1min 43s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestClassifier(n_jobs=2, random_state=42),
             n_jobs=100,
             param_grid={'max_depth': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]),
                         'n_estimators': [100, 500]},
             return_train_score=True, scoring='roc_auc', verbose=2)

In [5]:
(
    pd.DataFrame(clf.cv_results_)
    .set_index(['param_max_depth', 'param_n_estimators'])
    [['mean_test_score', 'std_test_score', 'mean_train_score']]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,std_test_score,mean_train_score
param_max_depth,param_n_estimators,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,100,0.654215,0.022426,0.66471
1.0,500,0.654132,0.022052,0.663161
2.0,100,0.662827,0.019623,0.678232
2.0,500,0.661789,0.020517,0.677211
3.0,100,0.667345,0.019171,0.698953
3.0,500,0.668134,0.01973,0.698813
4.0,100,0.672,0.018851,0.73255
4.0,500,0.673268,0.019804,0.735057
5.0,100,0.674311,0.018108,0.780547
5.0,500,0.677005,0.018437,0.785489


In [6]:
clf.best_estimator_.n_jobs = 50
scores = sklearn.model_selection.cross_val_score(
    clf.best_estimator_, X, y > 1, 
    cv=sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42),
    scoring='roc_auc'
)

In [7]:
print(f'{scores.mean():.3f} +- {scores.std():.3f}')

0.680 +- 0.016


## Scenario B

In [8]:
%%time
clf = sklearn.model_selection.GridSearchCV(
    estimator=sklearn.ensemble.RandomForestClassifier(n_jobs=2, random_state=42),
    param_grid={'max_depth': np.linspace(1, 10, 10), 'n_estimators': [100, 500]},
    scoring='roc_auc',
    refit=True,
    verbose=2,
    cv=sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42),
    n_jobs=5*20,
    return_train_score=True
)
clf.fit(X, y > 0)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: user 1min 33s, sys: 4.1 s, total: 1min 38s
Wall time: 1min 50s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestClassifier(n_jobs=2, random_state=42),
             n_jobs=100,
             param_grid={'max_depth': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]),
                         'n_estimators': [100, 500]},
             return_train_score=True, scoring='roc_auc', verbose=2)

In [9]:
(
    pd.DataFrame(clf.cv_results_)
    .set_index(['param_max_depth', 'param_n_estimators'])
    [['mean_test_score', 'std_test_score', 'mean_train_score']]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test_score,std_test_score,mean_train_score
param_max_depth,param_n_estimators,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,100,0.758732,0.005056,0.762571
1.0,500,0.761042,0.008086,0.765311
2.0,100,0.772904,0.008019,0.778319
2.0,500,0.773277,0.008477,0.778731
3.0,100,0.785921,0.008533,0.79263
3.0,500,0.787877,0.007851,0.794779
4.0,100,0.798004,0.007392,0.807606
4.0,500,0.799362,0.007247,0.808773
5.0,100,0.807474,0.006834,0.821489
5.0,500,0.808089,0.006717,0.822087


In [10]:
clf.best_estimator_.n_jobs = 50
scores = sklearn.model_selection.cross_val_score(
    clf.best_estimator_, X, y > 0, 
    cv=sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42),
    scoring='roc_auc'
)

In [11]:
print(f'{scores.mean():.3f} +- {scores.std():.3f}')

0.828 +- 0.004
