# Learn

In [1]:
import json
import os
from pathlib import Path
import sys
import time

import joblib
import numpy as np
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.metrics
import sklearn.model_selection
from tqdm.auto import tqdm

# Make the project root (two directory levels above the notebook's starting
# working directory) both importable and the current working directory, so the
# `import src` below and the relative 'cache/...' paths used later resolve.
#
# The guard makes this cell safe to re-run in the same kernel: once we have
# chdir'ed into the root, evaluating `Path.cwd().parents[1]` again would climb
# two more levels and register a wrong directory.
if 'root' not in globals():
    root = str(Path.cwd().parents[1])
    sys.path.append(root)
    os.chdir(root)
import src

In [None]:
# Fit one FairRandomForestClassifier per (n_bins, orthogonality, max_depth, fold)
# combination and cache every fitted model as a '.pkl' file. Combinations whose
# model file already exists are skipped, so the cell can be interrupted and resumed.

X, y, s = src.get_Xys()
y = y>0  # binarise the target
# Combined label handed to the splitter so folds are stratified on y and s together.
# NOTE(review): assumes s is a small-integer group code so y + 3*s is unique per (y, s) pair - confirm.
ys = y + 3*s

# n_splits is left at the scikit-learn default; the fixed seed makes the folds
# reproducible, which the later cells rely on when they rebuild the same split.
skf = sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)
folds = list(skf.split(X=X, y=ys))

iterator = [
    (orthogonality, int(max_depth), n_bins, fold, train_idx, test_idx) 
    for n_bins in [2, 10]
    for orthogonality in np.linspace(0, 1, 11)
    for max_depth in np.arange(1, 11)
    for fold, (train_idx, test_idx) in enumerate(folds)
]

# Create the cache directory up front; otherwise a missing directory would only
# surface as an error after the first (expensive) fit has already finished.
cache_dir = 'cache/FRF_v3_B'
os.makedirs(cache_dir, exist_ok=True)

for ortho, max_depth, n_bins, fold, train_idx, test_idx in tqdm(iterator, 
                                                                mininterval=0, 
                                                                miniters=1):
    assert isinstance(max_depth, int)
    assert isinstance(n_bins, int)
    filepath = f'{cache_dir}/{max_depth}-{ortho:.2f}-{n_bins}-{fold}'
    if not os.path.isfile(filepath + '.pkl'):
        # Only the training split is needed here; the test split is handled
        # by the prediction step.
        X_train = X[train_idx]
        y_train = y[train_idx]
        s_train = s[train_idx]

        # Drop features that are constant within this training fold.
        vt = sklearn.feature_selection.VarianceThreshold()
        vt.fit(X_train)
        X_train = vt.transform(X_train)

        clf = src.FairRandomForestClassifier(
            max_depth=max_depth, 
            n_estimators=100, 
            orthogonality=ortho, 
            n_jobs=100,  # NOTE(review): hard-coded; presumably the number of parallel workers - confirm for this machine
            n_bins=n_bins
        ) 
        start = time.time()
        clf.fit(X_train, y_train, s_train)
        # Stop the clock before serialising so fit_time measures fitting only.
        fit_time = time.time() - start

        # Write to a temporary name and rename afterwards: an interrupted dump
        # must not leave a truncated '.pkl' that the isfile() check above would
        # later mistake for a finished model.
        tmp_path = filepath + '.pkl.tmp'
        joblib.dump(clf, tmp_path)
        os.replace(tmp_path, filepath + '.pkl')

# Predict

In [None]:
import os

import joblib
import numpy as np
import sklearn.feature_selection
import sklearn.model_selection
from tqdm.auto import tqdm

import src

# For every cached model, score the matching test fold and cache the scores as
# a '.npy' file next to the model. Combinations whose score file already exists
# are skipped.

X, y, s = src.get_Xys()
y = y>0  # binarise the target
# Combined label for stratification; must be built exactly as in the training
# cell so that the same folds are regenerated here.
ys = y + 3*s

skf = sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)
folds = list(skf.split(X=X, y=ys))

iterator = [
    (orthogonality, int(max_depth), n_bins, fold, train_idx, test_idx) 
    for orthogonality in np.linspace(0, 1, 11)
    for max_depth in np.arange(1, 11)
    for n_bins in [2, 10]
    for fold, (train_idx, test_idx) in enumerate(folds)
]

for ortho, max_depth, n_bins, fold, train_idx, test_idx in tqdm(iterator):
    assert isinstance(max_depth, int)
    assert isinstance(n_bins, int)
    filepath = f'cache/FRF_v3_B/{max_depth}-{ortho:.2f}-{n_bins}-{fold}'

    if not os.path.isfile(filepath + '.npy'):
        # Re-fit the variance filter on the training fold so the test features
        # get the same column selection the model was trained with. Only the
        # test features are needed afterwards.
        vt = sklearn.feature_selection.VarianceThreshold()
        vt.fit(X[train_idx])
        X_test = vt.transform(X[test_idx])

        assert os.path.isfile(filepath + '.pkl'), (
            f"missing fitted model '{filepath}.pkl'; run the training cell first"
        )
        # joblib.load unpickles arbitrary code: only load model files that were
        # produced by this project's own training cell.
        clf = joblib.load(filepath + '.pkl')
        y_score = clf.predict_proba(X_test)[:,1] #type: ignore

        # Save under a temporary name and rename, so an interrupted write never
        # leaves a truncated '.npy' that would be treated as complete later.
        tmp_path = filepath + '.tmp.npy'
        np.save(tmp_path, y_score)
        os.replace(tmp_path, filepath + '.npy')

# Measure

In [None]:
import json
import os

import joblib
import numpy as np
import sklearn.model_selection
from tqdm.auto import tqdm

import src

# Compute the evaluation measures for every cached score file, store them as
# one '.json' per run, and collect all runs into a single pickled list.

# When truthy, measures are recomputed even if a '.json' for the run exists.
overwrite = True

X, y, s = src.get_Xys()
y = y > 0
ys = y + 3 * s

skf = sklearn.model_selection.StratifiedKFold(shuffle=True, random_state=42)
folds = list(skf.split(X=X, y=ys))

# Enumerate the grid explicitly: orthogonality outermost, fold innermost.
iterator = []
for ortho in np.linspace(0, 1, 11):
    for max_depth in np.arange(1, 11):
        for n_bins in [2]:
            for fold, (train_idx, test_idx) in enumerate(folds):
                iterator.append(
                    (ortho, int(max_depth), n_bins, fold, train_idx, test_idx)
                )

results = []

for ortho, max_depth, n_bins, fold, train_idx, test_idx in tqdm(iterator):
    assert isinstance(max_depth, int)
    assert isinstance(n_bins, int)
    filepath = f'cache/FRF_v3_B/{max_depth}-{ortho:.2f}-{n_bins}-{fold}'
    json_path = filepath + '.json'
    npy_path = filepath + '.npy'

    if os.path.isfile(json_path) and not overwrite:
        # Reuse the measures stored by an earlier run.
        with open(json_path) as file:
            result_this_run_ = json.load(file)
    else:
        y_test = y[test_idx]
        s_test = s[test_idx]

        assert os.path.isfile(npy_path)
        y_score = np.load(npy_path)

        measures = src.get_measures(y_test, y_score, s_test)

        # Run identifiers first, then every measure returned by src.get_measures
        # (a measure with the same key would replace the identifier).
        result_this_run_ = {
            'fold': fold,
            'max_depth': max_depth,
            'orthogonality': ortho,
            'n_bins': n_bins,
        }
        result_this_run_.update(measures)

        with open(json_path, 'w') as file:
            json.dump(result_this_run_, file)

    results.append(result_this_run_)

joblib.dump(results, 'cache/FRF_v3_B_non-nested.pkl')