In [1]:
# Change dir to repo root if running from repo (rather than pip installed)
# (Assuming running from [repo]/notes/)
# The chdir is relative, so running it again would climb one more directory
# each time. The flag below makes re-running this cell in the same kernel a
# no-op; a fresh kernel starts without the flag and changes directory once.
import os
if not globals().get('_CHDIR_TO_REPO_ROOT_DONE', False):
    os.chdir('../')
    _CHDIR_TO_REPO_ROOT_DONE = True

%load_ext autoreload
%autoreload 2

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import math

from typing import Tuple

from incremental_trees.trees import StreamingRFC

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.classification import classification_report
from sklearn.base import clone

In [6]:
# Load the breast cancer data as (features, labels) and hold out 25% of the
# rows as the test split; random_state is fixed at 123.
x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=123,
)

In [24]:
def fit_srfc(srfc, x, y,
             sequential: bool=True,
             n_prop: float=0.1,
             n_p_fits: int=10,
             classes=None) -> "StreamingRFC":
    """
    Fit the streaming RFC with repeated .partial_fit calls on subsets of the rows.

    Total number of rows used in training varies depending on sequential.

    sequential==True
    The rows are split, in order, into n_p_fits contiguous chunks and each chunk is passed to one
    .partial_fit call, so in total 100% of rows are used for training once. Chunk sizes differ by at
    most one row: when the row count is not divisible by n_p_fits the leftover rows are spread over
    the first chunks instead of being dropped.
    If there are 10 calls, ~10% of data is used in each .partial_fit call.
    If 100 calls, ~1% of data used in each .partial_fit call.
    If n_p_fits is larger than the number of rows, the empty chunks are skipped, so fewer than
    n_p_fits calls are made.
    This is similar to the Dask use case.

    sequential==False
    Randomly sample int(n_rows * n_prop) row indexes with replacement, n_p_fits times.
    Setting both the proportion and the number of calls allows over sampling, to compare more
    directly with RandomForest.
    If there are 10 calls and 10% of data is used in each .partial_fit call: 100% of rows drawn.
    If 100 calls, 10% of data used in each .partial_fit call: 1000% of rows drawn.

    :param srfc: Estimator exposing .partial_fit(x, y, classes=...).
    :param x: 2D array of features, indexable as x[idx, :] with an integer index array.
    :param y: 1D array of labels, indexable as y[idx] with an integer index array.
    :param sequential: If True step through all data once. If False, draw n_prop proportions of
                       data n_p_fits times.
    :param n_prop: When sequential is False, use to set prop of data to draw in each .partial_fit call.
    :param n_p_fits: Number of partial_fit calls to make.
    :param classes: Class labels forwarded to every .partial_fit call. Defaults to [0, 1].

    :return: The same srfc object that was passed in, after the .partial_fit calls.
    :raises ValueError: If sequential is True and n_p_fits is less than 1.
    """

    n_rows = x.shape[0]
    class_labels = [0, 1] if classes is None else classes

    if sequential:
        if n_p_fits < 1:
            raise ValueError("n_p_fits must be >= 1 when sequential is True")

        # Step through all data once. array_split hands the remainder rows to the
        # first chunks, so every row lands in exactly one chunk.
        for idx in np.array_split(np.arange(n_rows), n_p_fits):
            if idx.size == 0:
                # More requested fits than rows: nothing left to train on
                continue
            srfc.partial_fit(x[idx, :], y[idx],
                             classes=class_labels)
    else:
        # Sample n_prop of data n_p_fits times
        n_sample_rows = int(n_rows * n_prop)
        for _ in range(n_p_fits):
            # Sample indexes with replacement
            idx = np.random.randint(0, n_rows, n_sample_rows)
            srfc.partial_fit(x[idx, :], y[idx],
                             classes=class_labels)

    return srfc

def mod_report(mod, x_train, x_test, y_train, y_test):
    """
    Print a classification report for the test split, followed by train and test ROC AUC.

    :param mod: Fitted model exposing .predict and .predict_proba.
    :param x_train: Training features, scored for the train AUC.
    :param x_test: Test features, used for the report and the test AUC.
    :param y_train: Training labels.
    :param y_test: Test labels.

    :return: None, results are printed.
    """

    def _auc(features, labels):
        # Column index 1 of predict_proba is used as the score
        scores = mod.predict_proba(features)[:, 1]
        return roc_auc_score(labels, scores)

    test_predictions = mod.predict(x_test)
    report = classification_report(y_test, test_predictions)
    train_auc = _auc(x_train, y_train)
    test_auc = _auc(x_test, y_test)

    print(report)
    print(f"Train AUC: {train_auc}")
    print(f"Test AUC: {test_auc}")

# 10 full trees vs equivalents
RFC: 10 trees, each fit with 100% of the training data

## vs SRFC: 10 x 1 x 0.1 vs 10
10 fits with 1 tree on 10% of data each

In [32]:
# Baseline: a 10-estimator random forest fit on the whole training split
rfc = RandomForestClassifier(n_estimators=10).fit(x_train, y_train)
mod_report(rfc, x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        54
           1       0.99      1.00      0.99        89

   micro avg       0.99      0.99      0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143

Train AUC: 0.9998583034196108
Test AUC: 0.9886600083229296


In [33]:
# 10 partial_fit calls, 1 estimator per chunk, each call on a with-replacement
# sample of 10% of the training rows
srfc = fit_srfc(
    StreamingRFC(n_estimators_per_chunk=1),
    x_train,
    y_train,
    sequential=False,
    n_prop=0.1,
    n_p_fits=10,
)

mod_report(srfc, x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        54
           1       0.94      0.94      0.94        89

   micro avg       0.93      0.93      0.93       143
   macro avg       0.93      0.93      0.93       143
weighted avg       0.93      0.93      0.93       143

Train AUC: 0.9814495560173814
Test AUC: 0.9889721181856014


## vs SRFC: 100 x 1 x 0.1 vs 10
100 fits with 1 tree on 10% of data each

In [34]:
# 100 partial_fit calls, 1 estimator per chunk, each call on a with-replacement
# sample of 10% of the training rows
srfc = fit_srfc(
    StreamingRFC(n_estimators_per_chunk=1),
    x_train,
    y_train,
    sequential=False,
    n_prop=0.1,
    n_p_fits=100,
)

mod_report(srfc, x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.98      0.93      0.95        54
           1       0.96      0.99      0.97        89

   micro avg       0.97      0.97      0.97       143
   macro avg       0.97      0.96      0.96       143
weighted avg       0.97      0.97      0.96       143

Train AUC: 0.9757580767050822
Test AUC: 0.9916770703287556


## vs SRFC: 100 x 10 x 0.1 vs 10
100 fits with 10 trees on 10% of data each

In [35]:
# 100 partial_fit calls, 10 estimators per chunk, each call on a with-replacement
# sample of 10% of the training rows
srfc = fit_srfc(
    StreamingRFC(n_estimators_per_chunk=10),
    x_train,
    y_train,
    sequential=False,
    n_prop=0.1,
    n_p_fits=100,
)

mod_report(srfc, x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.98      0.83      0.90        54
           1       0.91      0.99      0.95        89

   micro avg       0.93      0.93      0.93       143
   macro avg       0.94      0.91      0.92       143
weighted avg       0.93      0.93      0.93       143

Train AUC: 0.967846684300019
Test AUC: 0.9850187265917603


## vs SRFC: 100 x 1 x 0.1 vs 10 (all features per tree)
100 fits with 1 tree on 10% of data each, with max_features set to the full number of feature columns

In [49]:
# 100 partial_fit calls, 1 estimator per chunk, 10% of rows sampled per call;
# max_features is set to the number of columns in x_train
srfc = fit_srfc(
    StreamingRFC(
        n_estimators_per_chunk=1,
        max_features=x_train.shape[1],
    ),
    x_train,
    y_train,
    sequential=False,
    n_prop=0.1,
    n_p_fits=100,
)

mod_report(srfc, x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.98      0.93      0.95        54
           1       0.96      0.99      0.97        89

   micro avg       0.97      0.97      0.97       143
   macro avg       0.97      0.96      0.96       143
weighted avg       0.97      0.97      0.96       143

Train AUC: 0.9817801813716229
Test AUC: 0.9847066167290888


## vs SRFC: 33 x 3 x 0.1 vs 10 (sampled features per tree)
33 fits with 3 trees on 10% of data each

In [48]:
# 33 partial_fit calls, 3 estimators per chunk, each call on a with-replacement
# sample of 10% of the training rows
srfc = fit_srfc(
    StreamingRFC(n_estimators_per_chunk=3),
    x_train,
    y_train,
    sequential=False,
    n_prop=0.1,
    n_p_fits=33,
)

mod_report(srfc, x_train, x_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        54
           1       0.99      0.99      0.99        89

   micro avg       0.99      0.99      0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143

Train AUC: 0.9844606083506519
Test AUC: 0.9887640449438202
