In [1]:
# Root folder appended to sys.path so the local `midas` package can be imported.
# Raw string: '\G' is not a valid escape sequence, so the plain literal only
# worked by accident (and warns on recent Python versions).
FOLDER_ROOT = r'F:\Github'

In [2]:
import numpy as np

import pandas as pd

#import matplotlib as mpl
#import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

import sys
sys.path.append(FOLDER_ROOT)

from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score, f1_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score

from joblib import Parallel, delayed

import numpy as np

from os import cpu_count

In [3]:
from midas.v3.ktree import KTreeClassifier

In [4]:
# Load the agr_a streaming dataset published in the scikit-multiflow
# streaming-datasets repository.
df_stream = pd.read_csv(
    'https://github.com/scikit-multiflow/streaming-datasets/raw/master/agr_a.csv',
    engine='c',
    low_memory=True,
    memory_map=True,
)

# Features are every column except the last one; the target is the
# 'class' column.
X = df_stream.loc[:, df_stream.columns[:-1]]
y = df_stream.loc[:, 'class']

In [None]:
class KTreeClassifier(BaseEstimator):
    """
    K Tree Classifier V3.

    Wraps an already fitted ensemble (any object exposing ``estimators_``),
    ranks its base estimators with a scikit-learn metric and keeps the best
    ``k_tree`` of them. Predictions are the majority vote of the kept
    estimators.

    Parameters
    ----------
    ensemble_base : object
        Fitted ensemble; its ``estimators_`` sequence is scored and queried.
    k_tree : int or float, default=25
        Number of estimators to keep. A float is read as a fraction of the
        ensemble size (the ensemble must already be fitted in that case).
    strategy : {'auto', 'score'}, default='auto'
        'score' ranks the estimators with ``metric_type``. 'auto' is a
        placeholder that currently selects nothing, so ``predict`` will
        report the classifier as not fitted.
    metric_type : str, default='roc_auc_score'
        One of the keys of ``_METRIC_DEFAULTS``.
    metric_param : dict or None, default=None
        Overrides for the keyword arguments forwarded to the metric.
    voting : {'majority'}, default='majority'
        Voting scheme used by ``predict``.
    n_jobs : int, default=-1
        Number of workers; negative values are resolved like joblib does.
    verbose : int, default=0
        Verbosity forwarded to ``joblib.Parallel``.
    """

    # Keyword arguments forwarded to each supported metric, together with the
    # value used when ``metric_param`` does not override it.
    _METRIC_DEFAULTS = {
        'accuracy_score': {'normalize': True,
                           'sample_weight': None},
        'cohen_kappa_score': {'labels': None,
                              'weights': None,
                              'sample_weight': None},
        'f1_score': {'labels': None,
                     'pos_label': 1,
                     'average': 'binary',
                     'sample_weight': None,
                     'zero_division': 'warn'},
        'balanced_accuracy_score': {'sample_weight': None,
                                    'adjusted': False},
        'precision_score': {'labels': None,
                            'pos_label': 1,
                            'average': 'binary',
                            'sample_weight': None,
                            'zero_division': 'warn'},
        'recall_score': {'labels': None,
                         'pos_label': 1,
                         'average': 'binary',
                         'sample_weight': None,
                         'zero_division': 'warn'},
        'roc_auc_score': {'average': 'macro',
                          'sample_weight': None,
                          'max_fpr': None,
                          'multi_class': 'raise',
                          'labels': None},
    }

    def __init__(self,
                 ensemble_base,
                 k_tree=25,
                 strategy="auto",
                 metric_type="roc_auc_score",
                 metric_param=None,
                 voting="majority",
                 n_jobs=-1,
                 verbose=0):
        # BaseEstimator.get_params() (and therefore repr) looks up attributes
        # named exactly like the constructor arguments, so keep those too.
        self.ensemble_base = ensemble_base
        self.k_tree = k_tree
        # Working names used by the rest of the class.
        self.ensemble = ensemble_base
        self.n_tree = k_tree
        self.strategy = str(strategy).lower()
        self.metric_type = str(metric_type).lower()
        # A fresh dict per instance instead of a shared mutable default.
        self.metric_param = {} if metric_param is None else metric_param
        self.voting = str(voting).lower()
        self.n_jobs = n_jobs
        self.verbose = verbose
        # State filled in by fit()/fit_update().
        self.metric = None
        self.args_metric = None
        self.index_k_tree = []
        self.fit_scores_ = []

        # A float k_tree is a fraction of the ensemble size.
        if isinstance(k_tree, float):
            self.n_tree = int(k_tree * self._get_ensemble_size())

    def _get_n_jobs(self):
        """
        Resolve ``n_jobs`` to a concrete worker count (always >= 1 for
        None or negative values).

        None means one worker; negative values follow the joblib rule
        ``n_cpus + 1 + n_jobs`` (so -1 is every CPU). ``cpu_count()`` may
        return None, which is treated as a single CPU.
        """
        if self.n_jobs is None:
            return 1
        if self.n_jobs < 0:
            return max(1, (cpu_count() or 1) + 1 + self.n_jobs)
        return self.n_jobs

    def get_score_train(self, std=False):
        """
        Mean metric score of the selected estimators, as measured in the
        last fit. With ``std=True`` returns ``(mean, std)`` instead.
        """
        if std:
            return (np.mean(self.fit_scores_), np.std(self.fit_scores_))
        return np.mean(self.fit_scores_)

    def get_index_estimators(self):
        """
        Positions (in ``ensemble.estimators_``) of the selected estimators,
        returned as a new list.
        """
        return list(self.index_k_tree)

    def get_nk_tree(self):
        """
        Number of estimators the selection keeps.
        """
        return self.n_tree

    def _get_ensemble_size(self):
        """
        Number of base estimators in the wrapped ensemble.
        """
        return len(self.ensemble.estimators_)

    def _get_metric(self):
        """
        Map ``metric_type`` to the scikit-learn metric function.

        Raises
        ------
        ValueError
            If ``metric_type`` is not a supported metric name.
        """
        metrics = {
            'accuracy_score': accuracy_score,
            'cohen_kappa_score': cohen_kappa_score,
            'f1_score': f1_score,
            'balanced_accuracy_score': balanced_accuracy_score,
            'precision_score': precision_score,
            'recall_score': recall_score,
            'roc_auc_score': roc_auc_score,
        }
        if self.metric_type not in metrics:
            raise ValueError(
                'Unknown metric_type %r; supported metrics are: %s'
                % (self.metric_type, ', '.join(metrics))
            )
        return metrics[self.metric_type]

    def _get_metric_params(self):
        """
        Build the keyword arguments for the selected metric: the defaults
        from ``_METRIC_DEFAULTS`` overridden by ``metric_param``.

        Raises
        ------
        ValueError
            If ``metric_type`` is not a supported metric name.
        """
        defaults = self._METRIC_DEFAULTS.get(self.metric_type)
        if defaults is None:
            raise ValueError(
                'Unknown metric_type %r; cannot determine its auxiliary '
                'parameters' % (self.metric_type,)
            )

        dict_params = {name: self.metric_param.get(name, default)
                       for name, default in defaults.items()}

        # Earlier versions read the misspelled key 'zero_divisiont' for
        # f1_score; keep honouring it when the correct key is absent.
        if (self.metric_type == 'f1_score'
                and 'zero_division' not in self.metric_param):
            dict_params['zero_division'] = self.metric_param.get(
                'zero_divisiont', 'warn')

        return dict_params

    def _parallel_predict(self, est, X, check_input=True):
        """
        Predict ``X`` with ``est`` by splitting the rows into chunks that
        are predicted in parallel threads, and return one flat array.

        TODO: validate that ``est`` is a classifier exposing ``predict``.
        """
        # Never create more chunks than rows: an empty chunk would make
        # est.predict fail.
        n_chunks = max(1, min(self._get_n_jobs(), len(X)))

        y_chunks = Parallel(n_jobs=n_chunks,
                            verbose=self.verbose,
                            require='sharedmem'
        )(
            delayed(est.predict)(Xi, check_input)
            for Xi in np.array_split(X, n_chunks)
        )

        # array_split may return chunks of different lengths, so the pieces
        # are joined end to end rather than stacked and ravelled.
        return np.concatenate(y_chunks)

    def _metric_evaluable(self, index, est, X, y_true, check_input=True):
        """
        Score one estimator: returns ``(index, score)`` where score is the
        configured metric between ``y_true`` and ``est``'s predictions on X.
        """
        y_pred = self._parallel_predict(est, X, check_input)
        score = self.metric(y_true, y_pred, **self.args_metric)

        return (index, score)

    def _run_strategy(self, X, y, check_input):
        """
        Run the selection strategy shared by fit() and fit_update().

        Raises
        ------
        ValueError
            If ``strategy`` is neither 'score' nor 'auto'.
        """
        if self.strategy == 'score':
            self._fit_score(X, y, check_input)
        elif self.strategy == 'auto':
            self._fit_auto()
        else:
            raise ValueError('Unknow strategy!')

    def fit(self, X, y, check_input=True):
        """
        Select the best estimators using the labelled data (X, y).

        X -> feature values, one row per instance
        y -> label for each X(i)

        Returns self. Raises if a selection already exists (use
        fit_update() to redo it) or if the strategy is unknown.
        """
        if len(self.index_k_tree) != 0:
            raise Exception("Classifier is fitted!")

        self._run_strategy(X, y, check_input)
        return self

    def _fit_score(self, X_true, y_true, check_input=True):
        """
        Score every base estimator on (X_true, y_true) and keep the
        ``n_tree`` highest-scoring ones.

        Stores their positions in ``index_k_tree`` and their scores in
        ``fit_scores_`` (both ordered from best to worst).
        """
        self.metric = self._get_metric()
        self.args_metric = self._get_metric_params()

        scored = Parallel(n_jobs=self.n_jobs,
                          verbose=self.verbose,
                          require='sharedmem'
        )(
            delayed(self._metric_evaluable)(idx, est, X_true, y_true, check_input)
            for idx, est in enumerate(self.ensemble.estimators_)
        )

        best = sorted(scored, key=lambda pair: pair[1],
                      reverse=True)[:self.n_tree]

        self.index_k_tree = np.array([idx for idx, _ in best], dtype=np.int64)
        self.fit_scores_ = np.array([score for _, score in best],
                                    dtype=np.float64)

    def fit_update(self, X, y, check_input=True):
        """
        Redo the selection of an already fitted classifier with new data.

        X -> feature values, one row per instance
        y -> label for each X(i)

        Returns self. Raises if fit() has not produced a selection yet or
        if the strategy is unknown.
        """
        # Test the length, not the sum: the selection may legitimately be
        # just estimator 0, and before fitting this is a plain list.
        if len(self.index_k_tree) == 0:
            raise Exception("Classifier isn't fitted")

        self._run_strategy(X, y, check_input)
        return self

    def _fit_auto(self):
        """
        Placeholder for the 'auto' strategy; not implemented yet.
        """
        pass

    def predict(self, X, check_input=True):
        """
        Predict the label of each row of X with the configured voting
        scheme.

        Raises if no estimators have been selected yet or if ``voting`` is
        not 'majority'.
        """
        if len(self.index_k_tree) == 0:
            raise Exception("Classifier not is fitted!")

        if self.voting == "majority":
            return self._predict_majority(X, check_input)
        else:
            raise Exception('Method of voting required')

    def _predict_majority(self, X, check_input=True):
        """
        Majority vote of the selected estimators for each row of X.

        Ties go to the smallest label, because argmax returns the first
        maximum.
        """
        votes = Parallel(n_jobs=self.n_jobs,
                         verbose=self.verbose,
                         require='sharedmem'
        )(
            delayed(self._parallel_predict)(self.ensemble.estimators_[pos], X, check_input)
            for pos in self.index_k_tree
        )

        # One row per instance, one column per selected estimator.
        # NOTE(review): np.bincount only accepts non-negative integers, so
        # this assumes the base estimators emit labels encoded that way.
        votes = np.array(votes, dtype=np.int64).T

        return np.apply_along_axis(lambda row: np.bincount(row).argmax(),
                                   axis=1, arr=votes)

    def score(self, X, y):
        """
        Not implemented yet (returns None).

        TODO: implement, including the sample_weight parameter.
        """
        pass

    def predict_proba(self):
        """
        Not implemented yet (returns None).
        """
        pass

    def learn_one(self):
        """
        Not implemented yet (returns None).

        Intended as a learning wrapper compatible with the river platform
        while still using batch (scikit-learn) classifiers.
        """
        pass

    def predict_one(self):
        """
        Not implemented yet (returns None).

        Intended as a prediction wrapper compatible with the river platform
        while still using batch (scikit-learn) classifiers.
        """
        pass

In [5]:
# First 1000 rows train the forest; the next 1000 are kept for testing.
X_train = X[:1000].values
y_train = y[:1000].values
X_test = X[1000:2000].values
y_test = y[1000:2000].values

clf_rf = RandomForestClassifier(n_estimators=100,
                                criterion='entropy',
                                # 'sqrt' is what the removed alias 'auto'
                                # meant for classifiers.
                                max_features='sqrt',
                                bootstrap=True,
                                oob_score=True,
                                random_state=42,
                                warm_start=True,
                                n_jobs=-1
)
clf_rf = clf_rf.fit(X_train, y_train)
print("\nRandom Forest (with 100 trees) trained.")
print('Score within of training data = ', clf_rf.score(X_train, y_train))


Random Forest (with 100 trees) trained.
Score within of training data =  1.0


In [6]:
# Run the k-tree selection on the fitted forest using the 'score' strategy.
ktree = KTreeClassifier(clf_rf, strategy="score").fit(X_train, y_train)

In [7]:
# Predict the labels of the held-out rows with the fitted k-tree selector.
y_pred = ktree.predict(X_test)

In [8]:
# Sanity check: one prediction is expected per row of X_test.
y_pred.shape

(1000,)

In [9]:
# Mean fit-time score of the selected trees.
# NOTE(review): `ktree` comes from the imported midas class; this reading is
# based on the in-notebook definition of get_score_train — confirm they match.
ktree.get_score_train()

0.9709877542212871

In [10]:
# Same score, this time requested with its spread (a 2-tuple is displayed).
ktree.get_score_train(True)

(0.9709877542212871, 0.0031668865415189597)

In [11]:
# Peek at the first ten predictions.
y_pred[:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0], dtype=int64)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score, f1_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score



def _get_metric(metric_type):
    """
        Decode the metric function to use for evaluable classifiers
    """
    if metric_type == 'accuracy_score':
        return accuracy_score
    elif metric_type == 'cohen_kappa_score':
        return cohen_kappa_score
    elif metric_type == 'f1_score':
        return f1_score
    elif metric_type == 'balanced_accuracy_score':
        return balanced_accuracy_score
    elif metric_type == 'precision_score':
        return precision_score
    elif metric_type == 'recall_score':
        return recall_score
    elif metric_type == 'roc_auc_score':
        return roc_auc_score
    else:
        raise Exception('The parameter metric_type can not empty or unknow')
        
        
# Quick check of the name -> function lookup: resolve 'roc_auc_score' and
# print the function object that came back.
metric = _get_metric('roc_auc_score')
print(metric)

In [None]:
# Single decision tree used as the estimator in the parallel-prediction
# experiments below; trained on the first 1000 rows.
random_seed_4 = 42

est = DecisionTreeClassifier(criterion='entropy',
                             # 'sqrt' is what the removed alias 'auto'
                             # meant for classifiers.
                             max_features='sqrt',
                             random_state=random_seed_4
)
est = est.fit(X[:1000], y[:1000])
print("\nDecision Tree (Single) trained.")
print('Score within of training data = ', est.score(X[:1000], y[:1000]))

In [None]:
from joblib import Parallel, delayed
from os import cpu_count

n_jobs = -1
verbose = 0
check_input = True
sample_weight = None
ci = check_input

def _get_n_jobs():
  return (cpu_count() if n_jobs == -1 else n_jobs)

# Split rows 1900-1999 into one chunk per worker and predict the chunks in
# parallel threads with the single decision tree.
y_pred = Parallel(n_jobs=_get_n_jobs(),
                  verbose=verbose,
                  require='sharedmem'
)(
    delayed(est.predict)(Xi, ci)
    for Xi in np.array_split(X[1900:2000], _get_n_jobs())
)

# array_split can return chunks of different lengths (100 rows rarely divide
# evenly), so join the pieces end to end instead of ravelling a ragged list.
y_pred = np.concatenate(y_pred)
print(y_pred.shape)
print(y_pred[:10])

In [None]:
# ROC AUC of the chunked predictions against the true labels of rows
# 1900-1999, with every keyword argument spelled out explicitly.
metric = roc_auc_score
dp = dict(
    average='weighted',
    sample_weight=None,
    max_fpr=None,
    multi_class='ovr',
    labels=None,
)
metric(y[1900:2000], y_pred, **dp)

In [None]:
# Draw the top two levels of the decision tree.
# NOTE(review): the matplotlib imports at the top of the notebook are
# commented out and `clf_dt` is not assigned in any visible cell before this
# one — confirm both `plt` and `clf_dt` exist before running.
fig, axs = plt.subplots(figsize=(12, 20))
plot_tree(clf_dt, max_depth=2, ax=axs)
plt.show();

In [None]:
# NOTE(review): `clf_dt` is not assigned in any visible cell before this
# one — confirm where it is created before running.
clf_dt = clf_dt.fit(X[1000:2000], y[1000:2000])
print("\nDecision Tree (Single) trained.")
# Only rows 1000-1999 are used in this fit, so the other two scores are
# labelled by the rows they cover instead of as training data.
print('Score on rows 0-999 (not used in this fit) = ', clf_dt.score(X[:1000], y[:1000]))
print('Score within of training data = ', clf_dt.score(X[1000:2000], y[1000:2000]))
print('Score on rows 2000-2999 (not used in this fit) = ', clf_dt.score(X[2000:3000], y[2000:3000]))

In [None]:
# Draw the top two levels of the tree again, after the refit on rows
# 1000-1999.
# NOTE(review): the matplotlib imports at the top of the notebook are
# commented out — confirm `plt` is available before running.
fig, axs = plt.subplots(figsize=(12, 20))
plot_tree(clf_dt, max_depth=2, ax=axs)
plt.show();