**Implementation of a Weighted Ensemble Classifier**

```
Implementation of a Weighted Ensemble Classifier.
The implementation follows the techniques described in
"Mining Concept-Drifting Data Streams using Ensemble Classifiers", by
Haixun Wang, Wei Fan, Philip S. Yu, Jiawei Han
```




In [0]:
#Useful Imports
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import operator
import copy as cp
import sortedcontainers as sc
from sklearn.model_selection import StratifiedKFold

In [0]:
#Class WeightedEnsembleClassifier

class WeightedEnsembleClassifier:
                                                            #k: maximum number of classifiers in the ensemble
                                                            #S: chunk size
                                                            #base learner: base astimator
                                                            #CV: number of folds to compute the score of a newly added classifier

    class WeightedClassifier:                             #An inner class to control weights and additional information of a base learner in the ensemble
     #Create a new weighted classifier
         
        def __init__(self, clf, weight, chunk_labels):                   
            self.clf = clf                                  
            self.weight = weight                            # the weight associated to this classifier
            self.chunk_labels = chunk_labels                # the unique labels of the data chunk the classifier(clf) is trained on

        def __lt__(self, other):                            # Compares an object of this class to the other by means of the weight
            return self.weight < other.weight               # for sorting the classifier in sorted list and returns true (smaller weight than other weight)

    def __init__(self, K=10, base_learner=DecisionTreeClassifier(), S=200, cv=5):           #new ensemble
                                                                                            #K: the maximum number of classifier in this ensemble
                                                                                            #base_learner: the base learner, other classifiers will be a deep-copy of the base learner
                                                                                            #S: the chunk size
                                                                                            #cv: the number of folds for cross-validation
        self.K = K                                           # top K classifiers
        self.base_learner = base_learner                     # base learner
        self.models = sc.SortedList()                        # a sorted list if classifiers        
        self.cv = cv                                          # cross validation fold
                                                           
        # chunk-related information
          
        self.S = S                          # chunk size
        self.p = -1                         # chunk pointer
        self.X_chunk = None
        self.y_chunk = None

    def partial_fit(self, X, y=None, classes=None, weight=None):                     #Updates the ensemble when a new data chunk arrives
                                                                                    #X: the training data
                                                                                     #y: the training labels
                                                                                      #classes:contains all possible labels
                                                                                      #weight: array-like, instance weight, uniform weights are assumed if not provided
        
        N, D = X.shape

        # initializes when the ensemble is first called
        if self.p == -1:
            self.X_chunk = np.zeros((self.S, D))
            self.y_chunk = np.zeros(self.S)
            self.p = 0

        # fill up the data chunk
        for i, x in enumerate(X):
            self.X_chunk[self.p] = X[i]
            self.y_chunk[self.p] = y[i]
            self.p += 1

            if self.p == self.S:
                self.p = 0                                                # reset the pointer (S is chunk size)

                                                                          # retrieve the classes and class count
                if classes is None:
                    classes, class_count = np.unique(self.y_chunk, return_counts=True)
                else:
                    _, class_count = np.unique(self.y_chunk, return_counts=True)
                C_new = cp.deepcopy(self.base_learner)                    # 1: train classifier C' from X by creating a deep copy from the base learner
                try:
                    C_new.fit(self.X_chunk, self.y_chunk)
                except NotImplementedError:
                    C_new.partial_fit(self.X_chunk, self.y_chunk, classes, weight)
                
              # compute the baseline error rate given by a random classifier
                baseline_score = self.compute_random_baseline(classes)

              # compute the weight of C', may do cross-validation if cv is not None
                clf_new = self.WeightedClassifier(clf=C_new, weight=0, chunk_labels=classes)
                clf_new.weight = self.compute_weight(model=clf_new, random_score=baseline_score, cv=self.cv)

                for model in self.models:                                 # 4: update the weights of each classifier in the ensemble
                    model.weights = self.compute_weight(model=model, random_score=baseline_score, cv=None)

                if len(self.models) < self.K:                               # 5: C <- top K weighted classifiers in C U { C' }
                    self.models.add(value=clf_new)
                else:
                    if clf_new.weight > 0 and clf_new.weight > self.models[0].weight:
                        self.models.pop(0)
                        self.models.add(value=clf_new)
              
        return self
        pass  

    def predict(self, X):                                                 #Predicts the labels of X in a general classification setting
                                                                          # The prediction is done via normalized weighted voting (choosing the maximum)
        N, D = X.shape                                                    #X: the unseen data to give predictions
                                                                          #return: a list of shape (n-samples,) containing predictions
        # List with size X.shape[0] and each value is a dict too,
        # Ex: [{0:0.2, 1:0.7}, {1:0.3, 2:0.5}]
        list_label_instance = []

        # use sum_weights for normalization
        sum_weights = np.sum([clf.weight for clf in self.models])

        # For each classifier in self.models, predict the labels for X
        for model in self.models:
            clf = model.clf
            pred = clf.predict(X)
            weight = model.weight
            for i, label in enumerate(pred.tolist()):
                if i == len(list_label_instance):  # maintain the dictionary
                    list_label_instance.append({label: weight / sum_weights})
                else:
                    try:
                        list_label_instance[i][label] += weight / sum_weights
                    except KeyError:
                        list_label_instance[i][label] = weight / sum_weights

        predict_weighted_voting = np.zeros(N)
        for i, dic in enumerate(list_label_instance):
            # return the key of max value in a dict
            max_value = max(dic.items(), key=operator.itemgetter(1))[0]
            predict_weighted_voting[i] = max_value

        return predict_weighted_voting

    def compute_score(self, model, X, y):                         #This compute the mean square error of a classifier. This code needs to take into account the fact that a classifier C trained on a
                                                                  #previous data chunk may not have seen all the labels that appear in a new chunk
                                                                  #(e.g. C is trained with only labels [1, 2] but the new chunk contains labels [1, 2, 3, 4, 5]
                                                                  #X: data of the new chunk;y: labels of the new chunk;return: the mean square error MSE_i
        N = len(y)
        labels = model.chunk_labels
        probabs = model.clf.predict_proba(X)
        sum_error = 0
        for i, c in enumerate(y):
            # if the label in y is unseen when training, skip it, don't include it in the error
            if c in labels:
                index_label_c = np.where(labels == c)[0][0]  # find the index of this label c in probabs[i]
                probab_ic = probabs[i][index_label_c]
                sum_error += (1 - probab_ic) ** 2
            else:
                sum_error += 1
        return sum_error / N

    def compute_score_crossvalidation(self, model, cv):                      # Computes the score of interests, using cross-validation         
                                                                            #model: the model to compute the score on
                                                                           #cv: the number of folds, if None, the score is computed directly on the entire data chunk,
        
        if cv is not None and type(cv) is int:
            # here I create a copy because here I don't want to "modify" an already trained model
            copy_model = cp.deepcopy(model)
            score = 0
            sf = StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=0)
            for train_idx, test_idx in sf.split(X=self.X_chunk, y=self.y_chunk):
                X_train, y_train = self.X_chunk[train_idx], self.y_chunk[train_idx]
                X_test, y_test = self.X_chunk[test_idx], self.y_chunk[test_idx]
                try:
                    copy_model.clf.fit(X_train, y_train)
                except NotImplementedError:
                    copy_model.clf.partial_fit(X_train, y_train, copy_model.chunk_labels, None)
                score += self.compute_score(model=copy_model, X=X_test, y=y_test) / self.cv
        else:
            # compute the score on the entire data chunk
            score = self.compute_score(X=self.X_chunk, y=self.y_chunk, model=model)

        return score

    def compute_weight(self, model, random_score, cv=None):                       #Compute the weight of a classifier given the random score (calculated on a random learner).
                       
        # compute MSE, with cross-validation or not
        score = self.compute_score_crossvalidation(model=model, cv=cv)

        # w = MSE_r = MSE_i
        return random_score - score

    def compute_random_baseline(self, classes):                                   #for the random baseline score(score produced by a random classifier)
                                                                                   # The random score is MSE_r
        # L = len(np.unique(classes))
        # MSE_r = L * (1 / L) * (1 - 1 / L) ** 2
        #base on the class distribution of the data --> count the number of labels
        _, class_count = np.unique(classes, return_counts=True)
        class_dist = [class_count[i] / self.S for i, c in enumerate(classes)]
        mse_r = np.sum([class_dist[i] * ((1 - class_dist[i]) ** 2) for i, c in enumerate(classes)])
        return mse_r

In [36]:
#Usefull imports 
!pip install -U scikit-multiflow
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential
from skmultiflow.trees import HoeffdingTree
from skmultiflow.data import FileStream
import matplotlib as plt

Requirement already up-to-date: scikit-multiflow in /usr/local/lib/python3.6/dist-packages (0.4.1)


In [37]:
#Evaluation of Weighted Ensemble Classifier 

# prepare the stream
stream = FileStream('elec.csv',n_targets=1, target_idx=-1)
stream.prepare_for_use()

# instantiate a classifier
clf = WeightedEnsembleClassifier(K=250, base_learner=HoeffdingTree(), cv=None)

# setup the evaluator
evaluator = EvaluatePrequential(pretrain_size=1000, max_samples=2000, show_plot=False,
                                metrics=['accuracy'], batch_size=10)

# 4. Run
evaluator.evaluate(stream=stream, model=clf, model_names=["Weighted Ensemble Classifier "])

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 1000 sample(s).
Evaluating...
 #################### [100%] [0.90s]
Processed samples: 2000
Mean performance:
Weighted Ensemble Classifier  - Accuracy     : 0.7860


[<__main__.WeightedEnsembleClassifier at 0x7fbd7f608588>]

Sources: greeks of greek, youtube, Github https://github.com/topics/data-stream