In [1]:
# 1.- Libraries import

import pandas as pd
import numpy as np
from skmultiflow.trees import HoeffdingTree
from skmultiflow.evaluation import EvaluatePrequential
import time
from sklearn import metrics

In [2]:
# 2.- Data reading, preprocessing and parameters setting

data = pd.read_csv("cs-training.csv") # Give me some credit data, drop na for easier use
# data = pd.read_csv("heart.csv", sep = ";", decimal = ",") # Give me some credit data, drop na for easier use
data = data.dropna()

L = 2                                 # Number of instances
I = 500                              # Size of the data block
D = 30                                # Number of dynamic classifiers
eps = 0.25                            # Instance selection ratio

In [4]:
data.shape

(120269, 12)

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


We define 5 functions, most of them as detailed in the paper https://ieeexplore.ieee.org/abstract/document/8706959:

 - **Ensemble**: Main function, as detailed in the paper
 - **CreateNewBaseClassfier**: As detailed in the paper
 - **ReinformentAdjustment**: As detailed in the paper
 - **TrainOnInstance**: As detailed in the paper
 - **predict**: Used for prediction of new values

In [12]:
def ensemble(S,L,I,D,eps):
    
    # 1.- INITIALIZATION
    
    labels = np.asarray(S.iloc[:,1])         # Labels    
    S = np.asarray(S.iloc[:,2:S.shape[1]])   # Data

#     S = np.asarray(S.iloc[:,0:data.shape[1]-1])
#     labels = np.asarray(data.iloc[:,-1]) 
    B = []                                      # Circular array
    U = [[] for i in range(0,L)]                # Resampling buffer
    DCIR = [1/L for i in range(0,L)]            # Class imbalance ratio
    C = []                                      # Classifiers vector
    H = [[0 for i in range(0,D)] for i in range(0,L)]  # Nº of instances of different classes
    ws = 0.5                                    # Static weight
    wd = [1/D for i in range(0,D-1)]            # Dynamic weights

    p = 0                                       # Processed instances counter
    i = 0                                       # Position of circular array
    k = 0                                       # Indicator of dynamic classifiers
    kc = 0                                      # Circular indicator of dynamic classifiers
    
    # 2.- PROCESS
    
    for j in range(0, len(S)):
        
        xnew = S[j]
        p = p + 1
        if (p%10000 == 0):
            print("Instances through stream: ", p)
        
        # 2.1.- FIRST INSTANCES
        if (p < I):
            B.append(xnew)
        elif (p == I):
            B.append(xnew)
            [Cnew, H, U, ws, wd] = CreateNewBaseClassfier(eps, B, U, I, L, D, C, H, wd, ws, k, kc, DCIR, labels)
            C.append(Cnew)
            k = 1
            kc = 1
        
        # 2.2.- MIDDLE INSTANCES
        else:
            i = (p-1)%I
#             print(p-I-1)
#             print(DCIR)
#             print(labels[p-I-1])
            [B, C, ws, wd] = TrainOnInstance(xnew, i, L, D, B, C, wd, ws, DCIR[labels[p-I-1]], labels[p-I-1])
            i = (i + 1)%I
            if (i == 0):
                k = k+1
                kc = k%D
                [Ck, H, U, ws, wd] = CreateNewBaseClassfier(eps, B, U, I, L, D, C, H, wd, ws, k, kc, DCIR, labels)
                if k > D:
                    sol = predict(S[j-I:j,:], C, ws, wd)
#                     print(labels[j-I:j])
#                     print(sol)
#                     print("Different from 0:",np.sum(sol!=0))
#                     print("Accuracy in ", j, ": ", np.sum(np.equal(sol, labels[j-I:j]))/I)
                    ws = 1/2
                    wd[0:D-2] = list(map(lambda x: x * (1-1/D), wd[1:D-1]))
                    wd[-1] = 1/D
                else:
                    C.append(Ck)
                    ws = 1/2
                    wd[0:k-2] = list(map(lambda x: x * (1-1/D), wd[0:k-2]))
                    wd[0] = 1/D

                if k < D:
                    l = k
                else:
                    l = D
                temp = [np.asarray(H[i][1:l+1]) * np.asarray(wd[0:l]) for i in range(0,L)]
                for i in range(0,L):
                    DCIR[i] = np.sum(temp[i])/np.sum(temp)
                    
    # 2.3.- FINAL INSTANCES
    for j in range(0,I-1):
        xnew = B[j]
        [B, C, ws, wd] = TrainOnInstance(xnew, i, L, D, B, C, wd, ws, DCIR[labels[j-I]], labels[len(labels)-I+j])
    
    return [C,ws,wd]

In [13]:
def CreateNewBaseClassfier(eps, B, U, I, L, D, C, H, wd, ws, k, kc, DCIR, labels):
    
    # 1.- Select top eps*I instances
    Rn = B[0:int(I*eps)]
    Rlabels = labels[0:int(I*eps)]
    
    # 2.- In case is not the first classifier, perform weight adjustment
    if k != 0:
        for i in range(0, len(Rn)):
            [ws,wd] = ReinformentAdjustment(np.asarray([Rn[i]]), L, D, C, wd, ws, DCIR[Rlabels[i]],Rlabels[i])
    
    # 3.- Calculate instance numbers
    for i in range(0,L):
        H[i][kc] = len(np.where(Rlabels[0:int(I*eps)]==i)[0])
    
    # 4.- Create new classifier
    Cnew = HoeffdingTree()
    
    # 5.- In case is not the first classifier, train in previous instances for balance
    if k != 0:
        if (H[0][kc] < int(I*eps/L)) and (len(U[0])>0):
            U0 = np.array(U[0])
            if len(U[0]) >= (int(I*eps/L) - H[0][kc]):
                Cnew.fit(U0[-(int(I*eps/L) - H[1][kc]):], np.asarray([0 for i in range(0, (int(I*eps/L) - H[0][kc]))]))
            else:
                Cnew.fit(U0, np.asarray([0 for i in range(0, len(U[0]))]))
        elif (H[1][kc] < int(I*eps/L)) and (len(U[1])>0):
            U1 = np.array(U[1])
            
            if len(U[1]) >= (int(I*eps/L) - H[1][kc]):
                Cnew.fit(U1[-(int(I*eps/L) - H[1][kc]):], np.asarray([1 for i in range(0, (int(I*eps/L) - H[1][kc]))]))
            else:
                Cnew.fit(U1, np.asarray([1 for i in range(0, len(U[1]))]))
        Cnew.partial_fit(Rn,Rlabels)
            
    else:
        # 6.- Train the new classifier
#         print(Rn)
        Cnew.fit(np.array(Rn), np.array(Rlabels))
    
    # 7.- Adapt the resampling buffer
    for i in range(0,len(Rn)):
        if Rlabels[i] == 0:
            U[0].append(Rn[i])
            if len(U[0])> int(I*eps/L):
                U[0].pop()
        else:            
            U[1].append(Rn[i])
            if len(U[1])> int(I*eps/L):
                U[1].pop()
    
    return [Cnew, H, U, ws, wd]

In [14]:
def  ReinformentAdjustment(x, L, D, C, wd, ws, DCIR, label):
    if (DCIR < 1/L):
        for d in range(1,len(C)-1):
            if (C[d].predict(x)[0] == label):
#                 print("HEY!!")
#                 print(label)
                wd[d] = wd[d]*(1+1/D)
#                 print(wd)
            else:
                wd[d] = wd[d]*(1-1/D)
        if (C[0].predict(x)[0] == label):
#             print("HEY!!")
            ws = ws*(1+1/D)
        else:
            ws = ws*(1-1/D)   
    return [ws,wd]

In [15]:
def TrainOnInstance(xnew, i, L, D, B, C, wd, ws, DCIR, label):
    x = np.asarray([B[i]])
    [ws, wd] = ReinformentAdjustment(x, L, D, C, wd, ws, DCIR, label)
    for j in range(0,len(C)):
        if label == 1:
            C[j].partial_fit(x, [label])
        elif (label == 0) and (np.random.rand(1)<0.15):
            C[j].partial_fit(x, [label])
    B[i] = xnew
    return [B, C, ws, wd]

In [16]:
def predict(data, C, ws, wd):
    pred = C[0].predict(np.asarray(data))*ws
#     print(wd)
    for i in range(1, len(C)-1):
#         print("Classifier: ", i)
        pred = pred + C[i].predict(np.asarray(data))*wd[i-1]
    for i in range(0, len(pred)):
        if pred[i] >= 0.5:
            pred[i] = 1
        else:
            pred[i] = 0
    return pred

In [17]:
# 4.- Test and metrics
start_time = time.time()
ret = ensemble(data,L,I,D,eps)
end_time = time.time()
print("Total time: ", end_time - start_time)

Instances through stream:  10000
Instances through stream:  20000
Instances through stream:  30000
Instances through stream:  40000
Instances through stream:  50000
Instances through stream:  60000
Instances through stream:  70000
Instances through stream:  80000
Instances through stream:  90000
Instances through stream:  100000
Instances through stream:  110000
Instances through stream:  120000
Total time:  258.77772665023804


In [18]:
sol = predict(np.asarray(data.iloc[:,2:]), ret[0], ret[1], ret[2])

In [19]:
sol

array([1., 0., 1., ..., 0., 0., 0.])

In [21]:
np.sum(np.equal(sol, np.asarray(data.iloc[:,1])))/len(data.iloc[:,2:])

0.885016088933973

In [27]:
fpr, tpr, thresholds = metrics.roc_curve(np.asarray(data.iloc[:,1]), sol)

In [28]:
metrics.auc(fpr, tpr)

0.7290561292098146