In [1]:
import util
import warnings
import numpy as np

from ld3 import LD3, Window, StreamGenerator

from tqdm.notebook import tqdm

from skmultilearn.dataset import load_from_arff

from skmultiflow.data import DataStream
from skmultiflow.meta import ClassifierChain
from skmultiflow.drift_detection import ADWIN, EDDM, KSWIN, HDDM_W, HDDM_A, DDM

from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MultiLabelBinarizer

from tornados.drift_detection.rddm import RDDM
from tornados.drift_detection.fhddm import FHDDM
from tornados.drift_detection.fhddms import FHDDMS
from tornados.drift_detection.mddm_a import MDDM_A
from tornados.drift_detection.mddm_e import MDDM_E
from tornados.drift_detection.mddm_g import MDDM_G
from tornados.drift_detection.fhddms_add import FHDDMS_add
from tornados.drift_detection.seq_drift2 import SeqDrift2ChangeDetector

warnings.filterwarnings('ignore')
np.set_printoptions(precision=4, suppress=True)

In [8]:
# Real Dataset Load
X, y = load_from_arff('./datasets/{}'.format('20NG.arff'), label_count=20)
X = X.toarray()
y = y.toarray().astype(np.int8)
sample_size = len(X)
n_features = X.shape[1]

if len(np.unique(y)) > 2:
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(y)
n_targets = y.shape[1]
stream = DataStream(data = X, y=y, n_targets=n_targets)

# Synthetic Dataset Generation
#gen = StreamGenerator()
#stream, sample_size, n_features, n_targets = gen.get_stream(type='sudden2')

In [12]:
detector = LD3(window_size=1000, correlation_thresh=5, len=3)
clf = ClassifierChain(GaussianNB())
pre_sample = [np.zeros(n_features), np.zeros(n_features)]
pre_label = [np.ones(n_targets), np.zeros(n_targets)]
clf.fit(np.array(pre_sample), np.array(pre_label))


stream.restart()
max_samples = sample_size
y_true = []
y_pred = []
pretrain_X = []
pretrain_y = []
p_bar = tqdm(total=max_samples)
n_samples = 0
drift, warning = False, False
correlation = 0
while n_samples < max_samples and stream.has_more_samples():
    X, y = stream.next_sample()
    if drift:
        print('Drift@', n_samples)
        r1, r2 = detector._ranks
        clf.reset()
        clf.fit(np.array(pre_sample), np.array(pre_label))

    pred = clf.predict(X)
    #clf.partial_fit(X, np.array([y])) # For synthetic datasets
    clf.partial_fit(X, y) # For real datasets
    drift, warning, correlation, score = detector.update(pred.astype(np.int32))
    
    #y_true.append(y) # For synthetic datasets
    y_true.extend(y) # For real datasets
    y_pred.extend(pred)

    p_bar.update(1)
    n_samples += 1
p_bar.close()
stream.restart()

print()
print(np.round(util.accuracy_example(np.array([y_true]), np.array([y_pred])), decimals=4))
print(1-np.round(util.hamming_loss(np.array(y_true), np.array(y_pred)), decimals=4))
print(np.round(f1_score(np.array(y_true), np.array(y_pred), average='samples'),decimals=4))
print(np.round(util.f1_micro(np.array(y_true), np.array(y_pred)), decimals=4))
print()

  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 5027
Drift@ 8090
Drift@ 15583

0.1616
0.8578
0.3173
0.2783



In [10]:
# Tests for detectors in Scikit-Multiflow Framework
detectors = [ADWIN(), EDDM(), DDM(), KSWIN(), HDDM_A(), HDDM_W()]
for detector in detectors:
    clf = ClassifierChain(GaussianNB())
    pre_sample = [np.zeros(n_features), np.zeros(n_features)]
    pre_label = [np.ones(n_targets), np.zeros(n_targets)]
    clf.fit(np.array(pre_sample), np.array(pre_label))

    stream.restart()
    max_samples = sample_size
    y_true = []
    y_pred = []
    pretrain_X = []
    pretrain_y = []
    p_bar = tqdm(total=max_samples)
    n_samples = 0
    w_x = Window(max_size=199)
    w_y = Window(max_size=199)
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample()
        if detector.detected_change():
            print('Drift@', n_samples)
            detector.reset()
            clf.reset()
            clf.fit(np.array(pre_sample), np.array(pre_label))

        pred = clf.predict(X)
        #clf.partial_fit(X, np.array([y])) # For synthetic datasets
        clf.partial_fit(X, y) # For real datasets
        detector.add_element((pred.astype(np.int32).flatten().tolist())==(y.astype(np.int32).flatten().tolist()))

        #y_true.append(y) # For synthetic datasets
        y_true.extend(y) # For real datasets
        y_pred.extend(pred)

        p_bar.update(1)
        n_samples += 1
    
    print()
    print(np.round(util.accuracy_example(np.array([y_true]), np.array([y_pred])), decimals=4)) 
    print(np.round(util.hamming_loss(np.array(y_true), np.array(y_pred)), decimals=4))
    print(np.round(f1_score(np.array(y_true), np.array(y_pred), average='samples'),decimals=4))
    print(np.round(util.f1_micro(np.array(y_true), np.array(y_pred)), decimals=4))
    print()
   
    stream.restart()

  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 2815
Drift@ 3967
Drift@ 9599
Drift@ 15231
Drift@ 17279
Drift@ 17855
Drift@ 18111
Drift@ 18495
Drift@ 19295

0.1386
0.1574
0.2999
0.2435
0.2368



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 211
Drift@ 513
Drift@ 786
Drift@ 1089
Drift@ 1382
Drift@ 1701
Drift@ 2002
Drift@ 2181
Drift@ 2451
Drift@ 2858
Drift@ 3154
Drift@ 3443
Drift@ 3779
Drift@ 4030
Drift@ 4342
Drift@ 4699
Drift@ 5019
Drift@ 5263
Drift@ 5523
Drift@ 5864
Drift@ 6161
Drift@ 6496
Drift@ 6745
Drift@ 7034
Drift@ 7340
Drift@ 7603
Drift@ 7828
Drift@ 8095
Drift@ 8408
Drift@ 8680
Drift@ 9042
Drift@ 9358
Drift@ 9678
Drift@ 10063
Drift@ 10430
Drift@ 10746
Drift@ 11118
Drift@ 11420
Drift@ 11725
Drift@ 12048
Drift@ 12324
Drift@ 12648
Drift@ 12915
Drift@ 13223
Drift@ 13555
Drift@ 13916
Drift@ 14280
Drift@ 14591
Drift@ 14811
Drift@ 15093
Drift@ 15343
Drift@ 15615
Drift@ 15952
Drift@ 16163
Drift@ 16471
Drift@ 16769
Drift@ 17049
Drift@ 17276
Drift@ 17529
Drift@ 17769
Drift@ 17996
Drift@ 18140
Drift@ 18319
Drift@ 18489
Drift@ 18663
Drift@ 18868
Drift@ 19130
Drift@ 19297

0.0654
0.4514
0.1932
0.1227
0.1168



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 34
Drift@ 813
Drift@ 847
Drift@ 921
Drift@ 1004
Drift@ 1041
Drift@ 1114
Drift@ 1209
Drift@ 1261
Drift@ 1362
Drift@ 1499
Drift@ 1629
Drift@ 1718
Drift@ 1762
Drift@ 1834
Drift@ 1875
Drift@ 1936
Drift@ 1972
Drift@ 2013
Drift@ 2066
Drift@ 2155
Drift@ 2264
Drift@ 2411
Drift@ 2501
Drift@ 2563
Drift@ 2623
Drift@ 2660
Drift@ 2729
Drift@ 2765
Drift@ 2829
Drift@ 3009
Drift@ 3042
Drift@ 3133
Drift@ 3237
Drift@ 3309
Drift@ 3392
Drift@ 3476
Drift@ 3535
Drift@ 3642
Drift@ 3723
Drift@ 3779
Drift@ 3870
Drift@ 3911
Drift@ 3969
Drift@ 4040
Drift@ 4106
Drift@ 4169
Drift@ 4247
Drift@ 4360
Drift@ 4434
Drift@ 4480
Drift@ 4567
Drift@ 4641
Drift@ 4713
Drift@ 4802
Drift@ 4878
Drift@ 4985
Drift@ 5044
Drift@ 5095
Drift@ 5209
Drift@ 5249
Drift@ 5335
Drift@ 5402
Drift@ 5484
Drift@ 5577
Drift@ 5615
Drift@ 5762
Drift@ 5804
Drift@ 5839
Drift@ 5889
Drift@ 5965
Drift@ 6039
Drift@ 6138
Drift@ 6195
Drift@ 6265
Drift@ 6358
Drift@ 6452
Drift@ 6502
Drift@ 6543
Drift@ 6611
Drift@ 6662
Drift@ 6731
Drift@ 6804
Drift@ 68

  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 1381
Drift@ 3866
Drift@ 5416
Drift@ 5670
Drift@ 7695
Drift@ 9961

0.137
0.1523
0.2743
0.241
0.2359



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 42
Drift@ 1111

0.1505
0.1931
0.3145
0.2616
0.2652



  0%|          | 0/19300 [00:00<?, ?it/s]


0.1469
0.2047
0.3099
0.2561
0.2676



In [11]:
# Tests for detectors in Tornado Framework
detectors = [FHDDM(), FHDDMS(), FHDDMS_add(), MDDM_A(), MDDM_E(), MDDM_G(), SeqDrift2ChangeDetector(), RDDM()]
for detector in detectors:
    clf = ClassifierChain(GaussianNB())
    pre_sample = [np.zeros(n_features), np.zeros(n_features)]
    pre_label = [np.ones(n_targets), np.zeros(n_targets)]
    clf.fit(np.array(pre_sample), np.array(pre_label))


    stream.restart()
    max_samples = sample_size
    y_true = []
    y_pred = []
    pretrain_X = []
    pretrain_y = []
    p_bar = tqdm(total=max_samples)
    n_samples = 0
    drift = False
    w_x = Window(max_size=199)
    w_y = Window(max_size=199)
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample()
        if drift:
            print('Drift@', n_samples)
            detector.reset()
            clf.reset()
            clf.fit(np.array(pre_sample), np.array(pre_label))

        pred = clf.predict(X)
        #clf.partial_fit(X, np.array([y])) # For synthetic datasets
        clf.partial_fit(X, y) # For real datasets
        _, drift = detector.run((pred.astype(np.int32).flatten().tolist())==(y.astype(np.int32).flatten().tolist()))

        #y_true.append(y) # For synthetic datasets
        y_true.extend(y) # For real datasets
        y_pred.extend(pred)

        p_bar.update(1)
        n_samples += 1
    print()
    print(np.round(util.accuracy_example(np.array([y_true]), np.array([y_pred])), decimals=4)) 
    print(np.round(util.hamming_loss(np.array(y_true), np.array(y_pred)), decimals=4))
    print(np.round(f1_score(np.array(y_true), np.array(y_pred), average='samples'),decimals=4))
    print(np.round(util.f1_micro(np.array(y_true), np.array(y_pred)), decimals=4))
    print()
    
    stream.restart()

  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 2043
Drift@ 8284
Drift@ 9983
Drift@ 14397
Drift@ 18411
Drift@ 18964

0.1453
0.145
0.2965
0.2537
0.2471



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 2043
Drift@ 5304
Drift@ 10007
Drift@ 14397
Drift@ 18408
Drift@ 18632
Drift@ 19096

0.1444
0.1401
0.2915
0.2523
0.2448



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 2050
Drift@ 9550

0.1483
0.166
0.3024
0.2583
0.2563



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 2016
Drift@ 6204
Drift@ 8982
Drift@ 14574
Drift@ 15828
Drift@ 17373
Drift@ 18278
Drift@ 18910

0.1396
0.1482
0.2992
0.245
0.2371



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 2016
Drift@ 5810
Drift@ 9786
Drift@ 15856
Drift@ 16593
Drift@ 17320
Drift@ 18275
Drift@ 18847

0.1419
0.1496
0.304
0.2485
0.2412



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 2016
Drift@ 5810
Drift@ 9786
Drift@ 15856
Drift@ 16593
Drift@ 17320
Drift@ 18275
Drift@ 18847

0.1419
0.1496
0.304
0.2485
0.2412



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 800
Drift@ 1200
Drift@ 1600
Drift@ 2000
Drift@ 9800
Drift@ 10200
Drift@ 10600
Drift@ 11000
Drift@ 11400
Drift@ 11800
Drift@ 12400
Drift@ 12800
Drift@ 18600
Drift@ 19000

0.119
0.2058
0.2828
0.2127
0.2048



  0%|          | 0/19300 [00:00<?, ?it/s]

Drift@ 391
Drift@ 2285
Drift@ 7718
Drift@ 10518
Drift@ 15103

0.1562
0.1304
0.3021
0.2701
0.2627

