In [1]:
import sys
sys.path.append("..")
from divexp import *
from detect import *

import numpy as np
np.float = float
from skmultiflow.data import ConceptDriftStream

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from wrappers import AgrawalWrapper, SEAWrapper, HyperplaneWrapper

In [2]:
def train_and_drift(DataSource, drift=True, ds_kwargs={}, train_size=5_000, n_batches=50, batch_size=200, ClfModel=DecisionTreeClassifier, clf_kwargs={}):
    tot_samples = n_batches * batch_size # total number of samples
    position = (n_batches // 2) * batch_size # "center" of the drift
    width = (n_batches // 4) * batch_size # size of the transitory

    subgroup_metric = "accuracy"
    overall_metric = accuracy_score
    minsup = 0.1
    win_size = 5
    reference_window = (0,win_size)
    current_window = (n_batches - win_size, n_batches)

    data_source = DataSource(**ds_kwargs)
    stream = data_source.stream
    drift_stream = data_source.drift_stream

    if drift:
        cds = ConceptDriftStream(stream=stream, drift_stream=drift_stream, position=position, width=width)
    else:
        cds = ConceptDriftStream(stream=stream, drift_stream=stream, position=position, width=width) # no drift!

    X_train, y_train = stream.next_sample(train_size)

    clf = ClfModel(**clf_kwargs)
    clf.fit(X_train, y_train)
    
    df_meta = data_source.get_metadata(X_train)
    matches = compute_matches(df_meta, minsup=minsup)
    print("# FI", len(matches.fi))

    divs = []
    matches_ts_list = []
    scores = []

    for start_ndx in range(0, tot_samples, batch_size):
        X_batch, y_batch = cds.next_sample(batch_size)
        
        y_pred = clf.predict(X_batch)

        # Workaround for multi-class problems -- works with accuracy!
        y_pred = y_batch == y_pred
        y_batch = np.ones(len(y_batch))

        scores.append(overall_metric(y_batch, y_pred))

        df_batch_bin = data_source.get_metadata(X_batch)
        matches_ts = compute_matches(df_batch_bin, fi=matches.fi)
        matches_ts = Matches(matches=matches_ts.matches.astype(int), fi=matches.fi)

        divs.append(div_explorer(matches_ts, y_batch, y_pred, [subgroup_metric]))
        matches_ts_list.append(matches_ts)

    delta, t_stat = detect_singlebatch(divs, subgroup_metric, reference_window, current_window)

    return delta.min(), t_stat.max()

In [3]:
n_exp = {
    True: 5, # number for "with drift"
    False: 5 # number for "without drift"
}
train_size = 5_000

n_batches  = 50
batch_size = 200

tstats = np.empty(sum(n_exp.values()))
deltas = np.empty(sum(n_exp.values()))
gt = np.empty(sum(n_exp.values()))

exp_type = "hyper"

if exp_type == "sea":
    DataClass = SEAWrapper
    data_kwargs = {"noise_percentage": 0.7}
elif exp_type == "agrawal":
    DataClass = AgrawalWrapper
    data_kwargs = {"perturbation": 0.7}
elif exp_type == "hyper":
    DataClass = HyperplaneWrapper
    data_kwargs = {"noise_percentage": 0.1}


i = 0
t0 = time()
for drift in n_exp:
    for exp in range(n_exp[drift]):

        delta, tstat = train_and_drift(DataClass,
                                       ds_kwargs={ "random_state": i, **data_kwargs },
                                       drift=drift,
                                       train_size=train_size,
                                       n_batches=n_batches,
                                       batch_size=batch_size,
                                       ClfModel=DecisionTreeClassifier,
                                       clf_kwargs={}
        )

        deltas[i] = delta
        tstats[i] = tstat
        gt[i] = drift
        print(exp, drift, tstats[i], deltas[i])
        
        i += 1
t1 = time()
print(t1 - t0)

# FI 50


In [21]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

fpr, tpr, thresh = roc_curve(gt, tstats)
plt.plot(fpr, tpr)

best_thresh = thresh[(tpr - fpr).argmax()]
print(best_thresh)
plt.scatter([fpr[(tpr - fpr).argmax()]], [tpr[(tpr - fpr).argmax()]], marker='x', c='r')

ValueError: continuous format is not supported