In [1]:
import glob
import os
import re
import shutil
import subprocess
import sys

In [2]:
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB

from classifier.detector_classifier import DetectorClassifier
from concept_drift.adwin import AdWin
from concept_drift.page_hinkley import PageHinkley
from evaluation.prequential import prequential

In [3]:
# Location of the project's helper modules, relative to this notebook.
UTILS = "../utils/"
# Register it once so that re-running the cell does not pile up duplicates.
if UTILS not in sys.path:
    sys.path.append(UTILS)

from util import ql_ref_date
from util import recover_download_site_mab
from util import get_onlysite

In [4]:
def get_values(temp, dir_ctrl):
    """Parse the controller logs in ``temp/dir_ctrl`` into a list of records.

    Every ``*.log`` file is scanned line by line (files in sorted order).  A
    record is assembled from four kinds of lines:

    * ``Frequency: ``     -- starts a record (current state),
    * ``] rewards:``      -- adds the reward ``r`` and, from the next
      ``Frequency: `` line found within the following nine lines, the
      ``new ...`` state,
    * ``] AP0 txpower:``  -- adds the action fields and closes the record,
    * ``iteration``       -- discards whatever record is in progress.

    Numbers are pulled out with a regular expression and kept as *strings*.
    A ``Frequency: `` line is accepted only with exactly 10 numbers and an
    ``AP0 txpower`` line only with exactly 11; the leading numbers are
    skipped (presumably timestamp fields -- TODO confirm against a real log).

    Parameters
    ----------
    temp : str
        Root directory holding the extracted data; also handed to
        ``recover_download_site_mab`` as ``relative_path``.
    dir_ctrl : str
        Name of the sub-directory of ``temp`` that contains the ``*.log`` files.

    Returns
    -------
    list of dict
        One dict per complete record with the keys ``Frequency``,
        ``Medium busy``, ``Busy time``, ``Active time``, ``r``,
        ``new Medium busy``, ``new Busy time``, ``new Active time``,
        ``txpower``, ``new_txpower``, ``channel``, ``new_channel``,
        ``file_id`` and ``sites``.  Despite its name, ``file_id`` is a running
        index over the emitted records, not over the log files.  ``sites`` is
        the pair returned by the ``util`` helpers for stations 1 and 2.
    """
    dir_ctrl = os.path.join(temp, dir_ctrl)
    print("Reading",dir_ctrl)

    # Compiled once instead of on every matching line.
    number_re = re.compile(r"[-+]?\d*\.\d+|\d+")
    state_keys = ['Frequency', 'Medium busy', 'Busy time', 'Active time']
    new_state_keys = ['new Medium busy', 'new Busy time', 'new Active time']
    action_keys = ['txpower', 'new_txpower', 'channel', 'new_channel']

    sarss = []
    file_id = 0
    for filename in sorted(glob.glob(os.path.join(dir_ctrl, '*.log'))):
        ref = ql_ref_date(filename, split_text1='mab1-').replace('T', 'Z')
        site1 = get_onlysite(recover_download_site_mab(relative_path=temp, sta=1, ref_date=ref, prefix='sta-mab-'))
        site2 = get_onlysite(recover_download_site_mab(relative_path=temp, sta=2, ref_date=ref, prefix='sta-mab-'))

        with open(filename, 'r') as log:
            lines = log.readlines()

        # A half-built record must never leak into the next file, where it
        # would be completed by unrelated lines and tagged with other sites.
        sars = None
        for i, line in enumerate(lines):
            if 'iteration' in line:
                sars = None
            if 'Frequency: ' in line:
                numbers = number_re.findall(line)
                if len(numbers) == 10:
                    sars = dict(zip(state_keys, numbers[6:]))
            elif sars is not None:
                # At least the current state is known.
                if '] AP0 txpower:' in line:
                    numbers = number_re.findall(line)
                    # Emit only complete records: the action line must be
                    # well-formed AND the new state must already be known.
                    if len(numbers) == 11 and 'new Medium busy' in sars:
                        sars.update(zip(action_keys, numbers[7:]))
                        sars['file_id'] = file_id
                        sars['sites'] = (site1, site2)
                        file_id += 1
                        sarss.append(sars)
                    sars = None
                elif '] rewards:' in line:
                    reward = line.split('] rewards:')[1].replace('\n', '').strip()
                    if 'None' in reward:
                        sars = None  # no usable reward, drop the record
                    else:
                        sars['r'] = reward
                        # The new state is the first 'Frequency: ' line among
                        # the next nine lines; slicing stops at end of file.
                        for ahead in lines[i + 1:i + 10]:
                            if 'Frequency: ' in ahead:
                                numbers = number_re.findall(ahead)
                                if len(numbers) == 10:
                                    # index 6 is the frequency itself, skipped
                                    sars.update(zip(new_state_keys, numbers[7:]))
                                break
    return sarss

## Create a temporary directory

In [5]:
# Scratch directory that receives the extracted archives.
TEMP = 'temp'
# exist_ok makes the cell safe to re-run and avoids the check-then-create race;
# it still fails loudly if a regular file named 'temp' is in the way.
os.makedirs(TEMP, exist_ok=True)

# Extract the MAB results into TEMP

In [6]:
# Every compressed archive shipped with the MAB1 experiment.
ARCHIVE_PATTERN = '../MAB1/data/*.tar.xz'
files = glob.glob(ARCHIVE_PATTERN)

In [7]:
# Unpack every archive into TEMP.  The command is passed as an argument list
# (no shell), so paths with spaces or shell metacharacters are handled, and
# check=True stops the notebook if tar fails instead of silently continuing
# with a partially extracted data set.
for archive in files:
    print("Extracting {}".format(os.path.basename(archive)))
    subprocess.run(["tar", "-C", TEMP, "-xJf", archive], check=True)

Extracting sta2.tar.xz
Extracting removed.tar.xz
Extracting ap.tar.xz
Extracting ctrl.tar.xz
Extracting sta1.tar.xz


In [8]:
# Parse the controller logs that were extracted under TEMP/ctrl.
sarss = get_values(temp=TEMP, dir_ctrl='ctrl')

Reading temp/ctrl


In [9]:
# Number of complete records recovered from the logs.
print("Found {}".format(len(sarss)))

Found 237646


In [10]:
# The parsed records are in memory now; remove the scratch directory.
# shutil.rmtree is portable and needs no shell; ignore_errors mirrors `rm -f`.
shutil.rmtree(TEMP, ignore_errors=True)

0

In [11]:
# One row per parsed record, one column per record key.
data = pd.DataFrame(data=sarss)

In [12]:
# Binary target built from the reward column: the sign of the change with
# respect to the previous record, with the first record forced to 1.
y = data['r'].astype('float').values
y = np.sign(np.concatenate(([1], np.diff(y))))
y[y == -1] = 0  # decreases are folded into 0, so y holds only 0s and 1s

In [13]:
# Feature matrix.  get_values() stores every parsed number as text, so the
# columns are cast to float here instead of handing an object array of strings
# to the classifiers.
# NOTE(review): 'Busy time' and 'Frequency' are not among the features although
# 'new Busy time' is -- confirm that this is intentional.
X = data[['Active time', 'Medium busy', 'channel',
          'new Active time', 'new Busy time', 'new Medium busy',
          'new_channel', 'new_txpower', 'txpower']].astype(float).values

# Concept Drift ADWIN

__ADWIN__

* BIFET, Albert; GAVALDA, Ricard. Learning from time-changing data with adaptive windowing. In: Proceedings of the 2007 SIAM international conference on data mining. Society for Industrial and Applied Mathematics, 2007. p. 443-448.


__Page-Hinkley Test__

* GAMA, João; SEBASTIÃO, Raquel; RODRIGUES, Pedro Pereira. On evaluating stream learning algorithms. Machine learning, v. 90, n. 3, p. 317-346, 2013.

In [14]:
def calculate_drift(X, y,
                    n_train=1000, w=100,
                    lambda_=50,
                    clfs_label=["GaussianNB", "Page-Hinkley", "AdWin"],
                    plot_circles=["AdWin"]):
    """Evaluate the requested classifiers prequentially and plot their accuracy.

    For each label in ``clfs_label`` a classifier is built, passed to
    ``prequential`` and its predictions are compared with ``y[n_train:]``.
    The 0/1 hit sequence is smoothed with a moving average of width ``w``
    (``np.convolve(..., 'same')``, so the curve is zero-padded at both ends)
    and drawn on the current matplotlib axes, then the figure is shown.

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded unchanged to ``prequential``.
    y : numpy.ndarray
        Target vector; ``y.shape[0]`` sets the x-axis range.
    n_train : int
        Forwarded to ``prequential``; predictions are compared with
        ``y[n_train:]``.
    w : int
        Width of the moving-average window.
    lambda_ : float
        ``lambda_`` argument of the ``PageHinkley`` detector.
    clfs_label : list of str
        Classifiers to run, in this order.  Known labels are "GaussianNB",
        "Page-Hinkley" and "AdWin"; unknown labels and repetitions are
        ignored.  The list is only read, never modified.
    plot_circles : list of str
        Labels of detector-based classifiers whose detections are marked
        with an ellipse on the accuracy curve.  Only read, never modified.

    Returns
    -------
    None
    """
    classes = np.unique(y)
    # One factory per known label, so every classifier stays paired with its
    # own label whatever the order of clfs_label.
    builders = {
        "GaussianNB": lambda: GaussianNB(),
        "Page-Hinkley": lambda: DetectorClassifier(GaussianNB(), PageHinkley(lambda_=lambda_), classes),
        "AdWin": lambda: DetectorClassifier(GaussianNB(), AdWin(), classes),
    }
    ellipse_color = {"GaussianNB": 'blue', "Page-Hinkley": 'orange', "AdWin": 'green'}
    # dict.fromkeys keeps the first occurrence of each label, in order.
    models = [(label, builders[label]())
              for label in dict.fromkeys(clfs_label) if label in builders]

    ax = plt.gca()
    ax.set_title("Accuracy (exact match)")
    ax.set_xlabel("Instances")
    ax.set_ylabel("Accuracy")

    y_max = y.shape[0]
    ax.set_xlim((0, y_max))
    ax.set_ylim((0, 1))

    # Ellipse axes in data units, proportional to each axis range.
    ellipse_y = 0.05
    ellipse_x = ellipse_y * y_max / 2

    show_labels = len(models) > 1
    for label, clf in models:
        print("\n{}:".format(label))
        with np.errstate(divide='ignore', invalid='ignore'):
            y_pre, _elapsed = prequential(X, y, clf, n_train)

        hits = (y[n_train:] == y_pre) * 1
        acc_run = np.convolve(hits, np.ones((w,)) / w, 'same')

        if isinstance(clf, DetectorClassifier):
            print("Drift detection: {}".format(clf.change_detected))

            detected = clf.detected_elements
            if label in plot_circles and len(detected) > 0:
                # NOTE(review): detected_elements are used as positions in
                # acc_run, i.e. relative to n_train -- confirm in DetectorClassifier.
                points = [(d, acc_run[d]) for d in detected]
                print("Drift detected in", str(points))
                # Names distinct from X / y so the inputs stay intact for the
                # classifiers evaluated after this one.
                for px, py in points:
                    marker = Ellipse(xy=(px, py), width=ellipse_x, height=ellipse_y, angle=0,
                                     color=ellipse_color[label], fill=False)
                    ax.add_artist(marker)

        print("Mean acc within the window {}: {}".format(w, np.mean(acc_run)))
        if show_labels:
            ax.plot(acc_run, "-", label=label, color=ellipse_color[label])
        else:
            ax.plot(acc_run, "-", color=ellipse_color[label])

    # A legend is only meaningful (and warning-free) when curves carry labels.
    if show_labels:
        ax.legend(loc='lower right')
    ax.set_ylim([0, 1])
    plt.show()

In [15]:
# 30 minutes expressed in seconds.
# NOTE(review): the calls below pass literal n_train values (1000, 100) and
# never read this variable -- confirm which value is intended.
n_train = 30 * 60

# Varying the window size

In [None]:
# Smoothing windows from 16 to 512 samples, doubling each time.
for w in [2 ** exponent for exponent in range(4, 10)]:
    calculate_drift(X, y, w=w, n_train=1000)


GaussianNB:
Mean acc within the window 16: 0.8669349471362288

Page-Hinkley:
Drift detection: 72
Mean acc within the window 16: 0.7387009499421077

AdWin:


# Only Google

In [None]:
# Rebuild the frame and keep only the records whose two stations were both
# tagged 'google'; each stored tuple is compared explicitly, row by row.
data = pd.DataFrame(data=sarss)
only_google = data['sites'].map(lambda pair: pair == ('google', 'google')).astype(bool)
data = data[only_google]

In [None]:
# Binary target built from the reward column: the sign of the change with
# respect to the previous record, with the first record forced to 1.
y = data['r'].astype('float').values
y = np.sign(np.concatenate(([1], np.diff(y))))
y[y == -1] = 0  # decreases are folded into 0, so y holds only 0s and 1s

In [None]:
# Feature matrix for the filtered records.  get_values() stores every parsed
# number as text, so the columns are cast to float here instead of handing an
# object array of strings to the classifiers.
X = data[['Active time', 'Medium busy', 'channel',
          'new Active time', 'new Busy time', 'new Medium busy',
          'new_channel', 'new_txpower', 'txpower']].astype(float).values

In [None]:
# Same evaluation as before, on the filtered records, with a 16-sample window.
calculate_drift(X, y, w=16, n_train=1000)

# Only one experiment

In [None]:
# Evaluate the two detectors separately on ten consecutive slices of
# `interval` records each.
interval = 1800
for i in range(10):
    li = i * interval
    ls = li + interval
    X1, y1 = X[li:ls], y[li:ls]
    print("Experiment #{} from {} to {}".format(i, li, ls))
    calculate_drift(X1, y1, n_train=100, w=16, clfs_label=["Page-Hinkley", "AdWin"])

# Varying the Page-Hinkley lambda parameter

In [None]:
# Re-run Page-Hinkley on the first slice only, once per lambda value.
interval = 1800
i = 0
li = interval * i
ls = li + interval
for lambda_ in (10, 20, 30, 40, 50, 100):
    X1, y1 = X[li:ls], y[li:ls]
    print("Experiment #{} from {} to {}".format(i, li, ls))
    calculate_drift(X1, y1, n_train=100, w=16, lambda_=lambda_, clfs_label=["Page-Hinkley"])