In [1]:
import warnings
warnings.filterwarnings('ignore')

import time
import math
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets

from micoes.microclusters.clustream import CluStream
from micoes.microclusters.denstream import DenStream

import logging
# Emit only the bare message text, at INFO level and above, so the
# logging.info(...) calls in the cells below are displayed without any prefix.
logging.basicConfig(format='%(message)s', level=logging.INFO)


%matplotlib notebook
%matplotlib inline

In [2]:
def gen_blob_dataset(n_samples=1500, random_state=8, global_seed=0):
    """Generate the blob dataset used by the micro-cluster demos.

    Called with no arguments, this behaves exactly like the original
    hard-coded version (1500 samples, ``random_state=8``, global seed 0).

    Args:
        n_samples: Number of points requested from ``datasets.make_blobs``.
        random_state: Seed forwarded to ``datasets.make_blobs`` so the blobs
            themselves are reproducible.
        global_seed: Value used to seed NumPy's global random generator
            before the data is generated. Pass ``None`` to leave the global
            generator untouched.

    Returns:
        The ``(data, labels)`` tuple produced by ``datasets.make_blobs``.
    """
    # The global seed is kept (by default) because code run after this call
    # may draw from NumPy's global RNG -- TODO confirm whether the stream
    # clusterers actually rely on it.
    if global_seed is not None:
        np.random.seed(global_seed)
    data, labels = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
    return data, labels

In [3]:
def test_clustream_microcluster():
    """Run the blob dataset through CluStream and log the total point count.

    The leading ~10% of the data (plus one point) initializes the
    micro-clusters; every remaining point is then fed to the online update
    stamped with the current wall-clock time. Finally the ``n_points`` of all
    micro-clusters are summed and logged.
    """
    data, _ = gen_blob_dataset()

    # CluStream hyper-parameters used for this smoke run.
    n_microclusters = 12
    n_attributes = data.shape[1]
    alpha = 3
    tau = 60 * 0.01

    # Index of the first point that is streamed rather than used for init.
    stream_start = math.floor(0.1 * data.shape[0]) + 1

    clu = CluStream(n_microclusters, n_attributes, alpha, tau)
    clu.initialize(data[:stream_start])

    for sample in data[stream_start:]:
        clu.online_update_microcluster(sample, time.time())

    per_cluster_counts = np.array([cluster.n_points for cluster in clu.microclusters])
    logging.info(per_cluster_counts.sum())


test_clustream_microcluster()

1500


In [4]:
def test_denstream_microcluster():
    """Run the blob dataset through DenStream and log micro-cluster counts.

    Counts of the ``o_microclusters``, ``p_microclusters`` and
    ``c_microclusters`` collections are logged after initialization, after
    the online updates, and after ``check_microclusters_status`` is invoked.
    The ``n_tpoints`` of each entry in ``c_microclusters`` is logged as well.
    """
    data, labels = gen_blob_dataset()
    n_rows, n_attributes = data.shape[0], data.shape[1]

    # mu = number of samples / (number of distinct labels * number of attributes)
    n_classes = len(np.unique(labels))
    mu = n_rows / (n_classes * n_attributes)
    logging.info(f'mu is {mu}')

    den = DenStream(lamda=1, mu=mu, beta=0.01, eta=np.std(data), n_attributes=n_attributes)

    def report(stage):
        # Log the current size of each micro-cluster collection under a heading.
        logging.info(
            f'{stage}:\n OMC is {len(den.o_microclusters)}'
            f'\n PMC is {len(den.p_microclusters)}'
            f'\n CMC is {len(den.c_microclusters)}'
        )

    # Index of the first point that is streamed rather than used for init.
    stream_start = math.floor(0.1 * n_rows) + 1
    den.initialize(data[:stream_start])
    report('Initialization')

    logging.info('Online update')
    for sample in data[stream_start:]:
        den.online_update_microcluster(sample, time.time())
    report('After online update')

    for cluster in den.c_microclusters:
        logging.info(f'{cluster.n_tpoints}')

    den.check_microclusters_status(0.001)
    report('After some period of time')


test_denstream_microcluster()

mu is 250.0
Initialization:
 OMC is 2
 PMC is 0
 CMC is 0
Online update
After online update:
 OMC is 2
 PMC is 3
 CMC is 3
250.99470339256462
250.9949581699757
250.99509615743736
After some period of time:
 OMC is 0
 PMC is 3
 CMC is 3
