# Tutorial II: Using Runner Basics


In [10]:
%matplotlib notebook

First we import the required modules. We also silence Numba's warnings (`NumbaDeprecationWarning` and `NumbaWarning`) so that they do not clutter the output.

In [11]:
import sys
import warnings
from abc import ABC, abstractmethod

import matplotlib.pyplot as plt
import numpy as np
from hdbscan import HDBSCAN
from numba.errors import NumbaDeprecationWarning, NumbaWarning
from numpy.random import RandomState
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

from dpemu.nodes import Array
from dpemu import runner
from dpemu.dataset_utils import load_digits_, load_mnist, load_fashion
from dpemu.ml_utils import reduce_dimensions
from dpemu.plotting_utils import visualize_best_model_params, visualize_scores, visualize_classes, \
    print_results_by_model, visualize_interactive_plot
from dpemu.problemgenerator.filters import GaussianNoise, Clip

# Ignore every warning belonging to these two Numba categories so that they do
# not show up in the cell outputs below.
warnings.simplefilter("ignore", category=NumbaDeprecationWarning)
warnings.simplefilter("ignore", category=NumbaWarning)

In [12]:
class Preprocessor:
    """Preprocessing step that projects the data with ``reduce_dimensions``.

    A ``RandomState`` seeded with 42 is created once per instance and handed
    to ``reduce_dimensions`` on every call.
    """

    def __init__(self):
        self.random_state = RandomState(42)

    def run(self, _, data, params):
        """Reduce ``data`` and return it together with an extra result entry.

        Args:
            _: Ignored (first positional slot; presumably the train data
                passed by the runner -- confirm against ``runner.run``).
            data: Data to be reduced.
            params: Unused by this preprocessor.

        Returns:
            A 3-tuple ``(None, reduced, {"reduced_data": reduced})``.
        """
        embedded = reduce_dimensions(data, self.random_state)
        extra_results = {"reduced_data": embedded}
        return None, embedded, extra_results

In [13]:
class AbstractModel(ABC):
    """Base class for the clustering models evaluated in this notebook.

    Subclasses supply the fitted estimator via ``get_fitted_model``; ``run``
    then scores its ``labels_`` against the true labels.
    """

    def __init__(self):
        # Fixed seed so that subclasses using it produce repeatable results.
        self.random_state = RandomState(42)

    @abstractmethod
    def get_fitted_model(self, data, params):
        """Return an estimator fitted on ``data`` that exposes ``labels_``."""

    def run(self, _, data, params):
        """Fit the model and score its clustering against ``params["labels"]``.

        Args:
            _: Ignored first positional argument.
            data: Data handed to ``get_fitted_model``.
            params: Parameter dict; must contain the key ``"labels"``.

        Returns:
            Dict with the keys ``"AMI"`` and ``"ARI"``, each rounded to three
            decimals.
        """
        true_labels = params["labels"]
        predicted_labels = self.get_fitted_model(data, params).labels_
        ami = adjusted_mutual_info_score(true_labels, predicted_labels, average_method="arithmetic")
        ari = adjusted_rand_score(true_labels, predicted_labels)
        return {"AMI": round(ami, 3), "ARI": round(ari, 3)}


class KMeansModel(AbstractModel):
    """K-means clustering with one cluster per distinct true label."""

    def get_fitted_model(self, data, params):
        """Fit ``KMeans`` on ``data``.

        The number of clusters is the number of distinct values in
        ``params["labels"]``; the instance's ``random_state`` seeds the
        estimator.
        """
        n_clusters = np.unique(params["labels"]).size
        estimator = KMeans(n_clusters=n_clusters, random_state=self.random_state)
        return estimator.fit(data)


class AgglomerativeModel(AbstractModel):
    """Agglomerative clustering with one cluster per distinct true label."""

    def get_fitted_model(self, data, params):
        """Fit ``AgglomerativeClustering`` on ``data``.

        The number of clusters is the number of distinct values in
        ``params["labels"]``.
        """
        n_clusters = np.unique(params["labels"]).size
        estimator = AgglomerativeClustering(n_clusters=n_clusters)
        return estimator.fit(data)


class HDBSCANModel(AbstractModel):
    """HDBSCAN clustering configured from the model parameters."""

    def get_fitted_model(self, data, params):
        """Fit ``HDBSCAN`` on ``data``.

        Reads ``params["min_samples"]`` and ``params["min_cluster_size"]`` and
        forwards them to the estimator under the same names.
        """
        min_samples = params["min_samples"]
        min_cluster_size = params["min_cluster_size"]
        clusterer = HDBSCAN(min_samples=min_samples, min_cluster_size=min_cluster_size)
        return clusterer.fit(data)

In [14]:
def get_data(argv):
    """Load the dataset selected by ``argv``.

    Args:
        argv: Sequence where ``argv[1]`` names the dataset (``"digits"``,
            ``"mnist"``; anything else falls back to the fashion loader) and
            ``argv[2]`` is converted with ``int`` and passed to the loader
            (presumably the number of samples -- confirm against the loaders).

    Returns:
        The tuple ``(data, labels, label_names, dataset_name)`` produced by
        the chosen loader.
    """
    dataset = argv[1]
    loader = (
        load_digits_ if dataset == "digits"
        else load_mnist if dataset == "mnist"
        else load_fashion
    )
    data, labels, label_names, dataset_name = loader(int(argv[2]))
    return data, labels, label_names, dataset_name

In [15]:
def get_err_root_node():
    """Build the error-generation tree: an ``Array`` node with two filters.

    The filters are added in this order: ``GaussianNoise`` (parameter names
    ``"mean"`` and ``"std"``), then ``Clip`` (parameter names ``"min_val"``
    and ``"max_val"``). The strings are the keys used by the dicts returned
    from ``get_err_params_list``.
    """
    root = Array()

    noise_filter = GaussianNoise("mean", "std")
    root.addfilter(noise_filter)

    clip_filter = Clip("min_val", "max_val")
    root.addfilter(clip_filter)

    return root

In [16]:
def get_err_params_list(data, n_steps=8):
    """Build one error-parameter dict per noise level.

    The standard deviation is swept over ``n_steps`` evenly spaced values
    from 0 to the maximum value found in ``data`` (both ends included). Every
    dict carries the same ``mean`` (0) and the data's own minimum and maximum
    as ``min_val`` / ``max_val``.

    Args:
        data: Array-like whose global minimum and maximum define the bounds.
        n_steps: Number of noise levels to generate. Defaults to 8, the value
            that used to be hard-coded.

    Returns:
        List of ``n_steps`` dicts with the keys ``"mean"``, ``"std"``,
        ``"min_val"`` and ``"max_val"``.
    """
    min_val = np.amin(data)
    max_val = np.amax(data)
    return [
        {"mean": 0, "std": std, "min_val": min_val, "max_val": max_val}
        for std in np.linspace(0, max_val, num=n_steps)
    ]

In [17]:
def get_model_params_dict_list(data, labels):
    """Describe the models to run and the parameter sets for each of them.

    ``KMeansModel`` and ``AgglomerativeModel`` get a single parameter set
    holding only the labels. ``HDBSCANModel`` gets one set for every
    combination of ``min_cluster_size`` (derived from the number of rows in
    ``data``) and ``min_samples``.

    Args:
        data: Array with a ``shape`` attribute; ``shape[0]`` is taken as the
            number of samples.
        labels: True labels, stored under ``"labels"`` in every parameter set.

    Returns:
        List of dicts, each with the keys ``"model"`` and ``"params_list"``.
    """
    n_data = data.shape[0]
    divs = [12, 25, 50]
    # On small datasets round(n_data / div) can drop to 0 or 1, which HDBSCAN
    # rejects (min_cluster_size must be at least 2), so clamp to 2. Clamping
    # can make steps coincide; dict.fromkeys drops the duplicates while
    # keeping the original order. Large datasets are unaffected.
    min_cluster_size_steps = list(dict.fromkeys(max(2, round(n_data / div)) for div in divs))
    min_samples_steps = [1, 10]
    hdbscan_params_list = [
        {
            "min_cluster_size": min_cluster_size,
            "min_samples": min_samples,
            "labels": labels,
        }
        for min_cluster_size in min_cluster_size_steps
        for min_samples in min_samples_steps
    ]
    return [
        {"model": KMeansModel, "params_list": [{"labels": labels}]},
        {"model": AgglomerativeModel, "params_list": [{"labels": labels}]},
        {"model": HDBSCANModel, "params_list": hdbscan_params_list},
    ]

In [18]:
def visualize(df, label_names, dataset_name, data):
    """Draw the three result plots for a finished run.

    Args:
        df: Results frame returned by ``runner.run``.
        label_names: Class names forwarded to ``visualize_classes``.
        dataset_name: Name interpolated into every plot title.
        data: Original data; only ``data.shape[0]`` is used, in a title.
    """
    # All three plots are keyed on the "std" error parameter.
    err_param = "std"

    # The [True, True] lists are forwarded as-is
    # (presumably "higher is better" per score -- confirm in plotting_utils).
    visualize_scores(
        df,
        ["AMI", "ARI"],
        [True, True],
        err_param,
        f"{dataset_name} clustering scores with added gaussian noise",
    )

    visualize_best_model_params(
        df,
        "HDBSCAN #1",
        ["min_cluster_size", "min_samples"],
        ["AMI", "ARI"],
        [True, True],
        err_param,
        f"Best parameters for {dataset_name} clustering",
    )

    n_samples = data.shape[0]
    visualize_classes(
        df,
        label_names,
        err_param,
        "reduced_data",
        "labels",
        "tab10",
        f"{dataset_name} (n={n_samples}) classes with added gaussian noise",
    )


In [19]:
def main(argv):
    """Run the clustering experiment described by ``argv`` and show the results.

    Args:
        argv: Exactly three items: a placeholder for the program name (never
            read), the dataset name (``"digits"``, ``"mnist"`` or
            ``"fashion"``) and a value convertible to ``int`` that is passed
            on to the dataset loader.

    Raises:
        SystemExit: With a usage message (and therefore a non-zero exit
            status) when ``argv`` is malformed.
    """
    datasets = ("digits", "mnist", "fashion")
    usage = "usage: main([<prog>, <digits|mnist|fashion>, <integer>])"

    # sys.exit replaces the interactive-only exit() helper, and the message
    # gives a failure status where exit(0) reported success on bad input.
    if len(argv) != 3 or argv[1] not in datasets:
        sys.exit(usage)
    try:
        int(argv[2])
    except (TypeError, ValueError):
        # Fail here with the usage text rather than deep inside get_data.
        sys.exit(usage)

    data, labels, label_names, dataset_name = get_data(argv)

    df = runner.run(
        train_data=None,
        test_data=data,
        preproc=Preprocessor,
        preproc_params=None,
        err_root_node=get_err_root_node(),
        err_params_list=get_err_params_list(data),
        model_params_dict_list=get_model_params_dict_list(data, labels),
    )

    # "labels" and "reduced_data" are passed as the second argument
    # (presumably columns to leave out of the printed tables -- confirm).
    print_results_by_model(df, ["labels", "reduced_data"])
    visualize(df, label_names, dataset_name, data)

In [20]:
# Run the experiment on the "digits" dataset. The first element stands in for
# the program name (main() never reads it); "1797" is converted to int and
# passed to the dataset loader.
main(["", "digits", "1797"])

  0%|          | 0/8 [00:00<?, ?it/s] 12%|█▎        | 1/8 [00:31<03:37, 31.05s/it] 25%|██▌       | 2/8 [00:31<02:10, 21.82s/it] 38%|███▊      | 3/8 [00:32<01:17, 15.48s/it] 50%|█████     | 4/8 [00:33<00:45, 11.42s/it] 62%|██████▎   | 5/8 [00:54<00:42, 14.09s/it]100%|██████████| 8/8 [00:55<00:00,  9.95s/it]


Agglomerative #1
     AMI    ARI  max_val  mean  min_val        std  time_used_err  time_used_mod  time_used_preproc
0  0.901  0.834     16.0     0      0.0   0.000000       0.049387       0.120157          30.463160
1  0.854  0.790     16.0     0      0.0   2.285714       0.013834       0.115815          30.609410
2  0.753  0.652     16.0     0      0.0   4.571429       0.024574       0.121857          30.723765
3  0.488  0.388     16.0     0      0.0   6.857143       0.070603       0.641763          31.091359
4  0.302  0.218     16.0     0      0.0   9.142857       0.009290       0.118830          22.535472
5  0.185  0.119     16.0     0      0.0  11.428571       0.011028       0.112134          21.923259
6  0.105  0.061     16.0     0      0.0  13.714286       0.016952       0.114525          21.552048
7  0.067  0.038     16.0     0      0.0  16.000000       0.032740       0.143269          20.492355
HDBSCAN #1
      AMI    ARI  max_val  mean  min_cluster_size  min_samples  min_val 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>