# Development Notebook

I use this notebook as a development tool.

# Preliminaries

In [1]:
%load_ext lab_black

## Imports

In [2]:
import numpy as np
import pandas as pd
import os

import sklearn
from sklearn.datasets import load_iris

In [3]:
from pathlib import Path

In [4]:
import elki_interface

In [5]:
from elki_interface import ELKI_FILEPATH

# Display the path of the ELKI jar that elki_interface exposes as ELKI_FILEPATH.
ELKI_FILEPATH

PosixPath('/home/zissou/repos/elki_interface/src/elki_interface/elki.jar')

In [6]:
from elki_interface import Hics

In [7]:
# NOTE(review): leftover post-mortem debugging magic. With no preceding exception
# it only logs "No traceback has been produced"; consider removing this cell
# before sharing the notebook.
%debug

ERROR:root:No traceback has been produced, nothing to debug.


# Data

In [8]:
# Iris is used as a small reference dataset: features in X, class labels in y.
iris = load_iris()
X, y = iris["data"], iris["target"]

# Features with the target appended as one extra trailing column.
matrix = np.column_stack((X, y))

# Problematic Datasets

In [9]:
# Raw CSV files are expected in ../data/raw relative to the current working directory.
data_dir = Path().absolute().parent / "data" / "raw"

# Sorted so the dataset order (and the displayed keys) is deterministic.
fps = sorted(data_dir.glob("*.csv"))

# Files with "hidim" in their name are read with a header row (later cells select
# their "Y" column by name); all other files are read headerless.
# Only the file name is tested: matching against the full path would treat every
# file as "hidim" if any parent directory happened to contain that substring.
datasets = {
    fp.stem: pd.read_csv(fp) if "hidim" in fp.name else pd.read_csv(fp, header=None)
    for fp in fps
}
datasets.keys()

dict_keys(['hepatitis_hidim', 'parkinson_hidim', 'parkinson_lodim'])

# Sandbox

This is where functions and classes are tested.

In [10]:
# One HiCS detector instance, reused (re-fitted) for every dataset below.
# k=8 shows up as `-lof.k 8` in the generated ELKI command (see `clf.fit_command`).
# NOTE(review): contamination presumably sets the fraction of samples labelled as
# outliers -- confirm against the elki_interface implementation.
clf = Hics(k=8, contamination=0.05, verbose=True)

## Iris

This works: the fit on the Iris data completes successfully.

In [11]:
# Load Iris afresh and rebind X to its feature matrix for the fit below.
iris = load_iris()
X = iris["data"]

In [12]:
# Check the dimensions (rows, columns) of the feature matrix before fitting.
X.shape

(150, 4)

In [13]:
# Fit the detector on the Iris features. The log lines in the output come from
# Prefect, which reports a 'fit' flow containing a ShellTask.
clf.fit(X)

[2020-12-04 19:35:44] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-04 19:35:44] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-04 19:35:44] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': finished task run for task with final state: 'Success'


[2020-12-04 19:35:44] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [14]:
# Scores produced by the fit above.
# NOTE(review): this dumps the whole array; a slice or summary would be easier to read.
clf.scores

array([3.92060449, 3.9714986 , 4.20191892, 3.99625246, 3.93836452,
       4.6406238 , 4.11992596, 3.86088215, 4.32747645, 4.037644  ,
       4.19854531, 4.25285516, 4.01922113, 6.06074837, 5.54637646,
       5.45528248, 5.0597396 , 3.94578294, 5.02861552, 4.35185651,
       5.04532432, 4.60644204, 7.37667419, 5.45855489, 6.89427353,
       4.08112094, 4.72644208, 3.93317622, 3.95886283, 4.07314791,
       4.07403064, 4.60639639, 4.55634273, 4.8755275 , 3.99752237,
       4.94362967, 4.47687679, 4.11025738, 4.32509206, 3.88697588,
       4.11337891, 7.68223755, 4.26259517, 5.84769398, 6.1788514 ,
       4.0262978 , 4.32755772, 4.06442175, 4.27330723, 3.85335617,
       4.38242183, 3.97968044, 4.38738581, 4.13770832, 3.92582756,
       4.32149579, 4.1553098 , 5.62560655, 4.15918533, 4.64446475,
       5.42062343, 4.38940256, 4.82811573, 4.02990535, 5.34138774,
       3.97448338, 4.09627032, 4.37095666, 4.79801478, 4.1771353 ,
       4.10509825, 4.13454858, 4.33009928, 4.4533604 , 4.06079

## Hepatitis

In [15]:
# Hepatitis (high-dimensional): the "Y" column holds the labels, every other
# column is used as a feature.
df = datasets["hepatitis_hidim"]
y = df["Y"].values
X = df.drop(columns="Y").values

print(X.shape)

clf.fit(X)

(70, 95)
[2020-12-04 19:35:44] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-04 19:35:45] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-04 19:35:49] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': finished task run for task with final state: 'Success'


[2020-12-04 19:35:49] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [16]:
# Scores from the Hepatitis fit: one value per row of X (70 in the output below).
clf.scores

array([0.99825907, 0.97631549, 1.03843678, 0.99059708, 1.03972804,
       1.11069904, 1.00807743, 1.03523831, 1.06141408, 0.99383399,
       1.01529975, 1.16013941, 1.14642453, 1.04997765, 0.99469787,
       0.96366583, 0.99996103, 1.00835031, 1.02100668, 0.97234597,
       1.08827516, 1.0328575 , 0.9902499 , 1.02898782, 0.97711229,
       0.99961806, 0.948578  , 0.97787377, 1.20132469, 1.03024961,
       1.00935774, 1.05646899, 0.99502536, 1.01319584, 0.97876648,
       1.03196092, 0.99878253, 0.9829162 , 1.13414079, 0.99952907,
       1.078128  , 0.98379629, 1.02385388, 0.97197533, 0.99745824,
       0.99147905, 1.03971565, 0.99680557, 1.01572192, 1.10072612,
       1.01873769, 1.18523806, 1.15832764, 0.99374167, 0.99200473,
       1.01825094, 1.14837341, 1.06418244, 0.99840154, 0.98516562,
       1.01668015, 1.03934819, 1.03085681, 1.03459065, 0.9720542 ,
       0.96657946, 1.0245213 , 0.98975031, 1.0239703 , 0.97881133])

In [17]:
# Confirm the contamination value the detector was constructed with.
clf.contamination

0.05

In [18]:
# Labels from the "Y" column, for comparison with the scores above.
# NOTE(review): presumably 1 marks the outlier class -- confirm against the dataset.
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1]


## Parkinson HD

In [19]:
# Parkinson (high-dimensional): the "Y" column holds the labels, every other
# column is used as a feature.
df = datasets["parkinson_hidim"]
y = df["Y"].values
X = df.drop(columns="Y").values

print(X.shape)

clf.fit(X)

(50, 110)
[2020-12-04 19:35:49] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-04 19:35:49] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-04 19:36:01] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': finished task run for task with final state: 'Success'


[2020-12-04 19:36:01] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [20]:
# Scores from the Parkinson HD fit: one value per row of X (50 in the output below).
clf.scores

array([22.59202046, 11.85267241,  7.84092958,  8.42474763,  7.4639861 ,
        9.22405084,  9.70911001,  8.55712196,  7.29226851,  7.37720836,
        7.05990715,  7.04810746,  6.7459055 ,  7.60194184,  7.37120502,
        7.69821393,  7.49586582,  7.65679371,  7.74088697,  8.00937689,
        7.40383762,  7.50518735,  7.22112951,  7.13582089,  7.82793508,
        7.80845342,  7.44371197,  7.14719737,  7.42416656,  9.02547744,
        7.26136575,  7.4740309 ,  6.94752681,  7.0930552 ,  7.15473917,
        6.99179499,  7.04811107,  7.51396643,  9.88305982,  9.75782497,
        8.99332186,  8.10756783,  8.05054139,  9.44398831,  8.17236036,
       13.97802345,  8.21595619, 11.64923005,  8.09492104,  7.48374719])

In [21]:
# Binary labels assigned by the detector. In this run the two samples with the
# highest scores (indices 0 and 45) are the ones labelled 1.
clf.labels

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])

In [22]:
# Labels from the "Y" column, for comparison with clf.labels above.
print(y)

[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Parkinson LD

In [23]:
# Parkinson (low-dimensional) was read without a header row, so its columns are
# addressed by position rather than by name.
df = datasets["parkinson_lodim"]

# Features: everything except the last two columns.
# NOTE(review): the second-to-last column is excluded as well -- confirm this is intended.
X = df.iloc[:, :-2].values

# Labels: last column, where the literal string "'yes'" (quotes included) maps to 1
# and anything else maps to 0.
y = np.array([int(v == "'yes'") for v in df.iloc[:, -1].values])

print(X.shape)

clf.fit(X)

(50, 22)
[2020-12-04 19:36:01] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-04 19:36:01] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-04 19:36:13] INFO - prefect.TaskRunner | Task 'ShellTask': finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': finished task run for task with final state: 'Success'


[2020-12-04 19:36:13] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [24]:
# Scores from the Parkinson LD fit: one value per row of X (50 in the output below).
clf.scores

array([122.11745853,  81.13859377,  58.53585862,  60.96458862,
        57.61609331,  61.72467086,  64.6179623 ,  60.38318607,
        48.54565582,  48.93224129,  48.825198  ,  48.1296627 ,
        48.48986192,  62.28582825,  50.01204794,  48.85230951,
        54.84272289,  54.59270921,  59.15923866,  59.47825407,
        56.84281899,  53.99129747,  49.83566842,  48.58499168,
        49.70229179,  54.13547442,  61.704147  ,  47.95466484,
        62.67180517,  63.09678466,  66.17545681,  60.13310494,
        47.5387663 ,  47.67343211,  48.65921923,  47.43522119,
        48.11755912,  48.20882286,  55.60626436,  79.44753843,
        52.86614459,  92.22609659, 102.37243351,  50.78288701,
        63.1151541 ,  79.1624791 ,  63.03799867,  95.04175359,
        67.60459574,  69.95483729])

In [25]:
# Binary labels assigned by the detector. In this run the two samples with the
# highest scores (indices 0 and 42) are the ones labelled 1.
clf.labels

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [26]:
# Labels derived from the last column ("'yes'" -> 1), for comparison with clf.labels above.
print(y)

[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [27]:
# Row indices of the samples the detector labelled 1.
np.where(clf.labels == 1)[0]

array([ 0, 42])

In [28]:
# Show the ELKI command-line string stored on the detector (useful for reproducing
# a run outside the notebook).
clf.fit_command

'java -jar /home/zissou/repos/elki_interface/src/elki_interface/elki.jar KDDCLIApplication -db HashmapDatabase -dbc.in /tmp/tmpx1w8zg5r.csv -algorithm outlier.meta.HiCS -lof.k 8 -evaluator NoAutomaticEvaluation -resulthandler tutorial.outlier.SimpleScoreDumper'