# Development Notebook

I use this notebook as a development tool.

# Preliminaries

In [1]:
%load_ext lab_black

## Imports

In [2]:
import numpy as np
import pandas as pd
import os

import sklearn
from sklearn.datasets import load_iris

In [3]:
from pathlib import Path

In [4]:
import elki_interface

In [5]:
from elki_interface import ELKI_FILEPATH

ELKI_FILEPATH

PosixPath('/home/zissou/repos/elki_interface/src/elki_interface/elki.jar')

In [6]:
from elki_interface import Hics

In [7]:
# NOTE(review): leftover post-mortem debugger call. In the recorded run there
# was no traceback to inspect, so this cell does nothing useful and can be removed.
%debug

ERROR:root:No traceback has been produced, nothing to debug.


# Data

In [8]:
# Load the Iris bunch and pull out the feature matrix and the class labels.
iris = load_iris()

X, y = iris["data"], iris["target"]

# Features with the target appended as the last column.
matrix = np.column_stack((X, y))

# Problematic Datasets

In [9]:
# Raw CSV files are expected in <parent of the working directory>/data/raw.
data_dir = Path().absolute().parent / "data" / "raw"

# sorted() keeps the dataset order deterministic regardless of file-system order.
fps = sorted(data_dir.glob("*.csv"))

# Files whose name contains "hidim" are read with a header row (their frames
# are later indexed by the "Y" column); all other files are read header-less.
# The check looks at the file stem only, so a parent directory whose name
# happens to contain "hidim" cannot flip the parsing mode for every file.
datasets = {
    fp.stem: pd.read_csv(fp, header="infer" if "hidim" in fp.stem else None)
    for fp in fps
}
datasets.keys()

dict_keys(['hepatitis_hidim', 'parkinson_hidim', 'parkinson_lodim', 'parkison_lodim_02'])

# Sandbox

This is where functions and classes are tested.

In [10]:
# One detector instance is created here and re-fitted on every dataset below.
# k=8 surfaces as `-lof.k 8` in the generated ELKI command (see `clf.fit_command`).
# NOTE(review): contamination=0.05 presumably controls the fraction of samples
# that end up labelled 1 — confirm against the elki_interface implementation.
clf = Hics(k=8, contamination=0.05, verbose=True)

## Iris

Fitting on the Iris data works.

In [11]:
# Reload Iris so that `X` is rebound to the plain Iris feature matrix
# (this repeats the load done in the "Data" section).
iris = load_iris()

X = iris["data"]

In [12]:
# Check the dimensions of the Iris feature matrix before fitting.
X.shape

(150, 4)

In [13]:
# Fit the shared `clf` instance on the Iris features.
clf.fit(X)

[2020-12-05 11:56:42+0100] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-05 11:56:42+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-05 11:56:43+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': Finished task run for task with final state: 'Success'


[2020-12-05 11:56:43+0100] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [14]:
# Inspect the `scores` attribute of `clf` after the Iris fit.
clf.scores

array([2.98416907, 3.00970019, 3.05189107, 2.97883606, 2.93793776,
       3.4826541 , 3.12943677, 2.91093459, 3.27971282, 3.23053692,
       3.17501646, 3.12486945, 2.93330861, 4.7124936 , 4.12108075,
       4.00820242, 3.61741872, 3.15005977, 3.71219027, 3.42662315,
       3.80473102, 3.67523757, 5.14601329, 4.32050139, 4.50946958,
       2.99529333, 3.74801667, 3.01887214, 3.02152007, 2.98515275,
       3.01288077, 3.62597436, 3.72966674, 3.29632634, 3.00127033,
       3.33991176, 3.29758528, 3.2957921 , 3.230999  , 2.95310972,
       3.06507547, 4.38971726, 3.17038214, 4.46590409, 4.68415314,
       2.98388579, 3.14127389, 3.05857995, 3.30822992, 2.92115927,
       3.31984592, 2.94375741, 3.27120954, 3.12836267, 3.00006135,
       3.13464058, 3.01289926, 4.44752396, 3.19145995, 4.1083091 ,
       4.30650015, 3.25429394, 3.91823128, 3.01570925, 3.60803876,
       3.03004857, 3.29272267, 3.41281846, 3.24798484, 3.28633164,
       3.09596641, 3.02393959, 3.0971007 , 3.3321903 , 3.06485

## Hepatitis

In [15]:
# Hepatitis (high-dimensional variant): every column except "Y" is used as a
# feature; "Y" itself is kept separately in `y`.
df = datasets["hepatitis_hidim"]
X = df.loc[:, df.columns != "Y"].to_numpy()
y = df["Y"].to_numpy()

print(X.shape)

clf.fit(X)

(70, 95)
[2020-12-05 11:56:43+0100] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-05 11:56:43+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-05 11:56:57+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': Finished task run for task with final state: 'Success'


[2020-12-05 11:56:57+0100] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [16]:
# Inspect the `scores` attribute of `clf` after the hepatitis fit
# (the recorded output holds 70 values, matching the 70 rows of X).
clf.scores

array([1.08090157, 1.02993063, 1.0060091 , 1.01387941, 0.99484316,
       1.09968231, 1.00290502, 0.98521475, 0.96905462, 0.98384016,
       1.01912587, 1.09047177, 1.10887601, 1.02040623, 1.05637999,
       0.99167379, 0.99481854, 1.00610286, 1.03499182, 1.00608641,
       1.02779054, 1.00963669, 1.04133423, 1.01605345, 0.96635863,
       0.98160455, 1.02233758, 1.0312645 , 1.07097636, 1.01483506,
       1.01462735, 1.02400833, 1.00635475, 0.98926791, 1.05752417,
       1.00878429, 1.01365139, 1.00922788, 1.12614853, 0.98975563,
       0.99940736, 0.9853391 , 1.03430203, 0.98792945, 0.97746248,
       0.97852983, 0.99308466, 1.01162025, 0.98271738, 0.99158822,
       1.00976035, 1.19838423, 1.02961213, 0.9896861 , 1.02552996,
       0.98760749, 1.02441055, 0.98816887, 1.02398834, 1.02785999,
       0.98916502, 0.98534177, 0.98820512, 1.00989067, 0.97397051,
       0.98852647, 0.99495594, 0.99316531, 0.98637279, 1.05575966])

In [17]:
# Confirm the contamination value carried by `clf` (passed at construction).
clf.contamination

0.05

In [18]:
# Print the values of the "Y" column extracted above, presumably to eyeball
# them against the scores.
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1]


## Parkinson HD

In [19]:
# Parkinson (high-dimensional variant): every column except "Y" is used as a
# feature; "Y" itself is kept separately in `y`.
df = datasets["parkinson_hidim"]
X = df.loc[:, df.columns != "Y"].to_numpy()
y = df["Y"].to_numpy()

print(X.shape)

clf.fit(X)

(50, 110)
[2020-12-05 11:56:57+0100] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-05 11:56:57+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-05 11:57:04+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': Finished task run for task with final state: 'Success'


[2020-12-05 11:57:04+0100] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [20]:
# Inspect the `scores` attribute of `clf` after the Parkinson HD fit
# (the recorded output holds 50 values, matching the 50 rows of X).
clf.scores

array([1.17883767, 1.50919857, 1.18972222, 1.25312891, 1.23489502,
       1.31028777, 1.32121486, 1.28263983, 0.98114417, 0.9848559 ,
       0.96862549, 0.96736693, 0.99720755, 1.04603946, 1.0456078 ,
       1.00987846, 1.01453798, 1.02639321, 1.01242358, 0.98094137,
       1.05753138, 1.05100487, 1.07973194, 1.07997049, 1.02963689,
       1.07035626, 1.00199652, 0.99885552, 1.02359379, 1.17055334,
       1.15275999, 1.00287902, 0.99453774, 0.98332843, 0.98582988,
       0.96748442, 0.98384548, 1.07857977, 1.43604128, 1.21142765,
       1.39766375, 1.43079317, 1.7459443 , 1.35789397, 1.22121228,
       1.28902769, 1.23451635, 1.87338733, 1.64515537, 1.36298934])

In [21]:
# Inspect the `labels` attribute of `clf`. In the recorded output exactly two
# entries are 1 (indices 42 and 47), which are the two largest scores above.
clf.labels

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.])

In [22]:
# Print the values of the "Y" column extracted above, presumably to compare
# them with `clf.labels`.
print(y)

[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Parkinson LD

In [23]:
# Parkinson (low-dimensional variant, read without a header row):
# features are all columns except the last two; the last column holds the
# quoted strings that are mapped to 1 for "'yes'" and 0 for anything else.
df = datasets["parkinson_lodim"]
X = df.iloc[:, :-2].to_numpy()
y = np.where(df.iloc[:, -1] == "'yes'", 1, 0)

print(X.shape)

clf.fit(X)

(50, 22)
[2020-12-05 11:57:04+0100] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-05 11:57:04+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-05 11:57:13+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': Finished task run for task with final state: 'Success'


[2020-12-05 11:57:13+0100] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [24]:
# Inspect the `scores` attribute of `clf` after the Parkinson LD fit
# (the recorded output holds 50 values, matching the 50 rows of X).
clf.scores

array([29.32475948, 21.95694026, 17.38266336, 18.32086942, 17.21383597,
       18.44672553, 19.38111097, 18.3839736 , 14.94096644, 15.0163616 ,
       14.8214936 , 14.84884354, 14.80141039, 17.85503011, 17.09798886,
       14.76669904, 15.9630236 , 16.05591498, 17.01307461, 18.64893863,
       16.05090279, 15.69021231, 15.88834804, 15.29037542, 15.78630653,
       16.12876093, 16.8875634 , 14.65873752, 17.72934149, 19.77247841,
       18.24397327, 16.95958366, 14.63573792, 14.66500136, 14.80369932,
       14.78585763, 14.73978208, 14.76811392, 19.0584965 , 22.38063197,
       19.07760614, 26.26151387, 31.60963768, 20.18862268, 18.09956587,
       19.24209111, 18.29534979, 28.27657353, 20.38473887, 20.1217524 ])

In [25]:
# Inspect the `labels` attribute of `clf`. In the recorded output exactly two
# entries are 1 (indices 0 and 42), which are the two largest scores above.
clf.labels

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [26]:
# Print the 0/1 vector derived from the last column, presumably to compare it
# with `clf.labels`.
print(y)

[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Parkinson LD 02

In [27]:
# Second Parkinson low-dimensional file (read without a header row):
# features are all columns except the last two; the last column holds the
# quoted strings that are mapped to 1 for "'yes'" and 0 for anything else.
df = datasets["parkison_lodim_02"]
X = df.iloc[:, :-2].to_numpy()
y = np.where(df.iloc[:, -1] == "'yes'", 1, 0)

print(X.shape)

clf.fit(X)

(50, 22)
[2020-12-05 11:57:13+0100] INFO - prefect.FlowRunner | Beginning Flow run for 'fit'


INFO:prefect.FlowRunner:Beginning Flow run for 'fit'


[2020-12-05 11:57:13+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Starting task run...


INFO:prefect.TaskRunner:Task 'ShellTask': Starting task run...


[2020-12-05 11:57:22+0100] INFO - prefect.TaskRunner | Task 'ShellTask': Finished task run for task with final state: 'Success'


INFO:prefect.TaskRunner:Task 'ShellTask': Finished task run for task with final state: 'Success'


[2020-12-05 11:57:22+0100] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


INFO:prefect.FlowRunner:Flow run SUCCESS: all reference tasks succeeded


In [28]:
# Inspect the `scores` attribute of `clf` after the Parkinson LD 02 fit
# (the recorded output holds 50 values, matching the 50 rows of X).
clf.scores

array([147.87330865,  38.23183743,  36.1208883 ,  37.92993383,
        36.72303048,  37.27858963,  38.03564852,  37.73151999,
        31.13367544,  31.0367414 ,  31.14675863,  31.56583984,
        35.57473365,  52.09663919,  35.91498855,  35.92225551,
        36.99679982,  36.45470669,  38.73433422,  39.58930481,
        36.79930068,  35.65224511,  41.39959114,  34.29387565,
        32.15189384,  33.96944072,  31.45562846,  31.58307971,
        50.53051718,  32.41295999,  30.65164468,  41.16364934,
        32.62908595,  40.56584023,  35.18065011,  33.82102422,
        32.25766058,  30.62926229,  31.89018102,  78.04204257,
        69.28443769, 173.94604449, 169.55348486,  34.37014463,
        42.9486013 ,  49.02601878,  41.61385143,  51.78180312,
       117.29926223,  44.96311812])

In [29]:
# Positions of the entries in `clf.labels` that equal 1. np.where returns a
# tuple with one index array per dimension, hence the trailing [0].
np.where(clf.labels == 1)[0]

array([41, 42])

In [30]:
# Show the command string stored on `clf`; per the recorded output it is the
# `java -jar ... elki.jar` invocation, including the temporary CSV input path.
clf.fit_command

'java -jar /home/zissou/repos/elki_interface/src/elki_interface/elki.jar KDDCLIApplication -db HashmapDatabase -dbc.in /tmp/tmpi53e2eeh.csv -algorithm outlier.meta.HiCS -lof.k 8 -evaluator NoAutomaticEvaluation -resulthandler tutorial.outlier.SimpleScoreDumper'