In [1]:
import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

import math
import numpy as np
import pandas as pd

import h5py
import scipy.io as sio

from collections import OrderedDict

In [2]:
from itertools import compress, product
def combinations(items):
    """Lazily enumerate every subset of ``items`` (its power set).

    Each subset is produced as a ``set``. The empty subset is included and
    comes first, so callers that only want non-empty subsets must discount
    one element.

    Args:
        items: A sized iterable (``len`` is called on it).

    Returns:
        A generator yielding ``2 ** len(items)`` sets.
    """
    n_items = len(items)
    # Every 0/1 tuple of length n_items selects one subset of the items.
    selections = product((0, 1), repeat=n_items)
    return (set(compress(items, picked)) for picked in selections)

def read_matlab_data_file(filepath):
    """Load a ``.mat`` file with ``scipy.io.loadmat`` into a DataFrame.

    The file must contain a 2-D variable ``X`` (one row per sample, one
    column per attribute) and a variable ``y`` holding the labels.

    Args:
        filepath: Path of the ``.mat`` file to read.

    Returns:
        A DataFrame whose attribute columns are named ``A1`` .. ``An`` in the
        order of ``X``'s columns, followed by a ``label`` column taken from ``y``.
    """
    contents = sio.loadmat(filepath)
    features = contents['X']
    n_attributes = features.shape[1]
    # Attribute names are 1-based: A1, A2, ...
    names = [f'A{k}' for k in range(1, n_attributes + 1)]
    frame = pd.DataFrame(features, columns=names)
    frame['label'] = contents['y']
    return frame

def read_matlab_data_with_h5py(filepath):
    """Load an HDF5-backed ``.mat`` file with ``h5py`` into a DataFrame.

    The file must expose datasets ``X`` and ``y``. ``X`` is read with shape
    ``(n_attributes, n_samples)`` and is transposed so that each row of the
    result is one sample.

    The file is opened read-only and is closed before returning, so no
    HDF5 handle is leaked and the file on disk can never be modified.

    Args:
        filepath: Path of the ``.mat`` file to read.

    Returns:
        A DataFrame whose attribute columns are named ``A1`` .. ``An`` in the
        order of ``X``'s rows, followed by a ``label`` column taken from ``y``.

    Raises:
        KeyError: If ``X`` or ``y`` is missing from the file.
    """
    # Explicit 'r': older h5py releases defaulted to a writable mode.
    with h5py.File(filepath, 'r') as f:
        # Copy into memory while the file is still open.
        features = np.array(f['X'])
        labels = np.array(f['y'])

    columns = [f'A{i+1}' for i in range(features.shape[0])]
    df = pd.DataFrame(data=features.T, columns=columns)
    # Flatten so the label is a plain 1-D column, one value per sample.
    df['label'] = labels.T.ravel()
    return df


In [3]:
# One accumulator per summary column; each dataset section below appends
# exactly one entry to every list.
dataset, total_data, n_features, total_search_space, n_outliers = (
    [] for _ in range(5)
)

## ForestCover Dataset

http://odds.cs.stonybrook.edu/forestcovercovertype-dataset/


In [4]:
# Load the raw data and log its shape while the label column is still present.
filepath = r'../data/odds/cover.mat'
df = read_matlab_data_file(filepath)
logging.info(df.shape)

# Only the attribute columns take part in the subspace search.
df = df.drop(columns='label')

# Count all attribute subsets; the -1 discounts the empty subset.
attribute_space = combinations(df.columns)
n_search_space = sum(1 for _ in attribute_space) - 1
logging.info(f'total attribute combinations {n_search_space}')

(286048, 11)
total attribute combinations 1023


Currently we only generate ground truth for the outliers included in the first 10% of the dataset. 

In [5]:
# Load the pickled outlier table for ForestCover; the displayed output shows
# the columns `outlier_indices` and `outlying_attributes`.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
cover = pd.read_pickle('pickles/forest_cover_60_10th.pickle')
cover

Unnamed: 0,outlier_indices,outlying_attributes
0,825,"[(A1,), (A2, A1)]"
1,828,"[(A1,), (A2, A1)]"
2,831,"[(A1,), (A2, A1)]"
3,833,"[(A1,), (A2, A1)]"
4,834,"[(A1,), (A2, A1)]"
...,...,...
2155,4114,"[(A3,), (A7,)]"
2156,4115,"[(A2,), (A8,)]"
2157,4117,"[(A2,), (A7,)]"
2158,4121,"[(A2,), (A8,)]"


In [6]:
# Record the ForestCover row of the summary. Only a tenth of the rows
# (rounded down) is counted as data for this dataset.
dataset.append('forest cover')
total_data.append(len(df) // 10)
n_features.append(len(df.columns))
total_search_space.append(n_search_space)
n_outliers.append(len(cover))

## Http (KDDCup1999) Dataset
http://odds.cs.stonybrook.edu/http-kddcup99-dataset/ 

In [7]:
# Load the raw data and log its shape while the label column is still present.
filepath = r'../data/odds/http.mat'
df = read_matlab_data_with_h5py(filepath)
logging.info(df.shape)

# Only the attribute columns take part in the subspace search.
df = df.drop(columns='label')

# Count all attribute subsets; the -1 discounts the empty subset.
attribute_space = combinations(df.columns)
n_search_space = sum(1 for _ in attribute_space) - 1
logging.info(f'total attribute combinations {n_search_space}')

(567498, 4)
total attribute combinations 7


In [8]:
# Load the pickled outlier table for Http; the displayed output shows the
# columns `outlier_indices` and `outlying_attributes`.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
http = pd.read_pickle('pickles/http_60.pickle')
http

Unnamed: 0,outlier_indices,outlying_attributes
0,201669,"[(A3,), (A2,)]"
1,311452,"[(A2,), (A2, A3)]"
2,311453,"[(A2,), (A2, A3)]"
3,311454,"[(A2,), (A2, A3)]"
4,311455,"[(A2,), (A2, A3)]"
...,...,...
2206,514435,"[(A3,), (A2,)]"
2207,514436,"[(A3,), (A2,)]"
2208,514437,"[(A3,), (A2,)]"
2209,514438,"[(A3,), (A2,)]"


In [9]:
# Record the Http row of the summary (all rows are counted as data).
dataset.append('http')
total_data.append(len(df))
n_features.append(len(df.columns))
total_search_space.append(n_search_space)
n_outliers.append(len(http))

## Smtp (KDDCup1999) Dataset
http://odds.cs.stonybrook.edu/smtp-kddcup99-dataset/

In [10]:
# Load the raw data and log its shape while the label column is still present.
filepath = r'../data/odds/smtp.mat'
df = read_matlab_data_with_h5py(filepath)
logging.info(df.shape)

# Only the attribute columns take part in the subspace search.
df = df.drop(columns='label')

# Count all attribute subsets; the -1 discounts the empty subset.
attribute_space = combinations(df.columns)
n_search_space = sum(1 for _ in attribute_space) - 1
logging.info(f'total attribute combinations {n_search_space}')

(95156, 4)
total attribute combinations 7


In [11]:
# Load the pickled outlier table for Smtp; the displayed output shows the
# columns `outlier_indices` and `outlying_attributes`.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
smtp = pd.read_pickle('pickles/smtp_60.pickle')
smtp

Unnamed: 0,outlier_indices,outlying_attributes
0,14691,"[(A2,), (A1, A2)]"
1,14741,"[(A2,), (A1, A2)]"
2,14788,"[(A2,), (A1, A2)]"
3,14832,"[(A2,), (A1, A2)]"
4,14887,"[(A2,), (A1, A2)]"
5,14966,"[(A2,), (A1, A2)]"
6,15015,"[(A2,), (A1, A2)]"
7,15042,"[(A2,), (A1, A2)]"
8,15098,"[(A2,), (A1, A2)]"
9,15164,"[(A2,), (A1, A2)]"


In [12]:
# Record the Smtp row of the summary (all rows are counted as data).
dataset.append('smtp')
total_data.append(len(df))
n_features.append(len(df.columns))
total_search_space.append(n_search_space)
n_outliers.append(len(smtp))

## Mammography Dataset
http://odds.cs.stonybrook.edu/mammography-dataset/

In [13]:
# Load the raw data and log its shape while the label column is still present.
filepath = r'../data/odds/mammography.mat'
df = read_matlab_data_file(filepath)
logging.info(df.shape)

# Only the attribute columns take part in the subspace search.
df = df.drop(columns='label')

# Count all attribute subsets; the -1 discounts the empty subset.
attribute_space = combinations(df.columns)
n_search_space = sum(1 for _ in attribute_space) - 1
logging.info(f'total attribute combinations {n_search_space}')

(11183, 7)
total attribute combinations 63


In [14]:
# Load the pickled outlier table for Mammography; the displayed output shows
# the columns `outlier_indices` and `outlying_attributes`.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
mammography = pd.read_pickle('pickles/mammography_60.pickle')
mammography

Unnamed: 0,outlier_indices,outlying_attributes
0,1093,"[(A6,), (A4,)]"
1,1094,"[(A2,), (A4, A2)]"
2,1095,"[(A6,), (A4,)]"
3,1096,"[(A5,), (A4, A5)]"
4,1097,"[(A5,), (A4, A5)]"
...,...,...
255,11178,"[(A5,), (A4,)]"
256,11179,"[(A6,), (A5,)]"
257,11180,"[(A2,), (A5,)]"
258,11181,"[(A6,), (A4,)]"


In [15]:
# Record the Mammography row of the summary (all rows are counted as data).
dataset.append('mammography')
total_data.append(len(df))
n_features.append(len(df.columns))
total_search_space.append(n_search_space)
n_outliers.append(len(mammography))

## Shuttle Dataset
http://odds.cs.stonybrook.edu/shuttle-dataset/

In [16]:
# Load the raw data and log its row count.
filepath = r'../data/odds/shuttle.mat'
df = read_matlab_data_file(filepath)
logging.info(len(df))

# Only the attribute columns take part in the subspace search.
df = df.drop(columns='label')

# Count all attribute subsets; the -1 discounts the empty subset.
attribute_space = combinations(df.columns)
n_search_space = sum(1 for _ in attribute_space) - 1
logging.info(f'total attribute combinations {n_search_space}')

49097
total attribute combinations 511


In [17]:
# Load the pickled outlier table for Shuttle; the displayed output shows the
# columns `outlier_indices` and `outlying_attributes`.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
shuttle = pd.read_pickle('pickles/shuttle_60.pickle')
shuttle

Unnamed: 0,outlier_indices,outlying_attributes
0,0,"[(A1,), (A5,)]"
1,4,"[(A1,), (A7, A1)]"
2,18,"[(A1,), (A1, A5)]"
3,28,"[(A1,), (A7, A1)]"
4,38,"[(A1,), (A7, A1)]"
...,...,...
3506,49034,"[(A5,), (A9,)]"
3507,49045,"[(A1,), (A1, A3)]"
3508,49047,"[(A1,), (A1, A5)]"
3509,49091,"[(A1,), (A6, A1)]"


In [18]:
# Record the Shuttle row of the summary (all rows are counted as data).
dataset.append('shuttle')
total_data.append(len(df))
n_features.append(len(df.columns))
total_search_space.append(n_search_space)
n_outliers.append(len(shuttle))

## Wine Dataset
http://odds.cs.stonybrook.edu/wine-dataset/

In [19]:
# Load the raw data and log its shape while the label column is still present.
filepath = r'../data/odds/wine.mat'
df = read_matlab_data_file(filepath)
logging.info(df.shape)

# Only the attribute columns take part in the subspace search.
df = df.drop(columns='label')

# Count all attribute subsets; the -1 discounts the empty subset.
attribute_space = combinations(df.columns)
n_search_space = sum(1 for _ in attribute_space) - 1
logging.info(f'total attribute combinations {n_search_space}')

(129, 14)
total attribute combinations 8191


In [20]:
# Load the pickled outlier table for Wine; the displayed output shows the
# columns `outlier_indices` and `outlying_attributes`.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files from a trusted source.
wine = pd.read_pickle('pickles/wine_60.pickle')
wine

Unnamed: 0,outlier_indices,outlying_attributes
0,0,"[(A13,), (A13, A10)]"
1,1,"[(A13,), (A1, A13)]"
2,2,"[(A11,), (A4, A13, A11)]"
3,3,"[(A13,), (A12, A13)]"
4,4,"[(A12, A6, A13), (A1, A6, A12, A13)]"
5,5,"[(A1, A7), (A1,)]"
6,6,"[(A12,), (A4, A12, A13)]"
7,7,"[(A13, A11), (A2, A13, A9, A11)]"
8,8,"[(A13,), (A6, A13)]"
9,9,"[(A13,), (A13, A11)]"


In [21]:
# Select the `outlying_attributes` entries of the rows whose outlier index is 5.
# A boolean-mask `.loc` lookup yields a Series, not a scalar.
result = wine.loc[wine["outlier_indices"].eq(5), "outlying_attributes"]
type(result)

NumExpr defaulting to 8 threads.


pandas.core.series.Series

In [22]:
# Record the Wine row of the summary (all rows are counted as data).
dataset.append('wine')
total_data.append(len(df))
n_features.append(len(df.columns))
total_search_space.append(n_search_space)
n_outliers.append(len(wine))

## Summary

In [23]:
# Gather the per-dataset accumulators into one column-name -> values mapping.
summary = dict(
    dataset=dataset,
    total_data=total_data,
    n_features=n_features,
    total_search_space=total_search_space,
    n_outliers=n_outliers,
)

In [24]:
# One row per dataset, shown from the smallest to the largest data size.
summary_df = pd.DataFrame(summary)
summary_df.sort_values('total_data')

Unnamed: 0,dataset,total_data,n_features,total_search_space,n_outliers
5,wine,129,13,8191,10
3,mammography,11183,6,63,260
0,forest cover,28604,10,1023,2160
4,shuttle,49097,9,511,3511
2,smtp,95156,3,7,30
1,http,567498,3,7,2211


In [25]:
# Same table, ordered by the number of attribute combinations to search.
summary_df.sort_values('total_search_space')

Unnamed: 0,dataset,total_data,n_features,total_search_space,n_outliers
1,http,567498,3,7,2211
2,smtp,95156,3,7,30
3,mammography,11183,6,63,260
4,shuttle,49097,9,511,3511
0,forest cover,28604,10,1023,2160
5,wine,129,13,8191,10
