In [1]:
import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

import math
import numpy as np
import pandas as pd

import h5py
import scipy.io as sio

from collections import OrderedDict

from river import anomaly
from river import compose
from river import datasets
from river import metrics
from river import preprocessing

From skmultiflow/river documentation:
> Half-space trees (HST) are an online variant of isolation forests. They work well when anomalies are
    spread out. However, they do not work well if anomalies are packed together in windows.

In [2]:
from itertools import compress, product
def combinations(items):
    """
    Lazily enumerate every subset of ``items`` (the power set).

    Parameters
    ----------
    items
        a sized iterable, e.g. a list or a pandas column index

    Returns
    -------
    generator of set
        one set per 0/1 selection mask over ``items``; the first set
        produced is the empty one and the last one holds every item
    """
    selection_masks = product((0, 1), repeat=len(items))
    return (set(compress(items, selected)) for selected in selection_masks)

In [3]:
def compute_hst_score(x, model, attributes, y=None):
    """
    Score one observation with ``model`` and then train the model on it.

    The observation is scored *before* the model learns from it, so the
    score reflects only what the model has seen so far.

    Parameters
    ----------
    x
        a list or a numpy array holding one value per entry of ``attributes``
    model
        an object exposing ``score_one(features)`` and
        ``learn_one(features[, y])``
    attributes
        the feature names, in the same order as the values in ``x``
    y
        None
        0 for normal data
        1 for outlier

    Returns
    -------
    tuple
        ``(score, model)`` where ``model`` is the very object that was
        passed in, now updated with this observation
    """
    features = dict(zip(attributes, x))
    score = model.score_one(features)

    # learn_one updates the model in place. Its return value must not be
    # rebound to `model`: recent river releases return None from learn_one,
    # which would hand None back to the caller.
    if y is None:
        model.learn_one(features)
    else:
        model.learn_one(features, y)
    return score, model

In [4]:
def compute_hst_score_for_every_subspace(df, epoch=2, label='label', window_size=60):
    """
    Score every row of ``df`` with a fresh Half-Space-Trees pipeline for
    each non-empty subset of the feature columns.

    Parameters
    ----------
    df
        pandas dataframe
    epoch
        int, number of passes over the rows for each subspace; only the
        scores of the last pass are kept
    label
        name of the label column, or a falsy value when ``df`` has none;
        the label column is removed from the features before scoring
    window_size
        int, forwarded to ``anomaly.HalfSpaceTrees``

    Returns
    -------
    dict
        maps a tuple of attribute names to the list of per-row scores
        (in row order) from the final pass; the list is empty when
        ``epoch`` is smaller than 1
    """
    if label:
        # Take the labels as a plain array so that they are looked up by row
        # position; label-based Series indexing would fail or misalign on a
        # frame whose index is not 0..n-1.
        Y = df[label].to_numpy()
        df = df.drop(label, axis=1)
    else:
        Y = None

    attribute_subsets = list(combinations(df.columns))
    scores_dict = {}
    logging.info(f'Total attribute spaces {len(attribute_subsets)}')
    for check, attributes in enumerate(attribute_subsets, start=1):
        logging.info(f'Working on the {check}-attribute space')
        if not attributes:
            # the empty subset has nothing to score
            continue
        # Freeze one ordering of the subset so that the column selection,
        # the feature names and the result key all agree.
        names = list(attributes)
        X = df[names].values
        model = compose.Pipeline(
            preprocessing.MinMaxScaler(),
            anomaly.HalfSpaceTrees(seed=1, window_size=window_size)
        )
        scores = []  # stays empty when epoch < 1
        for _ in range(epoch):
            scores = []  # keep only the scores of the latest pass
            for i, x in enumerate(X):
                y = None if Y is None else Y[i]
                # The pipeline is trained in place, so the model handle that
                # compute_hst_score returns is intentionally not rebound.
                score, _returned_model = compute_hst_score(x, model, names, y=y)
                scores.append(score)
        scores_dict[tuple(names)] = scores
    return scores_dict

In [5]:
def combine_scores_dictionary(scores_dict1, scores_dict2):
    """
    Join two score dictionaries key by key.

    Parameters
    ----------
    scores_dict1
        dict mapping a key to a list of scores; its keys drive the result
    scores_dict2
        dict holding, for every key of ``scores_dict1``, the scores to
        append

    Returns
    -------
    dict
        for each key of ``scores_dict1``, ``scores_dict1[key] +
        scores_dict2[key]`` (a new list when the values are lists);
        the inputs are not modified

    Raises
    ------
    KeyError
        if a key of ``scores_dict1`` is missing from ``scores_dict2``
    """
    # Combine with the matching entry of the second dictionary, not with the
    # dictionary itself (list + dict raises TypeError).
    return {key: scores + scores_dict2[key] for key, scores in scores_dict1.items()}

In [6]:
def get_hst_score(scores_dict, object_id):
    """
    Get the hst scores of one object in every attribute space, ordered
    from the highest score to the lowest.

    Parameters
    ----------
    scores_dict
        dict mapping an attribute space (tuple of attribute names) to the
        per-object list of scores
    object_id
        int, position of the object inside each score list

    Returns
    -------
    OrderedDict
        attribute space -> score of the object, sorted by score in
        descending order; attribute spaces with equal scores keep the
        order they have in ``scores_dict``
    """
    # Keys are used as-is; no string form of the key is built, so attribute
    # names do not have to be strings.
    object_scores = {key: scores[object_id] for key, scores in scores_dict.items()}
    ranked = sorted(object_scores.items(), key=lambda item: item[1], reverse=True)
    return OrderedDict(ranked)

In [7]:
def get_outlying_attributes(object_ordered_dict, num=1, score=False):
    """
    Return the leading entries of an ordered attribute-space mapping.

    Parameters
    ----------
    object_ordered_dict
        mapping of attribute space -> score, already ordered as desired
    num
        int, how many leading entries to return (applied as a list slice)
    score
        bool, when true return (attribute space, score) pairs instead of
        only the attribute spaces

    example :
        >> get_outlying_attributes(object_ordered_dict, num=2)
        >> [('sepal length (cm)', 'petal width (cm)'), ('sepal length (cm)',)]
    """
    selected_view = object_ordered_dict.items() if score else object_ordered_dict.keys()
    return list(selected_view)[:num]

In [8]:
def generate_outlying_attribute_hst(df, label, num=1, score=False, epoch=2, remove_col_1=True, outlier_target=1, window_size=60):
    """
    For every outlier row of ``df``, find the attribute spaces in which it
    gets the highest hst score.

    Parameters
    ----------
    df
        pandas dataframe holding the attributes plus the ``label`` column
    label
        string, the column that indicates whether a tuple/object is an outlier or not
    num
        int, number of outlying attribute to find
    score:
        bool, to determine whether to return hst outlier scores or not
    epoch:
        int, number of iteration needed to train the hst model
    remove_col_1
        bool, drop the first column of ``df`` before anything else
    outlier_target
        value of the ``label`` column that marks a row as an outlier
    window_size
        int, forwarded to the hst model

    Returns
    -------
    pandas dataframe
        one row per outlier with the columns ``outlier_indices`` (row
        position of the outlier) and ``outlying_attributes`` (the result
        of ``get_outlying_attributes`` for that row)
    """
    if remove_col_1:
        df = df.drop(df.columns[[0]], axis=1)

    # Row *positions* of the outliers; the score lists are in row order, so
    # positions are what get_hst_score needs. The comparison honours
    # outlier_target instead of a hard-coded 1.
    outlier_indices = list(np.where(df[label] == outlier_target)[0])

    scores_dict = compute_hst_score_for_every_subspace(
        df, epoch=epoch, label=label, window_size=window_size
    )

    outlying_attributes = [
        get_outlying_attributes(get_hst_score(scores_dict, i), num, score)
        for i in outlier_indices
    ]
    return pd.DataFrame({
        'outlier_indices': outlier_indices,
        'outlying_attributes': outlying_attributes,
    })

In [9]:
from sklearn import datasets
def example_3():
    """
    Iris demo: all setosa rows are kept as normal data and the first
    versicolor and first virginica rows are appended as outliers.

    Uses the module-level name ``datasets`` for ``load_iris``.

    Returns
    -------
    tuple
        ``(object_50, object_51)``: the ordered hst scores
        (see ``get_hst_score``) of the rows at positions 50 and 51
    """
    iris = datasets.load_iris()
    iris_df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                           columns=iris['feature_names'] + ['target'])
    setosa = iris_df[iris_df['target'] == 0.0]
    versicolor = iris_df[iris_df['target'] == 1.0]
    virginica = iris_df[iris_df['target'] == 2.0]
    row_df = pd.DataFrame([versicolor.iloc[0], virginica.iloc[0]])
    df = pd.concat([setosa, row_df], ignore_index=True)
    # Collapse the three classes to 0 (setosa) / 1 (anything else). The
    # result is assigned back: an in-place replace on the selected column
    # does not reach `df` when pandas Copy-on-Write is active.
    df["target"] = df["target"].replace({0.0: 0, 1.0: 1, 2.0: 1})
    scores_dict = compute_hst_score_for_every_subspace(df, epoch=2, label='target')
    # NOTE(review): positions 50 and 51 are presumably the two appended
    # rows, which assumes iris has exactly 50 setosa rows — confirm.
    object_50 = get_hst_score(scores_dict, 50)
    object_51 = get_hst_score(scores_dict, 51)
    return object_50, object_51
# Run the iris demo, then show the two highest-scoring attribute spaces
# (names only, no scores) for the object at position 50.
object_50, object_51 = example_3()
get_outlying_attributes(object_50, num=2, score=False)

NumExpr defaulting to 8 threads.
Total attribute spaces 16
Working on the 1-attribute space
Working on the 2-attribute space
Working on the 3-attribute space
Working on the 4-attribute space
Working on the 5-attribute space
Working on the 6-attribute space
Working on the 7-attribute space
Working on the 8-attribute space
Working on the 9-attribute space
Working on the 10-attribute space
Working on the 11-attribute space
Working on the 12-attribute space
Working on the 13-attribute space
Working on the 14-attribute space
Working on the 15-attribute space
Working on the 16-attribute space


[('sepal length (cm)', 'petal width (cm)'), ('sepal length (cm)',)]

In [10]:
def read_matlab_data_file(filepath):
    """
    Load a MATLAB file with ``scipy.io.loadmat`` into a dataframe.

    Parameters
    ----------
    filepath
        path of a .mat file containing the variables ``X`` and ``y``

    Returns
    -------
    pandas dataframe
        the columns of ``X`` named ``A1``, ``A2``, ... followed by a
        ``label`` column filled from ``y``
    """
    contents = sio.loadmat(filepath)
    features = contents['X']
    n_attributes = features.shape[1]
    names = [f'A{position}' for position in range(1, n_attributes + 1)]
    frame = pd.DataFrame(data=features, columns=names)
    frame['label'] = contents['y']
    return frame

In [11]:
def read_matlab_data_with_h5py(filepath):
    """
    Load a MATLAB file through ``h5py`` into a dataframe.

    Parameters
    ----------
    filepath
        path of an HDF5-readable .mat file containing ``X`` and ``y``

    Returns
    -------
    pandas dataframe
        the transpose of ``X`` with columns named ``A1``, ``A2``, ...
        (one per row of the stored ``X``) followed by a ``label`` column
        filled from the transpose of ``y``
    """
    # Open read-only and close the handle as soon as everything has been
    # copied into memory; np.array(...) materialises each item while the
    # file is still open.
    with h5py.File(filepath, 'r') as f:
        mat = {k: np.array(v) for k, v in f.items()}
    # X is stored with one attribute per row, hence shape[0] and the
    # transposes below.
    columns = [f'A{i+1}' for i in range(mat['X'].shape[0])]
    df = pd.DataFrame(data=mat['X'].T, columns=columns)
    df['label'] = mat['y'].T
    return df

In [13]:
def http(window_size):
    """
    Find the outlying attributes for ``../data/odds/http.mat``.

    Parameters
    ----------
    window_size
        int, forwarded to ``generate_outlying_attribute_hst``

    Returns
    -------
    pandas dataframe
        the result of ``generate_outlying_attribute_hst`` (top 2
        attribute spaces per outlier, without scores)
    """
    dataset = read_matlab_data_with_h5py(r'../data/odds/http.mat')
    logging.info(dataset.shape)
    return generate_outlying_attribute_hst(
        dataset,
        label='label',
        num=2,
        score=False,
        epoch=2,
        remove_col_1=False,
        outlier_target=1,
        window_size=window_size,
    )
# result = http(window_size=60)
# result.to_pickle('pickles/http_60.pckle')

In [14]:
def smtp(window_size):
    """
    Find the outlying attributes for ``../data/odds/smtp.mat``.

    Parameters
    ----------
    window_size
        int, forwarded to ``generate_outlying_attribute_hst``

    Returns
    -------
    pandas dataframe
        the result of ``generate_outlying_attribute_hst`` (top 2
        attribute spaces per outlier, without scores)
    """
    dataset = read_matlab_data_with_h5py(r'../data/odds/smtp.mat')
    logging.info(dataset.shape)
    return generate_outlying_attribute_hst(
        dataset,
        label='label',
        num=2,
        score=False,
        epoch=2,
        remove_col_1=False,
        outlier_target=1,
        window_size=window_size,
    )
# result = smtp(window_size=60)
# result.to_pickle('pickles/smtp_60.pickle')

In [15]:
def mammography(window_size):
    """
    Find the outlying attributes for ``../data/odds/mammography.mat``.

    Parameters
    ----------
    window_size
        int, forwarded to ``generate_outlying_attribute_hst``

    Returns
    -------
    pandas dataframe
        the result of ``generate_outlying_attribute_hst`` (top 2
        attribute spaces per outlier, without scores)
    """
    dataset = read_matlab_data_file(r'../data/odds/mammography.mat')
    logging.info(dataset.shape)
    return generate_outlying_attribute_hst(
        dataset,
        label='label',
        num=2,
        score=False,
        epoch=2,
        remove_col_1=False,
        outlier_target=1,
        window_size=window_size,
    )
# result = mammography(window_size=60)
# result.to_pickle('pickles/mammography_60.pickle')

In [16]:
def shuttle(window_size):
    """
    Find the outlying attributes for ``../data/odds/shuttle.mat``.

    Parameters
    ----------
    window_size
        int, forwarded to ``generate_outlying_attribute_hst``

    Returns
    -------
    pandas dataframe
        the result of ``generate_outlying_attribute_hst`` (top 2
        attribute spaces per outlier, without scores)
    """
    dataset = read_matlab_data_file(r'../data/odds/shuttle.mat')
    logging.info(dataset.shape)
    return generate_outlying_attribute_hst(
        dataset,
        label='label',
        num=2,
        score=False,
        epoch=2,
        remove_col_1=False,
        outlier_target=1,
        window_size=window_size,
    )
# result = shuttle(window_size=60)
# result.to_pickle('pickles/shuttle_60.pickle')

In [18]:
# Count how many non-empty attribute subspaces the pendigits data would
# require, to judge whether the exhaustive search is feasible.
filepath = r'../data/odds/pendigits.mat'
df = read_matlab_data_file(filepath)
logging.info(df.shape)
df = df.drop('label', axis=1)
# Lazy generator over all subsets; it is deliberately not consumed here.
attribute_space = combinations(df.columns)
# n attributes have 2**n subsets; subtract the empty one. Computing the
# count directly avoids building every subset just to measure the total.
logging.info(f'total attribute combinations {2 ** len(df.columns) - 1}')

(6870, 17)
total attribute combinations 65535


In [19]:
def wine(window_size):
    """
    Find the outlying attributes for ``../data/odds/wine.mat``.

    Parameters
    ----------
    window_size
        int, forwarded to ``generate_outlying_attribute_hst``

    Returns
    -------
    pandas dataframe
        the result of ``generate_outlying_attribute_hst`` (top 2
        attribute spaces per outlier, without scores)
    """
    dataset = read_matlab_data_file(r'../data/odds/wine.mat')
    logging.info(dataset.shape)
    return generate_outlying_attribute_hst(
        dataset,
        label='label',
        num=2,
        score=False,
        epoch=2,
        remove_col_1=False,
        outlier_target=1,
        window_size=window_size,
    )
# result = wine(window_size=60)
# result.to_pickle('pickles/wine_60.pickle')