# Example of combining multiple base outlier scores.

Four combination frameworks are demonstrated in this example:

- Average: take the average score across all base detectors as the score
- Maximization: take the maximum score across all base detectors as the score
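- Average of Maximum (AOM): split the base detectors into subgroups, take the maximum score within each subgroup, then average the subgroup scores
- Maximum of Average (MOA): split the base detectors into subgroups, take the average score within each subgroup, then take the maximum of the subgroup scores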

In [2]:
from __future__ import division
from __future__ import print_function

import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

from pyod.models.knn import KNN
from pyod.models.combination import aom, moa, average, maximization
from pyod.utils.utility import standardizer
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print



In [3]:
# Load the cardio benchmark from ./data if it is available; otherwise fall
# back to synthetic data so the example still runs end to end.
mat_file = 'cardio.mat'

try:
    mat = loadmat(os.path.join('data', mat_file))
except (TypeError, IOError):
    # Both error types get the same fallback, so they share one handler.
    print('{data_file} does not exist. Use generated data'.format(
        data_file=mat_file))
    X, y = generate_data(train_only=True)  # load data
else:
    X = mat['X']
    # The labels are required by train_test_split below; without this line
    # `y` is undefined whenever the .mat file loads successfully.
    # NOTE(review): assumes the labels are stored under the 'y' key as a
    # column vector (ODDS benchmark layout) -- confirm against the data file.
    y = mat['y'].ravel()

# Hold out 40% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

# Normalized copies of the train/test features, used to fit and score the
# detectors in the following cells.
X_train_norm, X_test_norm = standardizer(X_train, X_test)

cardio.mat does not exist. Use generated data


In [4]:
# Report the feature-matrix and label-vector shapes of each split.
for split_label, features, labels in (("Training data:", X_train, y_train),
                                      ("Test data:", X_test, y_test)):
    print(split_label, features.shape, labels.shape)

Training data: (600, 2) (600,)
Test data: (400, 2) (400,)


In [5]:
# Neighborhood sizes for the base detectors: 10, 20, ..., 200.
k_list = list(range(10, 201, 10))
n_clf = len(k_list)

# Score matrices: one row per sample, one column per base detector.
train_scores = np.zeros((X_train.shape[0], n_clf))
test_scores = np.zeros((X_test.shape[0], n_clf))

print('Initializing {n_clf} kNN detectors'.format(n_clf=n_clf))

for i, k in enumerate(k_list):
    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    # Training scores are read from the fitted model; test samples are
    # scored explicitly through decision_function.
    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)
    print('Base detector %i is fitted for prediction' % i)

Initializing 20 kNN detectors
Base detector 0 is fitted for prediction
Base detector 1 is fitted for prediction
Base detector 2 is fitted for prediction
Base detector 3 is fitted for prediction
Base detector 4 is fitted for prediction
Base detector 5 is fitted for prediction
Base detector 6 is fitted for prediction
Base detector 7 is fitted for prediction
Base detector 8 is fitted for prediction
Base detector 9 is fitted for prediction
Base detector 10 is fitted for prediction
Base detector 11 is fitted for prediction
Base detector 12 is fitted for prediction
Base detector 13 is fitted for prediction
Base detector 14 is fitted for prediction
Base detector 15 is fitted for prediction
Base detector 16 is fitted for prediction
Base detector 17 is fitted for prediction
Base detector 18 is fitted for prediction
Base detector 19 is fitted for prediction


In [6]:
# Standardize the per-detector score matrices before they are combined.
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)

for description, score_matrix in (
        ('Decision score matrix on training data', train_scores_norm),
        ('Decision score matrix on test data', test_scores_norm)):
    print(description, score_matrix.shape)

Decision score matrix on training data (600, 20)
Decision score matrix on test data (400, 20)


In [7]:
# Combine the standardized test scores with each of the four frameworks.
# The calls stay in the original order (average, maximization, AOM, MOA).
y_by_average = average(test_scores_norm)
y_by_maximization = maximization(test_scores_norm)
y_by_aom = aom(test_scores_norm, n_buckets=5)
y_by_moa = moa(test_scores_norm, n_buckets=5)

# Evaluate every combined score vector against the held-out labels.
for title, combined_scores in (
        ('Combination by Average', y_by_average),
        ('Combination by Maximization', y_by_maximization),
        ('Combination by AOM', y_by_aom),
        ('Combination by MOA', y_by_moa)):
    evaluate_print(title, y_test, combined_scores)

Combination by Average ROC:0.9906, precision @ rank n:0.9783
Combination by Maximization ROC:0.9915, precision @ rank n:0.9783
Combination by AOM ROC:0.9915, precision @ rank n:0.9783
Combination by MOA ROC:0.991, precision @ rank n:0.9783
