In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn import svm

from loglizer.models import PCA, IsolationForest, LogClustering, OneClassSVM
from loglizer import dataloader, preprocessing
from loglizer.utils import metrics

In [2]:
# Directory handed to dataloader.load_data as `data_dir`.
# (The variable name is spelled "ouput_dir" throughout; kept as-is so any
# other cell that references it keeps working.)
ouput_dir = "../output/hdfs/"

# load_data returns a (train, test) pair, each of which is an (x, y) pair.
train_split, test_split = dataloader.load_data(data_dir=ouput_dir)
x_train, y_train = train_split
x_test, y_test = test_split

# Fit the feature extractor on the training split only, then reuse the fitted
# extractor to transform the test split.
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)


  train = np.array(train).reshape(-1,1)
  test_normal = np.array(test_normal).reshape(-1,1)
  abnormal = np.array(abnormal).reshape(-1,1)


Train normal size: 4855
Train abnormal size: 2427
Test normal size: 553368
Test abnormal size: 14411
Train data shape: 7282-by-39

Test data shape: 567779-by-39



In [3]:
%%time
print("="*20 + " Model: PCA " + "="*20)
for th in np.arange(1):
    print("theshold", th)
    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)
    model.fit(x_train)
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)

theshold 0
n_components: 2
Project matrix shape: 39-by-39
SPE threshold: 1

Train validation:
Confusion Matrix: TP: 2427, FP: 2045, TN: 2810, FN: 0
Precision: 54.271%, recall: 100.000%, F1-measure: 70.358%

Test validation:
Confusion Matrix: TP: 14411, FP: 230619, TN: 322749, FN: 0
Precision: 5.881%, recall: 100.000%, F1-measure: 11.109%

CPU times: user 1.25 s, sys: 2.3 ms, total: 1.25 s
Wall time: 1.25 s


In [4]:
%%time
print("="*20 + " Model: IsolationForest " + "="*20)
model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)
model.fit(x_train)
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 1682, FP: 33, TN: 4822, FN: 745
Precision: 98.076, recall: 69.304, F1-measure: 81.217

Test validation:
Confusion Matrix: TP: 9905, FP: 3207, TN: 550161, FN: 4506
Precision: 75.541, recall: 68.732, F1-measure: 71.976

CPU times: user 7.62 s, sys: 1.25 s, total: 8.87 s
Wall time: 8.87 s


In [5]:
%%time
print("="*20 + " Model: one class SVM " + "="*20)
model = OneClassSVM(kernel='rbf')
model.fit(x_train, y_train)

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 454, FP: 4855, TN: 0, FN: 1973
Precision: 8.552, recall: 18.706, F1-measure: 11.737

Test validation:
Confusion Matrix: TP: 2784, FP: 553368, TN: 0, FN: 11627
Precision: 0.501, recall: 19.319, F1-measure: 0.976

CPU times: user 1min 17s, sys: 34.9 ms, total: 1min 17s
Wall time: 1min 17s


In [6]:
# %%time
# print("="*20 + " Model: one class SVM " + "="*20)

# nus = [0.001, 0.01, 0.1, 1]
# gammas = [0.001, 0.01, 0.1, 1]
# tuned_parameters = {'kernel' : ['rbf','poly','linear','sigmoid'], 'gamma' : gammas, 'nu': nus}

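# NOTE(review): disabled experiment. scikit-learn's OneClassSVM.predict returns
# +1 (inlier) / -1 (outlier); y_train appears to use 0 for normal samples, so
# scoring with "f1_micro" would presumably need the labels mapped first -- confirm
# before re-enabling.
# ocsvm = svm.OneClassSVM()
# model = GridSearchCV(ocsvm, tuned_parameters, cv=5, scoring="f1_micro")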

# model.fit(x_train, y_train.astype(int))

# # print('Train validation:')
# # precision, recall, f1 = model.predict(x_train, y_train.astype(int))
# # print('Test validation:')
# # precision, recall, f1 = model.predict(x_test, y_test.astype(int))

In [7]:
# print('Train validation:')
# y_eval = model.predict(x_train)
# precision, recall, f1 = metrics(y_eval, y_train)
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
    
# print('Test validation:')
# y_pred = model.predict(x_test)
# precision, recall, f1 = metrics(y_pred, y_test)
# print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))


In [8]:
%%time
print("="*20 + " Model: LogClustering " + "="*20)
max_dist = 0.3  # the threshold to stop the clustering process
anomaly_threshold = 0.3  # the threshold for anomaly detection
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
model.fit(x_train[y_train == 0, :])  # Use only normal samples for training
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)




Starting offline clustering...
Processed 1000 instances.
Found 4 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 4855 instances.
Found 4 clusters online.

Train validation:
Confusion Matrix: TP: 937, FP: 0, TN: 4855, FN: 1490
Precision: 100.000, recall: 38.607, F1-measure: 55.708

Test validation:
Confusion Matrix: TP: 5274, FP: 40, TN: 553328, FN: 9137
Precision: 99.247, recall: 36.597, F1-measure: 53.475

CPU times: user 18 s, sys: 0 ns, total: 18 s
Wall time: 18 s
