In [19]:
import sys
sys.path.append('../')
from loglizer.models import InvariantsMiner
from loglizer import dataloader, preprocessing
import collections

In [4]:
import pandas as pd
import os
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict

## Loading HDFS

In [28]:
log_file = '../data/HDFS/syslog_200000.log_structed.csv'
struct_log = pd.read_csv(log_file, engine='c',
                na_filter=False, memory_map=True)

In [29]:
eventBlk = []
eventBlks = {}
for idx, row in struct_log.iterrows():
    eventBlk.append(row['EventId'])
    if (idx+1)%10 == 0:
        eventBlks[(idx+1)//10] = eventBlk
        eventBlk = []

In [33]:
data_df = pd.DataFrame(eventBlks.items(), columns=['BlockId', 'EventSequence'])

In [35]:
def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
    if split_type == 'uniform' and y_data is not None:
        pos_idx = y_data > 0
        x_pos = x_data[pos_idx]
        y_pos = y_data[pos_idx]
        x_neg = x_data[~pos_idx]
        y_neg = y_data[~pos_idx]
        train_pos = int(train_ratio * x_pos.shape[0])
        train_neg = int(train_ratio * x_neg.shape[0])
        x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
        y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
        x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
        y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])
    elif split_type == 'sequential':
        num_train = int(train_ratio * x_data.shape[0])
        x_train = x_data[0:num_train]
        x_test = x_data[num_train:]
        if y_data is None:
            y_train = None
            y_test = None
        else:
            y_train = y_data[0:num_train]
            y_test = y_data[num_train:]
    # Random shuffle
    indexes = shuffle(np.arange(x_train.shape[0]))
    x_train = x_train[indexes]
    if y_train is not None:
        y_train = y_train[indexes]
    return (x_train, y_train), (x_test, y_test)

In [37]:
x_data = data_df['EventSequence'].values

train_ratio=0.5
split_type='sequential'
(x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type)
print('Total: {} instances, train: {} instances, test: {} instances'.format(
      x_data.shape[0], x_train.shape[0], x_test.shape[0]))

Total: 19999 instances, train: 9999 instances, test: 10000 instances


## Invariant Mining

In [38]:
# Feature extraction
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)

Train data shape: 9999-by-127



In [40]:
# Model initialization and training
epsilon = 0.5 # threshold for estimating invariant space
model = InvariantsMiner(epsilon=epsilon)
model.fit(x_train)

Invariant space dimension: 119
Mined 119 invariants: {(15, 27): [1.0, -516.0], (3, 65): [-13.0, 1.0], (12, 78): [1.0, -15.0], (5, 112): [-4.0, 1.0], (24, 30): [1.0, -15.0], (21, 97): [1.0, -6.0], (4, 36): [1.0, -1.0], (25, 102): [1.0, -1.0], (25, 99): [1.0, -1.0], (3, 21): [-19.0, 1.0], (21, 70): [1.0, -19.0], (10, 123): [1.0, -615.0], (5, 54): [-2.0, 1.0], (11, 76): [-15.0, 1.0], (24, 82): [1.0, -15.0], (15, 55): [1.0, -172.0], (25, 74): [1.0, -1.0], (24, 64): [1.0, -2.0], (12, 59): [-1.0, 1.0], (12, 64): [1.0, -2.0], (11, 12): [-15.0, 1.0], (12, 103): [1.0, -15.0], (22, 54): [-5.0, 1.0], (5, 18): [1.0, -2.0], (9, 112): [-5.0, 1.0], (24, 47): [1.0, -15.0], (14, 38): [-2.0, 1.0], (11, 52): [-15.0, 1.0], (2, 118): [1.0, -2.0], (28, 97): [1.0, -16.0], (2, 56): [-1.0, 1.0], (4, 10): [-246.0, 1.0], (7, 109): [1.0, -2.0], (21, 26): [1.0, -5.0], (12, 45): [1.0, -15.0], (10, 13): [1.0, -614.0], (12, 25): [-1.0, 1.0], (25, 93): [-1.0, 1.0], (4, 16): [-2.0, 1.0], (17, 98): [1.0, -1.0], (25, 52)

In [41]:
# Predict anomalies on the training set offline, and manually check for correctness
y_train = model.predict(x_train)

In [43]:
# Predict anomalies on the test set to simulate the online mode
# x_test may be loaded from another log file
x_test = feature_extractor.transform(x_test)
y_test = model.predict(x_test)

Test data shape: 10000-by-127



In [52]:
anomalies = []
anomalies_val = []
for i in range(len(y_train)):
    if y_train[i] != 0:
        anomalies.append(i)

for i in range(len(y_test)):
    if y_test[i] != 0:
        anomalies.append(i + len(y_train))

In [54]:
print("Total number of log entries: {} | Total number of anomalies: {}".format(
        10* (len(y_train) + len(y_test)),
        len(anomalies)
        )
    )
print("Index of blocks that contains anomaly: \n{}".format(anomalies))

Total number of log entries: 199990 | Total number of anomalies: 279
Index of blocks that contains anomaly: 
[34, 54, 94, 146, 186, 190, 195, 258, 263, 271, 311, 325, 375, 383, 463, 543, 633, 654, 710, 788, 879, 1001, 1070, 1110, 1167, 1179, 1280, 1334, 1484, 1504, 1522, 1862, 1864, 1933, 1985, 1990, 1999, 2053, 2080, 2107, 2174, 2180, 2219, 2257, 2271, 2316, 2589, 2760, 2799, 2917, 2989, 2992, 3035, 3052, 3081, 3105, 3151, 3267, 3326, 3333, 3429, 3562, 3937, 4001, 4020, 4023, 4028, 4144, 4145, 4148, 4296, 4348, 4388, 4479, 4531, 4585, 4616, 4630, 4735, 4778, 4838, 4855, 4911, 5047, 5198, 5210, 5221, 5348, 5354, 5486, 5519, 5567, 5580, 5656, 5755, 5767, 5776, 5816, 6012, 6173, 6189, 6269, 6397, 6522, 6615, 6669, 6715, 6807, 6881, 6919, 7077, 7097, 7178, 7219, 7226, 7235, 7254, 7383, 7486, 7524, 7574, 7607, 7632, 7637, 7685, 7810, 7823, 7852, 8002, 8096, 8121, 8186, 8252, 8269, 8289, 8299, 8339, 8344, 8441, 8486, 8489, 8518, 8763, 8769, 8815, 8956, 8998, 9039, 9055, 9099, 9335, 9381, 94

### Intepretation of result

Every value above in "anomalies" array is the index of block that has anomaly.

Each block consists of 10 adjacnet log entries.

Index i means there is an anomaly in log entries [i*10, (i+1)*10)