# Agglomerative tuning

## Data Loading and Preprocessing
This notebook was run on AWS SageMaker.

In [1]:
!pip install loglizer

Collecting loglizer
  Downloading loglizer-1.0-py3-none-any.whl (21 kB)
Installing collected packages: loglizer
Successfully installed loglizer-1.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/amazonei_pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.


In [2]:
import pandas as pd

import sys
sys.path.append('../')
from loglizer.models import LogClustering
from loglizer import dataloader, preprocessing

In [3]:
# Input files, given as paths relative to the notebook's working directory.
feature_path = 'HDFS_100k.log_structured.csv'  # structured HDFS log
label_path = 'anomaly_label.csv'  # anomaly labels

In [4]:
# Threshold values named after the LogClustering keyword arguments.
anomaly_threshold = 0.3  # the threshold for anomaly detection
max_dist = 0.3  # the threshold to stop the clustering process

# Aliases for the input file paths defined in the previous cell.
label_file = label_path  # The anomaly label file
struct_log = feature_path  # The structured log file

In [5]:
# Load the structured HDFS log with session windows and split it 70/30
# into train and test sets using the 'uniform' split type.
hdfs_train, hdfs_test = dataloader.load_HDFS(
    feature_path,
    label_file=label_path,
    window='session',
    train_ratio=0.7,
    split_type='uniform',
)
x_train, y_train = hdfs_train
x_test, y_test = hdfs_test

Loading HDFS_100k.log_structured.csv
219 94
Total: 7940 instances, 313 anomaly, 7627 normal
Train: 5557 instances, 219 anomaly, 5338 normal
Test: 2383 instances, 94 anomaly, 2289 normal



In [6]:
# Sanity check: the training samples and training labels should have the same length.
x_train.shape, y_train.shape

((5557,), (5557,))

In [7]:
# Inspect the first raw training sample before feature extraction.
x_train[0]

['E22',
 'E5',
 'E5',
 'E5',
 'E26',
 'E26',
 'E26',
 'E11',
 'E9',
 'E11',
 'E9',
 'E11',
 'E9']

In [8]:
# Fit the feature extractor (TF-IDF term weighting) on the training data only,
# then reuse the fitted extractor on the test data so that both splits are
# transformed in the same way.
feature_extractor = preprocessing.FeatureExtractor()
x_train_transformed = feature_extractor.fit_transform(x_train, term_weighting='tf-idf')
x_test_transformed = feature_extractor.transform(x_test)

Train data shape: 5557-by-16

Test data shape: 2383-by-16



In [9]:
# Inspect the feature vector produced for the first training sample.
x_train_transformed[0]

array([-1.79956050e-12, -5.39868150e-12,  4.78878298e-02,  4.73393336e-02,
        4.78878298e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [10]:
# Shape of the transformed test matrix.
x_test_transformed.shape

(2383, 16)

## Find optimal parameters

In [16]:
# Accumulator for one result record per hyper-parameter combination.
results_list = []

# Candidate values explored by the grid search.
anomaly_threshold_list = [0.2, 0.3, 0.4]  # the threshold for anomaly detection
max_dist_list = [0.2, 0.3, 0.4]  # the threshold to stop the clustering process

In [17]:
# Grid search: fit and evaluate one LogClustering model per
# (max_dist, anomaly_threshold) combination and record its test metrics.
#
# Start from an empty list so that re-running this cell replaces the results
# instead of appending duplicate rows to those of a previous run.
#
# NOTE(review): in the recorded run, the combinations shown in the output all
# produced the same precision/recall/F1, so this grid may be too narrow to
# discriminate between settings -- consider widening the value ranges.
results_list = []

for max_dist in max_dist_list:
    for anomaly_threshold in anomaly_threshold_list:
        model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
        # Use only normal samples (label 0) for training.
        model.fit(x_train_transformed[y_train == 0, :])
        precision, recall, f1 = model.evaluate(x_test_transformed, y_test)

        results_list.append({
            'max_dist': max_dist,
            'anomaly_threshold': anomaly_threshold,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        })

Starting offline clustering...
Processed 1000 instances.
Found 2 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 5338 instances.
Found 3 clusters online.

Precision: 0.950, recall: 0.606, F1-measure: 0.740

Starting offline clustering...
Processed 1000 instances.
Found 2 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 5338 instances.
Found 3 clusters online.

Precision: 0.950, recall: 0.606, F1-measure: 0.740

Starting offline clustering...
Processed 1000 instances.
Found 2 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 5338 instances.
Found 3 clusters online.

Precision: 0.950, recall: 0.606, F1-measure: 0.740

Starting offline clustering...
Processed 1000 instances.
Found 2 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 4000 instances.
Processed 5338 instances

In [18]:
# Display the metrics collected for each hyper-parameter combination.
results_list

[{'max_dist': 0.2,
  'anomaly_threshold': 0.2,
  'precision': 0.95,
  'recall': 0.6063829787234043,
  'f1': 0.7402597402597403},
 {'max_dist': 0.2,
  'anomaly_threshold': 0.3,
  'precision': 0.95,
  'recall': 0.6063829787234043,
  'f1': 0.7402597402597403},
 {'max_dist': 0.2,
  'anomaly_threshold': 0.4,
  'precision': 0.95,
  'recall': 0.6063829787234043,
  'f1': 0.7402597402597403},
 {'max_dist': 0.3,
  'anomaly_threshold': 0.2,
  'precision': 0.95,
  'recall': 0.6063829787234043,
  'f1': 0.7402597402597403},
 {'max_dist': 0.3,
  'anomaly_threshold': 0.3,
  'precision': 0.95,
  'recall': 0.6063829787234043,
  'f1': 0.7402597402597403},
 {'max_dist': 0.3,
  'anomaly_threshold': 0.4,
  'precision': 0.95,
  'recall': 0.6063829787234043,
  'f1': 0.7402597402597403},
 {'max_dist': 0.4,
  'anomaly_threshold': 0.2,
  'precision': 0.95,
  'recall': 0.6063829787234043,
  'f1': 0.7402597402597403},
 {'max_dist': 0.4,
  'anomaly_threshold': 0.3,
  'precision': 0.95,
  'recall': 0.606382978723404