In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from torchvision import transforms
import torch
from torch.utils.data import DataLoader, SequentialSampler, WeightedRandomSampler, Dataset
import copy

from utils import *
from tokenization import *
from model import *

In [2]:
path = 'data/{}.csv.gz'
file = 'spirit2_min'
fault_window = 5000

# Preprocess data generate Fault Windows 

In [3]:
# Preprocess data: Generate fault time windows
df = pd.read_csv(path.format(file), nrows=500000)
    
# set datetime and label data range
df['Label'] = np.where(df['t'].values=='-',0,1)
df['datetime'] = pd.to_datetime(df['Timestamp'].astype(int), unit='s')
df = label_data(df, fault_window)
    
# print properties
print(df['Localize_Label'].value_counts())
print(df['Label'].value_counts())
    
# tokenize content
tokenizer_input = df['Content']
tokenizer_filter = r'(\[.+?\]|\(0x.+?\)|\([0-9]\))|:|,|\s+|=|\.|\||\/|\{|\}'
tokenizer = RegexTokenizer(tokenizer_filter, trunc_num=True)
tokenizer.fit(tokenizer_input)
df['tokenized'] = tokenizer.tokenized
    
# save csv
file_preprocessed = file + '_' + str(fault_window)
df.to_csv(path.format(file_preprocessed),compression="gzip")

100%|██████████| 39928/39928 [01:21<00:00, 489.11it/s]
  0%|          | 1331/500000 [00:00<00:37, 13289.67it/s]

1    365444
0    134556
Name: Localize_Label, dtype: int64
0    348620
1    151380
Name: Label, dtype: int64


100%|██████████| 500000/500000 [00:40<00:00, 12440.72it/s]


# Train LogLAB

In [4]:
pad_lengths = {'tbird2_min':20, 'spirit2_min':16, 'BGL.log_min':12}
transform_to_tensor = transforms.Lambda(lambda lst: torch.tensor(lst))

device = 'cuda:0'
lr = 0.0001
batch_size = 128
epochs = 5

In [5]:
pad_len = pad_lengths[file]   
file_preprocessed = file + '_' + str(fault_window)
print(file_preprocessed)
    
df = pd.read_csv(path.format(file_preprocessed), converters={"tokenized": literal_eval})
print(len(df))
    
train_x = df['tokenized']
train_y = df['Localize_Label']
dataset = TextTokenizationDataset(train_x, pad_len, labels=train_y, transforms=transform_to_tensor)
    
# count classes
class_counts = [np.count_nonzero(train_y == 0),np.count_nonzero(train_y == 1)]
weights = 1.0 / (torch.Tensor(class_counts))
weights = [weights[int(v)] for v in train_y]
sampler = WeightedRandomSampler(weights, num_samples=len(df), replacement=False)
data_loader_train = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    
print('make model')
# make model
vocab_size = get_vocab_size(df['tokenized'])
model = make_model(
    src_vocab=vocab_size,
    tgt_vocab=128,
    n=2,
    d_model=256,
    d_ff=256, 
    dropout=0.1,
    max_len=dataset.max_token_len
)
    
print('train model')
    
# train model
model_opt = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=0.00005)
numerator = (class_counts[0]/(class_counts[0]+class_counts[1]))
numerator = numerator * numerator
print(numerator)

loss_compute = SimpleLossCompute(model.generator, model_opt, is_test=False, numerator=numerator)
train_model(model, data_loader_train, loss_compute, device, epochs=epochs, trained_condition=0.01)
    
# save model
model_name = file_preprocessed + '_' + str(pad_len)
save_model(model, name=model_name)   

spirit2_min_5000
500000


 31%|███       | 154553/500000 [00:00<00:00, 1545527.79it/s]

make model


100%|██████████| 500000/500000 [00:00<00:00, 1540804.48it/s]


train model
0.07242126854400001
Running epoch 1 / 5
Epoch Step: 0 / 3907 Loss: 33020.378906
Epoch Step: 1000 / 3907 Loss: 21017546.607422
Epoch Step: 2000 / 3907 Loss: 30754657.709961
Epoch Step: 3000 / 3907 Loss: 34719310.249634
loss: 9057.126889458888 cond: 0.01
Running epoch 2 / 5
Epoch Step: 0 / 3907 Loss: 7699.666016
Epoch Step: 1000 / 3907 Loss: 4759599.335205
Epoch Step: 2000 / 3907 Loss: 5800008.798828
Epoch Step: 3000 / 3907 Loss: 6065384.238083
loss: 1562.5648838079835 cond: 0.01
Running epoch 3 / 5
Epoch Step: 0 / 3907 Loss: 478.182251
Epoch Step: 1000 / 3907 Loss: 209748.516571
Epoch Step: 2000 / 3907 Loss: 266660.446375
Epoch Step: 3000 / 3907 Loss: 283774.354085
loss: 73.26288064267884 cond: 0.01
Running epoch 4 / 5
Epoch Step: 0 / 3907 Loss: 28.641010
Epoch Step: 1000 / 3907 Loss: 12773.170361
Epoch Step: 2000 / 3907 Loss: 15797.719233
Epoch Step: 3000 / 3907 Loss: 16560.905820
loss: 4.270569631660613 cond: 0.01
Running epoch 5 / 5
Epoch Step: 0 / 3907 Loss: 1.395911
Epo

# Test LogLAB

In [6]:
model_name = file_preprocessed + '_' + str(pad_len)
print(model_name)

df = pd.read_csv(path.format(file_preprocessed), converters={"tokenized": literal_eval})
model = load_model(name=model_name)
    
test_x = df['tokenized']
test_y = df['Label']    
    
transform_to_tensor = transforms.Lambda(lambda lst: torch.tensor(lst))
dataset_test = TextTokenizationDataset(test_x, pad_len, labels=test_y, transforms=transform_to_tensor)
sampler_test = SequentialSampler(dataset_test)
data_loader_test = DataLoader(dataset_test, sampler=sampler_test, batch_size=2048)

model.to(device)
model.eval()
loss_compute = SimpleLossCompute(model.generator, None, is_test=True)
distances = run_classification_test(data_loader_test, model, loss_compute, device=device, step_size=100)
df['distances'] = distances

spirit2_min_5000_16
Epoch Step: 0 / 245 Loss: 0.063675
Epoch Step: 100 / 245 Loss: 7.381588
Epoch Step: 200 / 245 Loss: 7.802961


In [7]:
a = df[df['Label']==1]
n = df[df['Label']==0]

th = (n['distances'].mean() + a['distances'].mean()) / 2


print('Anomalies: {}'.format(len(a)))
print('Abnormal mean: {}'.format(a['distances'].mean()))
print('Normal mean: {}'.format(n['distances'].mean()))
print('Threshold: {}'.format(th))
print('')
    
    
pred_a = df[df['distances'] > th]
pred_n = df[df['distances'] <= th]
vc_a = pred_a['Label'].value_counts()
vc_n = pred_n['Label'].value_counts()
print('True positives and false positives: ')
print(vc_a)
print('True neagtives and false negatives: ')
print(vc_n)
    

df['Pred_Label'] = [int(x > th) for x in df['distances']]
y_test = df['Label']
predicted = df['Pred_Label']
print('F1-Score: {}'.format(metrics.f1_score(y_test, predicted)))
print('Precision: {}'.format(metrics.precision_score(y_test, predicted)))
print('Recall: {}'.format(metrics.recall_score(y_test, predicted)))
print('Accuracy: {}'.format(metrics.accuracy_score(y_test, predicted)))

Anomalies: 151380
Abnormal mean: 111.73679694371835
Normal mean: 3.3296707541189905
Threshold: 57.53323384891867

True positives and false positives: 
1    151244
Name: Label, dtype: int64
True neagtives and false negatives: 
0    348620
1       136
Name: Label, dtype: int64
F1-Score: 0.9995505974410489
Precision: 1.0
Recall: 0.9991015986259744
Accuracy: 0.999728
