# Train Test Transformer based log outlier detection framework


*   Only suitable for log data in which neighboring log entries are logically related
    - suitable for: HDFS, Hadoop, OpenStack
    - not suitable for: BGL, Thunderbird (no obvious relationship)


*   The same network architecture is used for every log system, but with a different embedding size. A separate network object is trained for each log system.










# Preparing


*   Fix a bug in the PyTorch transformer framework
*   Connect to Gdrive



In [None]:
# Mount Google Drive so that files under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Overwrite the installed torch/nn/functional.py with the copy kept on Drive
# (the "fix bug of pytorch transformer framework" step listed above).
!cp /content/drive/MyDrive/openstack/functional.py /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py
# After the first copy, restart the notebook runtime
# (presumably so the replaced file is the one that gets imported).


# Import LightningModule; if pytorch-lightning is missing, install it and retry.
try:
    from pytorch_lightning.core.lightning import LightningModule
except ModuleNotFoundError:
    !pip install pytorch-lightning
    from pytorch_lightning.core.lightning import LightningModule

Mounted at /content/drive
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.4.2-py3-none-any.whl (916 kB)
[K     |████████████████████████████████| 916 kB 12.7 MB/s 
Collecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting torchmetrics>=0.4.0
  Downloading torchmetrics-0.5.0-py3-none-any.whl (272 kB)
[K     |████████████████████████████████| 272 kB 67.9 MB/s 
[?25hCollecting PyYAML>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 61.6 MB/s 
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 35.4 MB/s 
Collecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 65.7 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)
[K     |█████████████████████

In [None]:
# from torch.nn import Transformer
# float 16
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning import LightningDataModule
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from typing import Optional
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
import sklearn

class LogOutlier(LightningModule):
    """Transformer encoder that predicts the token hidden behind a mask value.

    Token ids are embedded, combined with positional encodings, passed through
    a stack of attention-returning encoder layers and projected to ``ntoken``
    logits by a linear decoder.

    Args:
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding / model width.
        nhead: number of attention heads per encoder layer.
        nhid: width of the feed-forward part of each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout rate used by the positional encoding and the layers.
        padding_value: token id treated as padding in ``training_step``.
        target_value: token id that marks the position to predict.
        lr: learning rate handed to Adam.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.2,
                 padding_value=512, target_value=0, lr=1e-3):
        super().__init__()
        self.lr = lr
        self.ninp = ninp
        self.padding_value = padding_value
        self.target_value = target_value

        # Token embedding followed by positional encoding.
        self.encoder = Embeddings(ninp, ntoken)
        self.pos_encoder = PositionalEncoding(ninp, dropout)

        # Encoder stack whose layers also hand back their attention weights.
        layer_template = TransformerEncoderLayerAttention(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoderAttention(layer_template, nlayers)

        # Projection from the model width back to vocabulary logits.
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def init_weights(self):
        """Zero the decoder bias and draw its weights uniformly from [-0.1, 0.1]."""
        bound = 0.1
        decoder = self.decoder
        decoder.bias.data.zero_()
        decoder.weight.data.uniform_(-bound, bound)

    def forward(self, src, key_padding_mask, att_mask=None, require_attention=False, result_mask=None):
        """Return decoder logits for the positions selected by ``result_mask``.

        Args:
            src: token ids; transposed before embedding because the positional
                encoder and the transformer expect the sequence dimension
                first (presumably ``src`` is (batch, seq_len) - confirm with
                callers).
            key_padding_mask: forwarded as ``src_key_padding_mask``.
            att_mask: attention mask shared by every sequence in the batch.
            require_attention: when true, also return the per-layer attention
                list.
            result_mask: index applied to the encoder output after it has been
                permuted back to batch-first order.

        Returns:
            The logits, or ``(logits, attention_list)`` if
            ``require_attention`` is set.
        """
        embedded = self.pos_encoder(self.encoder(src.t()))
        encoded, attention_list = self.transformer_encoder(
            embedded,
            src_key_padding_mask=key_padding_mask,
            mask=att_mask,
        )

        # Back to batch-first, then keep only the requested positions.
        selected = encoded.permute(1, 0, 2)[result_mask]
        logits = self.decoder(selected)

        if not require_attention:
            return logits
        return logits, attention_list

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss on the masked positions of a batch."""
        tokens, targets = batch
        # Both masks are derived from the input, so they live on its device.
        padding_positions = tokens == self.padding_value
        masked_positions = tokens == self.target_value
        logits = self(tokens, key_padding_mask=padding_positions, result_mask=masked_positions)
        loss = F.cross_entropy(logits, targets)
        self.log('my_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Return Adam plus a StepLR schedule (step_size=30, gamma=0.5)."""
        adam = Adam(self.parameters(), lr=self.lr)
        return {
            'optimizer': adam,
            'lr_scheduler': StepLR(adam, step_size=30, gamma=0.5),
        }


#######################################################
#######################################################

class Embeddings(nn.Module):
    """Token embedding table whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: width of each embedding vector.
        vocab: number of rows in the embedding table.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        self.lut = nn.Embedding(vocab, d_model)
        # Replace the default initialisation with a small uniform range.
        with torch.no_grad():
            self.lut.weight.uniform_(-0.1, 0.1)

    def forward(self, x):
        """Look up the ids in ``x`` and multiply the vectors by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first input.

    The table is precomputed for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` with shape ``(max_len, 1, d_model)``. A larger
    ``max_len`` does not change the encoding of earlier positions: the first
    30 rows are the same whether the table holds 30 or 3000 positions.

    Args:
        d_model: width of the vectors being encoded; odd values are supported.
        dropout: dropout rate applied after the encoding is added.
        max_len: number of positions to precompute; inputs must not be longer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=3000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so the surplus frequency is dropped for the cosine part.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model): broadcasts over the batch.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for the first ``x.size(0)`` positions, then apply dropout.

        ``x`` must have the sequence length as its first dimension.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

from torch.nn import TransformerEncoder, TransformerEncoderLayer

class TransformerEncoderLayerAttention(TransformerEncoderLayer):
    """Encoder layer that returns its self-attention weights with its output."""

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None):
        r"""Run self-attention and the feed-forward sub-layer on ``src``.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Returns:
            A tuple ``(output, attention_weight)``; the second item is the
            weight tensor returned by ``self.self_attn``.
        """
        attended, attention_weight = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )
        # Residual connection followed by normalisation, once per sub-layer.
        hidden = self.norm1(src + self.dropout1(attended))
        projected = self.linear2(self.dropout(self.activation(self.linear1(hidden))))
        hidden = self.norm2(hidden + self.dropout2(projected))
        return hidden, attention_weight

class TransformerEncoderAttention(TransformerEncoder):
    """Encoder stack that collects the attention weights of every layer."""

    def forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None):
        r"""Feed ``src`` through all layers in order.

        Args:
            src: the sequence to the encoder (required).
            mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Returns:
            A tuple ``(output, attention_list)``. ``attention_list`` holds one
            attention tensor per layer, each moved to the CPU; ``output`` has
            ``self.norm`` applied when a norm module is configured.
        """
        cpu = torch.device('cpu')
        hidden = src
        attention_list = []

        for layer in self.layers:
            hidden, layer_attention = layer(
                hidden, src_mask=mask, src_key_padding_mask=src_key_padding_mask
            )
            # Only the CPU copy of the weights is kept in the list.
            attention_list.append(layer_attention.to(cpu))

        if self.norm is None:
            return hidden, attention_list
        return self.norm(hidden), attention_list

# Define Network

# Define Dataset Class

In [None]:
import json
from torch.utils.data import Dataset, DataLoader
class sequence_generator(Dataset):
    """Masked-token dataset built from per-instance template id sequences.

    Log lines are grouped by instance id into ordered lists of template ids.
    ``update_Dataset`` turns each position of each sequence into one sample:
    the sequence with that position replaced by ``target_mask`` and the
    original token as the target.

    Args:
        target_mask: token id written at the position to be predicted.
        padding_number: token id used to right-pad samples in ``__getitem__``.
        device: device for the returned tensors. NOTE: CUDA is used whenever
            it is available, regardless of this argument.
        start_token: token id prepended when ``need_start_end`` is set.
        end_token: token id appended when ``need_start_end`` is set.
        need_start_end: wrap every sequence in start/end tokens in ``list_add``.
        horizon: falsy to use whole sequences; otherwise each sample is the
            slice ``[i - horizon, i + horizon)`` around the masked position
            ``i`` and ``max_length`` becomes ``2 * horizon + 1``.
    """

    def __init__(self,target_mask = 0, padding_number = 100, device = torch.device('cpu'), start_token = 62, end_token = 63, need_start_end = False, horizon = False):
        if torch.cuda.is_available():
            device = torch.device('cuda')
        self.tokens = set()      # every template id seen so far
        self.sequence = {}       # instance_id -> list of template ids
        self.max_length = 0      # length every sample is padded to
        self.target_mask = target_mask
        self.padding_number = padding_number

        self.seq_data = []       # (masked sequence, target) samples
        self.device = device

        self.start_token = start_token
        self.end_token = end_token
        self.need_start_end = need_start_end

        self.horizon = horizon

        # Instance ids whose stored sequence already carries start/end tokens.
        self._wrapped = set()

    def _append(self, instance_id, template_id):
        """Append one template id to an instance, keeping an end token last."""
        seq = self.sequence.setdefault(instance_id, [])
        if instance_id in self._wrapped:
            # The sequence ends with end_token; the new entry goes before it.
            seq.insert(len(seq) - 1, template_id)
        else:
            seq.append(template_id)
        self.max_length = max(self.max_length, len(seq))

    def single_add(self,instance_id,template_id):
        """Record one log line; ``update_Dataset`` is not called."""
        self.tokens.add(template_id)
        self._append(instance_id, template_id)

    def list_add(self,instance_id,template_id):
        """Record many log lines at once and rebuild the samples.

        ``instance_id`` and ``template_id`` are parallel sequences of equal
        length. The method may be called repeatedly: sequences that already
        carry start/end tokens are extended in place and never wrapped twice.

        Raises:
            ValueError: if the two inputs differ in length.
        """
        if len(instance_id) != len(template_id):
            raise ValueError('instance_id and template_id must have the same length')

        self.tokens.update(template_id)
        # zip pairs the values by position, so a pandas Series works whatever
        # its index labels are.
        for inst, tmpl in zip(instance_id, template_id):
            self._append(inst, tmpl)

        if self.need_start_end:
            for key, seq in self.sequence.items():
                if key in self._wrapped:
                    continue
                self.sequence[key] = [self.start_token] + seq + [self.end_token]
                self._wrapped.add(key)
                self.max_length = max(self.max_length, len(seq) + 2)

        self.update_Dataset()

    def update_Dataset(self,seqs = None ):
        """Rebuild ``seq_data`` from ``seqs`` (default: all stored sequences)."""
        if self.horizon:
            self.max_length = self.horizon * 2 + 1

        if seqs is None:
            seqs = self.sequence.values()

        self.seq_data = []
        for seq in seqs:
            seq = [int(x) for x in seq]
            for i, tmp_target in enumerate(seq):
                if self.horizon:
                    # Slice the window first so only it is copied, not the
                    # whole sequence, for every position.
                    start = max(0, i - self.horizon)
                    tmp_seq = seq[start: i + self.horizon]
                    tmp_seq[i - start] = self.target_mask
                else:
                    tmp_seq = seq.copy()
                    tmp_seq[i] = self.target_mask
                self.seq_data.append((tmp_seq, tmp_target))

    def remove_duplicate(self):
        """Drop repeated samples, keeping the first occurrence of each.

        Samples are compared through their JSON text. First-seen order is
        kept so the resulting dataset order is the same on every run.
        """
        unique = dict.fromkeys(json.dumps(item) for item in self.seq_data)
        self.seq_data = [tuple(json.loads(text)) for text in unique]

    def padding(self,seq):
        """Right-pad ``seq`` with ``padding_number`` up to ``max_length``."""
        return seq+[self.padding_number]*(self.max_length-len(seq))

    # Dataset special methods

    def __len__(self):
        return len(self.seq_data)

    def __getitem__(self,idx):
        """Return ``(padded sequence, target)`` as long tensors on ``self.device``."""
        masked, target = self.seq_data[idx][0], self.seq_data[idx][1]
        # Tensors are created on demand; the stored samples stay plain lists.
        seq = torch.tensor(self.padding(masked), dtype=torch.long, device=self.device)
        target = torch.tensor(target, dtype=torch.long, device=self.device)
        return seq, target

# Define Train Function

# Define Outlier score calculation function

*   The input is a list of template ids.

*   max_length does not have to equal the value used for training.
*   The padding and target values must be the same as in the training data.





In [None]:
def outlier_detection_single(seq,model,max_length,padding_number,target_mask,top = 10,inspect = False,max_size = 64,horizon = None):
    """Predict every position of one sequence from its masked context.

    For each position a copy of the sequence is made with that position set
    to ``target_mask``. The copies are sent through ``model`` in batches of
    at most ``max_size`` rows.

    Args:
        seq: template ids of one sequence (converted with ``int``).
        model: called as ``model(X, key_padding_mask=..., result_mask=...)``;
            its parameters decide the device of the input tensor.
        max_length: not used by this function.
        padding_number: padding token id; also defines the key padding mask.
        target_mask: token id written at the position to predict.
        top: number of highest-scoring token ids kept per position.
        inspect: not used by this function.
        max_size: maximum number of rows per forward pass.
        horizon: when truthy, each row is cut to ``[i - horizon, i + horizon)``
            and padded to ``2 * horizon + 1`` tokens.

    Returns:
        ``(all_predict, target)``: the top-``top`` predicted ids per position
        and the true ids, both in sequence order.
    """
    token_ids = [int(tok) for tok in seq]

    # One masked row per position.
    pending = []
    target = []
    for pos, true_id in enumerate(token_ids):
        row = list(token_ids)
        row[pos] = target_mask
        if horizon:
            window = row[max(0, pos - horizon): pos + horizon]
            row = padding(window, horizon * 2 + 1, padding_number)
        pending.append(row)
        target.append(true_id)

    # Batched prediction on the model's own device.
    all_predict = []
    device = next(model.parameters()).device
    while len(pending) > 0:
        batch, pending = pending[:max_size], pending[max_size:]
        X = torch.tensor(batch, device=device)
        with torch.no_grad():
            model.eval()
            logits = model(
                X,
                key_padding_mask=(X == padding_number),
                result_mask=(X == target_mask),
            )
            best = torch.topk(logits, top, dim=1).indices
            all_predict.extend(best.tolist())

    return all_predict, target

def padding(seq,max_length,padding_number):
    """Return ``seq`` right-padded with ``padding_number`` to ``max_length`` items.

    A sequence that is already ``max_length`` long or longer comes back as an
    unchanged copy.
    """
    missing = max_length - len(seq)
    filler = [padding_number] * missing
    return seq + filler

In [None]:
def outlier_score_calculation(all_predict, target ,top=1,relative = False, false_position = False):
    """Count the positions whose true token is missing from the top predictions.

    A position counts as a miss when ``target[i]`` is not among the first
    ``top + 1`` entries of ``all_predict[i]``.

    Args:
        all_predict: per-position lists of predicted ids, best first.
        target: true id for every position.
        top: the candidate window is ``all_predict[i][:top + 1]``.
        relative: return the miss ratio ``misses / len(target)``; this takes
            precedence over ``false_position``.
        false_position: additionally return the indices of the misses.

    Returns:
        The miss ratio, the miss count, or ``(miss count, miss indices)``.
    """
    window = top + 1
    missed = [
        idx for idx, truth in enumerate(target)
        if truth not in all_predict[idx][:window]
    ]
    score = len(missed)
    torch.cuda.empty_cache()

    if relative:
        return score/len(target)
    if false_position:
        return score, missed
    return score

# load data
**hadoop** padding 264 target 0 start 265 end 266 token totally 267

In [None]:
# Load the Hadoop log table and the application labels from Google Drive.
import re
import pandas as pd

hadoop = pd.read_csv('/content/drive/MyDrive/hadoop/hadoop.csv')
# The application id is the part of Label in front of the '#@#' separator.
hadoop['app_id'] = hadoop.Label.str.split('#@#').str[0]

# Keep only the log groups (Label values) that contain the MRAppMaster
# creation message. The opposite filter, dropping those groups, was tried
# earlier and is not active.
master_nodes = list(
    hadoop[hadoop.Content.str.contains('Created MRAppMaster for application')].Label.unique()
)
hadoop = hadoop[hadoop.Label.isin(master_nodes)].reset_index(drop = True)

# One entry per line of the label file: the application id when the line
# contains one, otherwise the raw line.
label = []
with open('/content/drive/MyDrive/hadoop/abnormal_label.txt','r') as data:
    for line in data:
        found = re.search('application_[a-zA-Z0-9_]+',line)
        label.append(found.group() if found else line)

# Normal applications are picked by their position in the label file.
label_normal = [
    name for name in label[4:7] + label[38:46]
    if re.search(r'application_[0-9_]*', name)
]

# Every other application id counts as abnormal.
label_abnormal = [
    name for name in label
    if name not in label_normal and re.search(r'application_[0-9_]*', name)
]

In [None]:
# Number of entries in label_abnormal.
len(label_abnormal)

44

In [None]:
# Size of the test split. final_test is assigned in the train/val split cell
# further down, so that cell has to be run first.
len(final_test)

49

In [None]:
# Train / validation split, chosen by position in `label`.
# (A smaller training set, label[42:44], was also tried.)
train_set = label[4:6] + label[42:46]

val_set = [label[6], label[39]]

# Test on every abnormal application plus the normal ones not used for
# training. NOTE(review): the validation applications are not excluded here.
final_test = label_abnormal + [app for app in label_normal if app not in train_set]

In [None]:
# Select the rows of each split by application id.
train_df, val_df, test_df = (
    hadoop[hadoop.app_id.isin(app_ids)].reset_index(drop = True)
    for app_ids in (train_set, val_set, final_test)
)
# Number of distinct template ids and the smallest one.
print(len(hadoop.Template_id.unique()))
print(min(hadoop.Template_id.unique()))
#padding 263+1

191
1


In [None]:
horizon = 20 #30

# Three generators with identical settings, one per split.
train_sg, val_sg, test_sg = (
    sequence_generator(
        target_mask = 0,
        padding_number = 264,
        start_token = 265,
        end_token = 266,
        need_start_end = True,
        horizon = horizon,
    )
    for _ in range(3)
)

# Generate the sub-sequence datasets; sequences are grouped by the Label column.
train_sg.list_add(train_df['Label'], train_df['Template_id'])
val_sg.list_add(val_df['Label'], val_df['Template_id'])
test_sg.list_add(test_df['Label'], test_df['Template_id'])

# list_add already ends with update_Dataset(); these calls rebuild the samples once more.
train_sg.update_Dataset()
val_sg.update_Dataset()
test_sg.update_Dataset()

In [None]:
from pytorch_lightning import Trainer, seed_everything

# Fix the random seed before the model weights are created.
seed_everything(66, workers=True)
model = LogOutlier(
    ntoken = 267,
    ninp = 64,
    nhead = 2,
    nhid = 64,
    nlayers = 4,
    dropout = 0.2,
    padding_value = 264,
    target_value = 0,
)
# To reuse saved weights instead of training:
#PATH = '/content/drive/MyDrive/hadoop'+'loss=0.163'+'(ntoken = 267, ninp = 64, nhead = 2, nhid = 64 , nlayers = 4, dropout=0.2, padding_value= 264, target_value= 0)'+'.h5'
#model.load_state_dict(torch.load(PATH))
#model.eval()

Global seed set to 66


In [None]:
# Batches of 256 samples, served in dataset order (no shuffling is requested).
train_dataloader = DataLoader(train_sg, batch_size=256)

In [None]:
from pytorch_lightning import Trainer

# 16-bit precision on one GPU, 60 epochs, deterministic mode requested.
trainer = Trainer(
    precision=16,
    gpus=1,
    max_epochs = 60,
    deterministic=True,
)

trainer.fit(model, train_dataloader)

Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                | Type                        | Params
--------------------------------------------------------------------
0 | encoder             | Embeddings                  | 17.1 K
1 | pos_encoder         | PositionalEncoding          | 0     
2 | transformer_encoder | TransformerEncoderAttention | 100 K 
3 | decoder             | Linear                      | 17.4 K
--------------------------------------------------------------------
135 K     Trainable params
0         Non-trainable params
135 K     Total params
0.541     Total estimated model params size (MB)
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

# Evaluate

In [None]:
# Instance keys and their sequences, in matching order.
test = list(test_sg.sequence.values())
keys = list(test_sg.sequence.keys())

from tqdm.notebook import tqdm
model.to(torch.device('cuda'))

# Top-10 predictions for every position of every test sequence.
hdfs_output = []
for i in tqdm(test):
    hdfs_output.append(
        outlier_detection_single(
            i, model,
            max_length=41,
            padding_number=264,
            target_mask=0,
            top=10,
            horizon=horizon,
        )
    ) #2 is best for hdfs



  0%|          | 0/62 [00:00<?, ?it/s]

In [None]:
def with_different_threshold(hdfs_output,threshold = 1,need_plot = False,false_position = True):
    """Score every sequence and report the best precision / recall / F1.

    Each ``(all_predict, target)`` pair in ``hdfs_output`` is scored with
    ``outlier_score_calculation(top=threshold)``. The scores are compared
    against the module-level ``ground_truth`` list with a precision-recall
    curve.

    Args:
        hdfs_output: iterable of ``(all_predict, target)`` pairs.
        threshold: passed on as ``top`` to ``outlier_score_calculation``.
        need_plot: draw the precision-recall curve when true.
        false_position: also collect the miss positions of every sequence.

    Returns:
        ``((threshold, precision, recall, thresholds, ground_truth,
        hdfs_predict, position_list), f1)`` where ``f1`` is the best F1 found
        on the curve. The matching precision, recall and F1 are printed.
    """
    hdfs_predict = []
    position_list = []

    # ---- prediction with model ------------------------------------------
    for all_predict, target in tqdm(hdfs_output):
        result = outlier_score_calculation(
            all_predict, target,
            top=threshold, relative=False, false_position=false_position,
        )
        if false_position:
            # the miss positions inside the log sequence are wanted as well
            score, position_ = result
            hdfs_predict.append(score)
            position_list.append(position_)
        else:
            hdfs_predict.append(result)

    # ---- evaluation and plot --------------------------------------------
    from sklearn.metrics import precision_recall_curve
    precision, recall, thresholds = precision_recall_curve(ground_truth, hdfs_predict)
    if need_plot:
        import matplotlib.pyplot as plt
        import seaborn as sns
        plt.figure(figsize=(10,10))
        ax = sns.lineplot(x = recall, y= precision)
        ax.set(xlabel='recall', ylabel='precision')
        ax.set(ylim=(0,1))
        ax.set(xlim=(0,1))
        plt.show()

    # ---- best precision, recall, f1 score -------------------------------
    f1, p, r = 0, 0, 0
    for prec, rec in zip(precision, recall):
        candidate = 2 * prec * rec/(prec+rec)
        if candidate > f1:
            f1, p, r = candidate, prec, rec
    print(p,r,f1)

    details = (threshold, precision, recall, thresholds, ground_truth, hdfs_predict, position_list)
    return details, f1

In [None]:
# Reduce every key to the application id in front of the '#@#' separator.
keys = [full_key.partition('#@#')[0] for full_key in keys]

In [None]:
# 1 for keys listed in label_abnormal, 0 for all others.
ground_truth = []
for i in keys:
    ground_truth.append(1 if i in label_abnormal else 0)

In [None]:
# Score all test sequences with top=3 and print the best precision / recall / F1.
out = with_different_threshold(hdfs_output, threshold=3, need_plot=False)

  0%|          | 0/62 [00:00<?, ?it/s]

0.95 1.0 0.9743589743589743


In [None]:
# Turn the per-sequence scores (out[0][-2]) into 0/1 predictions.
# NOTE(review): the scores are miss counts, so `> -1` holds for every
# sequence and all of them are predicted as 1 - confirm the intended cut-off.
p = []
for i in out[0][-2]:
    p.append(1 if i > -1 else 0)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Compare the 0/1 predictions `p` with ground_truth using sklearn's
# precision_recall_fscore_support.
precision_recall_fscore_support(ground_truth,p)

  _warn_prf(average, modifier, msg_start, len(result))


(array([0.        , 0.91935484]),
 array([0., 1.]),
 array([0.        , 0.95798319]),
 array([ 5, 57]))

In [None]:
# Per-sequence outlier scores: hdfs_predict, the second-to-last item of the
# tuple returned by with_different_threshold.
out[0][-2]

[32,
 1,
 3,
 0,
 445,
 40,
 4,
 32,
 10,
 15,
 2,
 13,
 102,
 9,
 20,
 23,
 2,
 2,
 10,
 215,
 184,
 6,
 5475,
 9,
 3332,
 12,
 29,
 4,
 1,
 58,
 78,
 666,
 10,
 10,
 14,
 8,
 7,
 60,
 43,
 46,
 2,
 10,
 1,
 22,
 48,
 8,
 14,
 2,
 16,
 30,
 3,
 0,
 133,
 74,
 2,
 4,
 26,
 28,
 15,
 10,
 3352,
 11]