In [1]:
# Mount Google Drive into the Colab filesystem; the CSV paths configured in the
# next cell live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Input files: both CSVs share one directory and one dataset prefix.
csv_dir = "/content/drive/MyDrive/Colab Notebooks/Drain_result/"
dataset_prefix = "thunderbird_10M."
csv_structured = "".join((csv_dir, dataset_prefix, "log_ident_structured.csv"))
csv_template = "".join((csv_dir, dataset_prefix, "log_ident_templates.csv"))

# Fraction of rows handed to the test split (used as `test_size` below).
test_portion = 0.6

In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer

import os
from tqdm.auto import tqdm
tqdm.pandas()

import re
from sklearn.model_selection import train_test_split

# Fixed seed, reused as `random_state` for the train/test split below.
# Only NumPy's global RNG is seeded in this cell.
seed = 42
np.random.seed(seed)

In [4]:
def define_dtype(df, dtype_map, copy=False):
  """Cast, parse, or drop DataFrame columns according to ``dtype_map``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose columns are converted.
  dtype_map : dict
      Maps a column name to the conversion to apply:

      * ``"datetime_format_<fmt>"`` -- parse with
        ``pd.to_datetime(..., format="<fmt>")``;
      * ``"_drop_"`` -- remove the column;
      * anything else (a dtype string, a NumPy dtype/type, a pandas
        extension dtype) -- passed to ``Series.astype``.

      Names that are not columns of ``df`` are skipped.
  copy : bool, default False
      When True the conversions are applied to a copy and ``df`` is left
      untouched. When False, converted columns are assigned onto ``df``.

  Returns
  -------
  pandas.DataFrame
      The converted frame.
  """
  datetime_prefix = "datetime_format_"
  if copy:
    df = df.copy()

  for col, dtype in dtype_map.items():
    if col not in df.columns:
      continue
    # Only strings can carry the two special markers; dtype objects such as
    # np.int64 have no .startswith and go straight to astype().
    is_marker = isinstance(dtype, str)
    if is_marker and dtype.startswith(datetime_prefix):
      # Strip the prefix instead of splitting, so a format that itself
      # contains "_format_" is kept intact.
      fmt = dtype[len(datetime_prefix):]
      df[col] = pd.to_datetime(df[col], format=fmt)
    elif is_marker and dtype == "_drop_":
      df = df.drop(columns=[col])
    else:
      df[col] = df[col].astype(dtype)

  return df

In [5]:
# Read the two CSV inputs configured above into DataFrames.
struct_log = pd.read_csv(csv_structured)
template_log = pd.read_csv(csv_template)

In [6]:
# Peek at 10 randomly sampled message strings.
struct_log["Content"].sample(10)

5135240    jAIFpXvU024714: from=root, size=1541310, class...
1125753                         file-max limit 65536 reached
2034180    Instrumentation Service EventID: 1052 Temperat...
2720846          [ib_sm_sweep.c:1264]: Rediscover the subnet
9372604                          Got trap from peer on fd 13
8702341    Instrumentation Service EventID: 1052 Temperat...
4288613    unable to qualify my own domain name (aadmin1)...
6232578    [KERNEL_IB][tsIbTavorMadProcess][/mnt_projects...
4473673    Instrumentation Service EventID: 1053 Temperat...
6861009    Cannot open file /var/log/syslog-ng/an21/2005....
Name: Content, dtype: object

In [7]:
# Display the header/metadata columns next to the message content.
struct_log[["Label", "Timestamp", "Date", "User", "Month", "Day", "Time", "Location", "Component", "PID", "Content"]]

Unnamed: 0,Label,Timestamp,Date,User,Month,Day,Time,Location,Component,PID,Content
0,-,1131523501,2005.11.09,aadmin1,Nov,10,00:05:01,src@aadmin1,in.tftpd,14620.0,tftp: client does not accept options
1,-,1131524071,2005.11.09,tbird-admin1,Nov,10,00:14:31,local@tbird-admin1,postfix/postdrop,10896.0,warning: unable to look up public/pickup: No s...
2,-,1131524073,2005.11.09,tbird-admin1,Nov,10,00:14:33,local@tbird-admin1,postfix/postdrop,10900.0,warning: unable to look up public/pickup: No s...
3,-,1131524106,2005.11.09,tbird-admin1,Nov,10,00:15:06,local@tbird-admin1,postfix/postdrop,10910.0,warning: unable to look up public/pickup: No s...
4,-,1131524107,2005.11.09,tbird-admin1,Nov,10,00:15:07,local@tbird-admin1,postfix/postdrop,10913.0,warning: unable to look up public/pickup: No s...
...,...,...,...,...,...,...,...,...,...,...,...
9975127,-,1133756956,2005.12.04,bn171,Dec,4,20:29:16,bn171/bn171,snmpd,7944.0,Got trap from peer on fd 13
9975128,-,1133756956,2005.12.04,bn190,Dec,4,20:29:16,bn190/bn190,Server Administrator,,Instrumentation Service EventID: 1052 Temperat...
9975129,-,1133756956,2005.12.04,bn190,Dec,4,20:29:16,bn190/bn190,snmpd,7941.0,Got trap from peer on fd 13
9975130,-,1133756956,2005.12.04,dn536,Dec,4,20:29:16,dn536/dn536,ntpd,2283.0,"synchronized to 10.100.30.250, stratum 3"


In [8]:
# Column conversions consumed by define_dtype(): plain entries are handed to
# Series.astype, "datetime_format_<fmt>" entries are parsed with
# pd.to_datetime(format=<fmt>).
struct_dtype = {
    "LineId": "int64",
    "Label": "category",
    "Timestamp": "int64",
    # Dates appear as e.g. "2005.11.09"; parse them with an explicit format
    # rather than the unit-less "datetime64" dtype, which relies on format
    # inference and is not accepted by astype() in recent pandas.
    "Date": "datetime_format_%Y.%m.%d",
    "User": "string",
    "Month": "category",
    "Day": "int8",
    # Time-of-day only; the parsed timestamps carry to_datetime's default date.
    "Time": "datetime_format_%H:%M:%S",
    "Location": "string",
    # NOTE(review): PID is displayed as a float with missing values (e.g.
    # 14620.0), so the string form presumably keeps the ".0" suffix -- confirm
    # that is acceptable downstream.
    "PID": "string",
    "Content": "string",
    "EventId": "string",
    "EventTemplate": "string",
    "EventTemplateIdent": "string",
    "ParameterList": "object",
}

template_dtype = {
    "EventId": "string",
    "EventTemplateIdent": "string",
    "Occurrences": "int64"
}

In [9]:
# Apply the dtype maps from the previous cell. copy is left at its default
# (False), so the conversions are written onto the loaded frames.
struct_log = define_dtype(struct_log, struct_dtype)
template_log = define_dtype(template_log, template_dtype)

In [10]:
# Unshuffled split: rows keep their file order, so the training set is the
# leading (1 - test_portion) share of struct_log and the test set is the rest.
trainset, testset = train_test_split(struct_log, test_size=test_portion, random_state=seed, shuffle=False)

In [11]:
# Event ids observed in each split.
train_eventId = trainset["EventId"]
test_eventId = testset["EventId"]

# Keep only the template rows whose EventId occurs in the respective split.
train_template = template_log[template_log["EventId"].isin(train_eventId)]
test_template = template_log[template_log["EventId"].isin(test_eventId)]

In [12]:
# One row per distinct message string found in the training split.
unique_trainset = pd.DataFrame({"Content": trainset["Content"].unique()})

In [13]:
# A token is a maximal run of characters that are neither whitespace nor one
# of the delimiters [ ] ( ) , = < > { } + *
tokenizer = RegexpTokenizer(r'[^\s[\](),=<>{}+*]+')
# Tokenize every unique message; progress_apply comes from tqdm.pandas().
unique_trainset["Tokens"] = unique_trainset["Content"].progress_apply(tokenizer.tokenize)

  0%|          | 0/372757 [00:00<?, ?it/s]

In [14]:
import tensorflow as tf
from zlib import crc32

class DenoiseAutoEncoder(tf.keras.Model):
  """Convolutional auto-encoder over per-token feature vectors.

  Every token is described by ``n_features`` numbers: one coverage ratio per
  pattern in ``regex_features``, the token's CRC32 scaled into [0, 1), and the
  z-scored token length. ``call`` maps a ``(batch, tokens, n_features)``
  tensor to one sigmoid output per token, flattened to ``(batch, tokens)``.

  Besides the Keras model, the class carries the preprocessing helpers:
  ``fit_vocab`` (length statistics and feature table), ``pad`` / ``asstr``
  (sequence preparation), ``transform_features`` and ``extract_label``.
  """

  def __init__(self, regex_features, kernel_size, batch_size, padding_token="<PAD>", **kwargs):
    """Build the encoder/decoder stacks.

    Args:
      regex_features: iterable of regex patterns; each contributes one feature
        column. A set is accepted but is sorted first, because set iteration
        order is not stable between interpreter runs.
      kernel_size: width of every 1-D convolution; also the minimum sequence
        length that ``pad`` enforces.
      batch_size: stored for the caller's batching code; not used internally.
      padding_token: text whose scaled CRC32 becomes the padding value.
      **kwargs: forwarded to ``tf.keras.Model``.
    """
    super().__init__(**kwargs)
    self.batch_size = batch_size
    # Freeze the patterns into an ordered list so the feature columns have a
    # well-defined, reproducible order.
    if isinstance(regex_features, (set, frozenset)):
      regex_features = sorted(regex_features, key=str)
    self.regex_features = list(regex_features)
    # +2: the CRC32 feature and the normalized-length feature.
    self.n_features = len(self.regex_features) + 2
    self.features = None
    self.kernel_size = (kernel_size, )
    self.min_length = kernel_size
    # The padding value is a float in [0, 1), not the padding text itself.
    self.pad_token = self.string_to_float_crc32(padding_token)

    self.encoder = tf.keras.Sequential([
        tf.keras.layers.Input(
            shape=(None, self.n_features),
            name="input_layer",
            dtype="float64",
        ),
        tf.keras.layers.Conv1D(
            filters=8,
            kernel_size=self.kernel_size,
            strides=1,
            padding="same",
            activation="relu",
            dtype="float64",
        ),
        tf.keras.layers.Conv1D(
            filters=4,
            kernel_size=self.kernel_size,
            strides=1,
            padding="same",
            activation="relu",
            dtype="float64",
        )
    ])

    self.decoder = tf.keras.Sequential([
        tf.keras.layers.Conv1DTranspose(
            filters=4,
            kernel_size=self.kernel_size,
            strides=1,
            padding='same',
            activation="relu",
            dtype="float64",
        ),
        tf.keras.layers.Conv1DTranspose(
            filters=8,
            kernel_size=self.kernel_size,
            strides=1,
            padding='same',
            activation="relu",
            dtype="float64",
        ),
        tf.keras.layers.Conv1D(
            filters=1,
            kernel_size=self.kernel_size,
            strides=1,
            activation="sigmoid",
            padding='same',
            dtype="float64",
        ),
        # Drop the trailing single-filter axis: (batch, tokens, 1) -> (batch, tokens).
        tf.keras.layers.Reshape((-1,), dtype="float64",)
    ])

  def call(self, x):
    """Encode then decode ``x``; returns one value per token."""
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded

  def fit_vocab(self, X):
    """Collect the vocabulary of ``X`` and compute its feature table.

    Sets ``self.vocab`` (sorted unique tokens plus the stringified padding
    value), ``self.mean_len_token`` / ``self.std_len_token`` (token-length
    statistics used for normalization) and ``self.features`` (DataFrame
    indexed by token: one column per pattern, then ``crc32`` and
    ``norm_len``).

    Args:
      X: iterable of token sequences (each an iterable of strings).
    """
    vocab = [ token for row in X for token in row ]
    # The padding value is a float; add it in the same string form asstr()
    # produces so the vocabulary is uniformly textual.
    vocab.append(str(self.pad_token))
    self.vocab = np.unique(vocab)

    n_vocab = len(self.vocab)
    patterns = list(self.regex_features)
    regex_features = np.zeros(shape=(n_vocab, len(patterns)), dtype="float64")
    hash_features = np.zeros(shape=(n_vocab,), dtype="float64")
    # int64, not uint8: token lengths above 255 must not overflow.
    len_features = np.zeros(shape=(n_vocab,), dtype="int64")
    for i, token in tqdm(
        enumerate(self.vocab),
        desc="extract features",
        total=n_vocab
        ):
      for j, regex in enumerate(patterns):
        regex_features[i][j] = self.extract_regex_percentage(token, regex)

      hash_features[i] = self.string_to_float_crc32(token)
      len_features[i] = len(token)

    self.mean_len_token = len_features.mean()
    token_len_std = len_features.std()
    # All tokens the same length -> std is 0; fall back to 1 so normalization
    # yields zeros instead of dividing by zero.
    self.std_len_token = token_len_std if token_len_std > 0 else np.float64(1.0)
    norm_len_features = (
        ( len_features - self.mean_len_token ) / ( self.std_len_token )
        ).astype("float64")

    self.features = pd.DataFrame(
        regex_features,
        index=self.vocab,
        columns=patterns
    )
    self.features["crc32"] = hash_features
    self.features["norm_len"] =  norm_len_features

  def pad(self, X):
    """Right-pad every sequence in ``X`` to at least ``min_length`` items.

    Returns a list of new lists; the input sequences are not modified.
    Padding entries are ``self.pad_token`` (a float).
    """
    padded = []
    for seq in tqdm(X, desc="padding sequence"):
      seq = list(seq)
      missing = self.min_length - len(seq)
      if missing > 0:
        # Concatenate instead of list.extend(): extend() returns None, which
        # would replace the short sequence with None in the result.
        seq = seq + [self.pad_token] * missing
      padded.append(seq)
    return padded

  def asstr(self, X):
    """Return the items of one sequence converted with ``str``."""
    return [ str(x) for x in X]

  def transform_features(self, X):
    """Return one ``(tokens, n_features)`` array per sequence in ``X``."""
    assert self.features is not None, \
      "require to call fit_vocab() before extract_features"

    Y = [ self.extract_features(seq) \
            for seq in tqdm(X, desc="extract feature") ]
    return Y

  def extract_label(self, X):
    """Return, per sequence, the scaled CRC32 of each token (the targets)."""
    Y = [ [ self.string_to_float_crc32(token) for token in seq ] \
              for seq in tqdm(X, desc="extract label") ]
    return Y

  def extract_features(self, x):
    """Compute the feature matrix of a single token sequence.

    Columns follow ``self.regex_features`` order, then the scaled CRC32, then
    the token length normalized with the statistics from ``fit_vocab``.
    """
    assert self.features is not None, \
      "require to call fit_vocab() before extract_features"

    _features = np.zeros(shape=(len(x), self.n_features), dtype="float64")
    for i, token in enumerate(x):
      for j, regex in enumerate(self.regex_features):
        _features[i][j] = self.extract_regex_percentage(token, regex)

      _features[i][-2] = self.string_to_float_crc32(token)
      _features[i][-1] = (len(token) - self.mean_len_token) / self.std_len_token

    return _features

  @staticmethod
  def extract_regex_percentage(token, regex):
    """Fraction (0..1) of ``token``'s characters covered by matches of ``regex``.

    Match spans are measured directly, so patterns containing capture groups
    are counted by their full match. An empty token yields 0.0.
    """
    if not token:
      return 0.0
    covered = sum(m.end() - m.start() for m in re.finditer(regex, token))
    return covered / len(token)

  @staticmethod
  def string_to_float_crc32(text, encoding="utf-8"):
    """CRC32 of the encoded ``text`` scaled into [0, 1)."""
    return float(crc32(text.encode(encoding=encoding))) / 2**32

In [37]:
# The patterns are passed as a list rather than a set: each pattern becomes one
# feature column, and set iteration order for strings is not stable between
# interpreter runs, which would reorder the model inputs from run to run.
# Newer pandas versions also reject a set as DataFrame `columns`.
denoiser = DenoiseAutoEncoder(
    regex_features=[
        "[0-9]+",
        "[a-fA-F0-9]+",
        "[A-F]+",
        "[a-f]+",
        # "[\\/]+",
        # "[:]+",
        # "[.]+",
        # "[^A-Za-z0-9]+",
    ],
    kernel_size = 3,
    batch_size = 1
)

In [38]:
# Optimize with Adam on the mean absolute error between the per-token model
# output and the per-token label.
denoiser.compile(
    optimizer='adam', 
    loss=tf.losses.MeanAbsoluteError()
    )

In [39]:
# Build the vocabulary, token-length statistics, and feature table from the
# tokenized training messages.
denoiser.fit_vocab(unique_trainset["Tokens"])

extract features:   0%|          | 0/120638 [00:00<?, ?it/s]

In [40]:
# Pad token lists shorter than the model's minimum length, then convert every
# entry to str (the pad value is a float) so later steps see only strings.
unique_trainset["Tokens_pad"] = pd.Series(denoiser.pad(unique_trainset["Tokens"]))
unique_trainset["Tokens_pad"] = unique_trainset["Tokens_pad"].map(denoiser.asstr)

padding sequence:   0%|          | 0/372757 [00:00<?, ?it/s]

In [41]:
# train_X: one (tokens, n_features) array per message.
# train_Y: one list of scaled CRC32 values per message, one value per token.
train_X = denoiser.transform_features(unique_trainset["Tokens_pad"])
train_Y = denoiser.extract_label(unique_trainset["Tokens_pad"])

extract feature:   0%|          | 0/372757 [00:00<?, ?it/s]

extract feature:   0%|          | 0/372757 [00:00<?, ?it/s]

In [42]:
def gen_series(X, y):
  """Generate ``(X[i], y[i])`` pairs for ``i = 0 .. len(X) - 1``.

  Being a generator, the equal-length check only runs once iteration starts.
  """
  assert len(X) == len(y), "length of X and y must equal."
  yield from ((X[pos], y[pos]) for pos in range(len(X)))

In [49]:
# Stream (features, labels) pairs from the Python lists. The leading None in
# both specs leaves the token axis variable-length.
train_set = tf.data.Dataset.from_generator(
    lambda: gen_series(train_X, train_Y),
    output_signature=(
        tf.TensorSpec(
            shape=(None, denoiser.n_features), 
            dtype="float64",
            name="features"
            ),
        tf.TensorSpec(
            shape=(None,), 
            dtype="float64",
            name="labels"
            )
        )
)
# Declare the number of elements explicitly, since the dataset is fed by a generator.
train_set = train_set.apply( tf.data.experimental.assert_cardinality(len(train_X)) )

In [44]:
# Group elements into padded batches of denoiser.batch_size (1 in this notebook).
train_batch = train_set.padded_batch(denoiser.batch_size)

In [45]:
# Build the model for inputs of shape (batch, tokens, n_features) with an
# unspecified token count.
denoiser.build(input_shape=(denoiser.batch_size, None, denoiser.n_features))

In [46]:
# Print the layer output shapes and parameter counts.
denoiser.summary()

Model: "denoise_auto_encoder_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_2 (Sequential)   (None, None, 4)           252       
                                                                 
 sequential_3 (Sequential)   (1, None)                 181       
                                                                 
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Smoke test on the first batch only: print the input, label, and output shapes.
for x, y in train_batch:
  print(x.shape, y.shape)
  y_hat = denoiser(x)
  print(y_hat.shape)
  break

(1, 6, 6) (1, 6)
(1, 6)


In [55]:
# Train for a single epoch over the training batches.
denoiser.fit(train_batch, epochs=1)



<keras.callbacks.History at 0x7f83acdf43d0>

In [56]:
# Run the trained model over the same training batches.
pred = denoiser.predict(train_batch)



In [79]:
# Print the per-token absolute difference between label and prediction for
# every message whose space-joined tokens end with "via eth1".
# NOTE(review): assumes batch i lines up with row i of unique_trainset and with
# pred[i] (batch_size == 1, no shuffling) -- confirm. The loop walks the whole
# dataset to find the matches.
# with np.printoptions(precision=4, suppress=True):
for i, data in enumerate(train_batch):
  text = " ".join([ f"{t}" for t in unique_trainset["Tokens"].iloc[i] ]) 
  if text.endswith("via eth1"):
    print("text:", text)
    # print(data[1])
    # print(pred[i])
    print("diff:", np.abs((data[1] - pred[i]).numpy()))

text: DHCPDISCOVER from 00:11:43:32:c6:8c via eth1
diff: [[0.00051104 0.00154028 0.00421988 0.00758549 0.00019431]]
text: DHCPOFFER on 10.100.19.66 to 00:11:43:32:c6:8c via eth1
diff: [[0.02180106 0.01278703 0.00758098 0.00072842 0.00283269 0.00725931
  0.00024634]]
text: DHCPACK on 10.100.19.66 to 00:11:43:32:c6:8c via eth1
diff: [[0.03344546 0.00138985 0.00999473 0.00029888 0.00278158 0.00726095
  0.00024634]]
text: DHCPREQUEST for 10.100.19.66 10.100.18.250 from 00:11:43:32:c6:8c via eth1
diff: [[0.00503027 0.00240654 0.00938366 0.00453446 0.0038987  0.00071356
  0.00717281 0.0001212 ]]
text: DHCPDISCOVER from 00:11:43:e3:ba:c3 via eth1
diff: [[0.00112634 0.00305492 0.00526568 0.00885068 0.00032893]]
text: DHCPOFFER on 10.100.4.251 to 00:11:43:e3:ba:c3 via eth1
diff: [[0.02125235 0.01881509 0.00770164 0.0035928  0.00555927 0.0082173
  0.00033217]]
text: DHCPACK on 10.100.4.251 to 00:11:43:e3:ba:c3 via eth1
diff: [[0.03438461 0.00429102 0.00443322 0.00408246 0.00556284 0.00821905
  0

KeyboardInterrupt: ignored

In [None]:
# NOTE(review): no visible cell adds a "Tokens" column to `trainset` (the
# column is created on `unique_trainset`), so on a fresh kernel this lookup
# presumably raises KeyError -- confirm, or restore the cell that tokenizes
# `trainset`.
trainset[['Content', 'Tokens']]

Unnamed: 0,Content,Tokens
0,tftp: client does not accept options,"[tftp:, client, does, not, accept, options]"
1,warning: unable to look up public/pickup: No s...,"[warning:, unable, to, look, up, public/pickup..."
2,warning: unable to look up public/pickup: No s...,"[warning:, unable, to, look, up, public/pickup..."
3,warning: unable to look up public/pickup: No s...,"[warning:, unable, to, look, up, public/pickup..."
4,warning: unable to look up public/pickup: No s...,"[warning:, unable, to, look, up, public/pickup..."
...,...,...
3990047,(root) CMD (/projects/tbird/temps/get_temps e),"[root, CMD, /projects/tbird/temps/get_temps, e]"
3990048,data_thread() got not answer from any [Thunder...,"[data_thread, got, not, answer, from, any, Thu..."
3990049,Instrumentation Service EventID: 1053 Temperat...,"[Instrumentation, Service, EventID:, 1053, Tem..."
3990050,Got trap from peer on fd 13,"[Got, trap, from, peer, on, fd, 13]"


