### https://github.com/logpai/loglizer?tab=readme-ov-file

In [1]:
import numpy as np
import pandas as pd
import re
from collections import OrderedDict
from collections import Counter
from PIL import Image

In [2]:
def collect_event_ids(data_frame, regex_pattern, column_names):
    """
    turns input data_frame into a 2 columned dataframe
    with columns: BlockId, EventSequence
    where EventSequence is a list of the events that happened to the block

    data_frame: DataFrame with a "Content" column (text searched for block ids)
        and an "EventId" column (value appended to every block found in that row)
    regex_pattern: pattern (string or compiled) whose matches are the block ids
    column_names: the two column names of the returned dataframe

    blocks appear in the output in the order they are first seen in the log,
    and events keep the row order of data_frame
    """
    # compile once instead of resolving the pattern for every row
    pattern = re.compile(regex_pattern)

    data_dict = OrderedDict()
    # iterate the two needed columns directly; iterrows builds a Series per row
    for content, event_id in zip(data_frame["Content"], data_frame["EventId"]):
        # dict.fromkeys drops repeated ids within a line but, unlike set(),
        # keeps first-seen order, so the output order is reproducible
        for blk_id in dict.fromkeys(pattern.findall(content)):
            data_dict.setdefault(blk_id, []).append(event_id)

    return pd.DataFrame(list(data_dict.items()), columns=column_names)

In [3]:
def windower(sequence, window_size):
    """
    slides a window of window_size over the sequence one step at a time
    for a 1D sequence the result has one window per row,
    so it has len(sequence) - window_size + 1 rows
    """
    make_windows = np.lib.stride_tricks.sliding_window_view
    windows = make_windows(sequence, window_shape=window_size)
    return windows

In [4]:
def sequence_padder(sequence, required_length):
    """
    appends zeros to the end of the sequence until it is required_length long
    a sequence that is already longer than required_length is handed back untouched
    """
    missing = required_length - len(sequence)
    if missing < 0:
        # too long already: nothing to pad, return the input object itself
        return sequence
    return np.pad(sequence, (0, missing), mode="constant", constant_values=0)

In [5]:
def resize_time_image(time_image, size):
    """
    rescales a time image to the requested size
    used for time images that have more rows than the set max sequence length allows
    size is read as (rows, columns)
    """
    target_rows, target_cols = size[0], size[1]
    as_image = Image.fromarray(time_image)
    # PIL wants (width, height), i.e. (columns, rows)
    resized = as_image.resize((target_cols, target_rows))
    return np.array(resized)

In [6]:
class FeatureExtractor(object):
    """
    class for fitting and transforming the training set
    then transforming the testing set

    every event sequence is turned into a "time image": one row per sliding
    window over the sequence, one column per event seen during fitting,
    each cell holding how often that event occurs inside that window
    """

    def __init__(self):
        self.mean_vec = None
        self.idf_vec = None
        self.events = None
        self.term_weighting = None
        self.max_seq_length = None
        self.window_size = None
        self.num_rows = None

    def _build_time_images(self, X_seq):
        """
        builds one (num_rows, len(events)) count matrix per sequence and stacks them
        shared by fit_transform and transform so both produce identical layouts

        NOTE(review): sequence_padder pads with the value 0; if 0 is itself one
        of the fitted events the padding is counted as that event - confirm
        event ids can never be 0
        """
        num_events = len(self.events)
        time_images = []
        for block in X_seq:
            padded_block = sequence_padder(block, self.max_seq_length)
            time_image = windower(padded_block, self.window_size)
            time_image_counts = [Counter(time_row) for time_row in time_image]

            # restricting the columns to self.events drops every other key
            # (the padding value, events not seen while fitting); self.events
            # is sorted at fit time so the column order is already fixed
            time_image_df = pd.DataFrame(time_image_counts, columns=self.events)
            time_image_np = time_image_df.fillna(0).to_numpy()

            # sequences longer than max_seq_length yield too many windows:
            # squeeze them down to the common number of rows
            if len(time_image_np) > self.num_rows:
                time_image_np = resize_time_image(
                    time_image_np, (self.num_rows, num_events),
                )

            time_images.append(time_image_np)

        # stack all the blocks
        return np.stack(time_images)

    def fit_transform(
        self, X_seq, term_weighting=None, length_percentile=90, window_size=16
    ):
        """
        Fit and transform the training set
        X_Seq: ndarray,  log sequences matrix
        term_weighting: None or `tf-idf`
        length_percentile: int, set the max length of the event sequences
        window_size: int, size of subsetting

        returns an array of shape (len(X_seq), num_rows, len(events))
        raises ValueError when window_size is larger than the max sequence
        length derived from length_percentile
        """
        self.term_weighting = term_weighting
        self.window_size = window_size

        # get unique events, sorted once so fit and transform share one column order
        self.events = sorted(set(np.concatenate(X_seq).ravel()))

        # get lengths of event sequences
        length_list = [len(sequence) for sequence in X_seq]
        self.max_seq_length = int(np.percentile(length_list, length_percentile))

        self.num_rows = self.max_seq_length - self.window_size + 1
        if self.num_rows < 1:
            # windowing a padded sequence would fail anyway; say why up front
            raise ValueError(
                "window_size ({}) is larger than the max sequence length ({}) "
                "given by length_percentile={}".format(
                    self.window_size, self.max_seq_length, length_percentile
                )
            )

        print("final shape will be ", self.num_rows, len(self.events))

        X = self._build_time_images(X_seq)

        if self.term_weighting == "tf-idf":
            num_blocks, num_rows, _ = X.shape

            # document frequency is counted per window row, so the number of
            # documents must be the number of window rows as well; dividing
            # into the number of blocks instead gives negative weights to
            # events that occur in more rows than there are blocks
            num_instances = num_blocks * num_rows
            df_vec = np.sum(X > 0, axis=(0, 1))
            self.idf_vec = np.log(num_instances / (df_vec + 1e-8))

            # broadcasting over the last axis weights every row without
            # materialising a tiled copy of the idf vector
            X = X * self.idf_vec

        X_new = X
        print("train data shape: ", X_new.shape)
        return X_new

    def transform(self, X_seq):
        """
        transforms x test using the events, sizes and idf weights
        learned by fit_transform
        X_seq : log sequence data

        returns an array of shape (len(X_seq), num_rows, len(events))
        """
        X = self._build_time_images(X_seq)

        if self.term_weighting == "tf-idf":
            # apply the idf weights learned on the training set to every row
            X = X * self.idf_vec

        X_new = X
        print("test data shape: ", X_new.shape)
        return X_new


In [7]:
import numpy as np
import pandas as pd
import time

if __name__ == "__main__":

    # tag used in the output folder name and in every output file name
    data_version = "_v5"
    data_version = f"_tf-idf{data_version}"

    # folder holding the structured log csv files and the label file
    load_data_location = "./project_processed_data/"

    # folder the processed arrays and labels are written to
    save_location = f"./project_processed_data/{data_version}/"

    # wall-clock start, reported after saving
    start = time.time()

    # read the structured logs and the per-block labels
    print("loading x_train")
    x_train = pd.read_csv(f"{load_data_location}HDFS_train.log_structured.csv")

    print("loading x_test")
    x_test = pd.read_csv(f"{load_data_location}HDFS_test.log_structured.csv")

    print("loading y")
    y = pd.read_csv(f"{load_data_location}anomaly_label.csv")

    # group the log lines into one event sequence per block
    # NOTE(review): this pattern only matches ids written as blk_-<digits>
    # (the digits are optional); ids without the minus sign, if the logs
    # contain any, are skipped - confirm this is intended
    re_pat = r"blk_-[0-9]*"
    col_names = ["BlockId", "EventSequence"]

    print("collecting events for x_train")
    events_train = collect_event_ids(x_train, re_pat, col_names)
    print("collecting events for x_test")
    events_test = collect_event_ids(x_test, re_pat, col_names)

    # inner merge: attaches the label columns, keeps only blocks present in y
    print("merging block frames with labels")
    events_train = events_train.merge(y, on="BlockId")
    events_test = events_test.merge(y, on="BlockId")

    # a block seen in both files is dropped from both
    print("removing blocks that are overlapped into train and test")
    overlapping_blocks = np.intersect1d(events_train["BlockId"], events_test["BlockId"])
    events_train = events_train.loc[~events_train["BlockId"].isin(overlapping_blocks)]
    events_test = events_test.loc[~events_test["BlockId"].isin(overlapping_blocks)]

    # plain arrays of event lists, the input of the feature extractor
    events_train_values = events_train["EventSequence"].to_numpy()
    events_test_values = events_test["EventSequence"].to_numpy()

    

loading x_train
loading x_test
loading y
collecting events for x_train
collecting events for x_test
merging block frames with labels
removing blocks that are overlapped into train and test


In [58]:
# learn events / sizes / weights on train, then reuse them on test
fe = FeatureExtractor()

print("fit_transform x_train")
subblocks_train = fe.fit_transform(
    events_train_values, term_weighting="tf-idf", length_percentile=95, window_size=16
)

print("transform x_test")
subblocks_test = fe.transform(events_test_values)

# labels stay keyed by block id, in the same row order as the features
print("collecting y data")
y_train = events_train.loc[:, ["BlockId", "Label"]]
y_test = events_test.loc[:, ["BlockId", "Label"]]

fit_transform x_train
final shape will be  16 43
train data shape:  (197301, 16, 43)
transform x_test
test data shape:  (60336, 16, 43)
collecting y data


In [16]:
# saving files
import os

# make sure the output folder exists, so a missing directory cannot make
# the writes below fail after the long feature extraction
os.makedirs(save_location, exist_ok=True)

print("writing y to csv")
y_train.to_csv("{}y_train{}.csv".format(save_location, data_version))
y_test.to_csv("{}y_test{}.csv".format(save_location, data_version))

print("saving x to numpy object")
np.save("{}x_train{}.npy".format(save_location, data_version), subblocks_train)
np.save("{}x_test{}.npy".format(save_location, data_version), subblocks_test)

# seconds elapsed since `start` was set in the loading cell
print("time taken :", time.time() - start)

writing y to csv
saving x to numpy object
time taken : 87644.292844961121
