In [10]:
import numpy as np
import pandas as pd
import re
from collections import OrderedDict
from collections import Counter
from PIL import Image

In [9]:
import torch
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
import numpy as np
from collections import OrderedDict

model = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [4]:
def collect_event_ids(data_frame, regex_pattern, column_names):
    """
    turns input data_frame into a 2 columned dataframe
    with columns: BlockId, EventSequence
    where EventSequence is a list of the events that happened to the block
    """
    data_dict = OrderedDict()
    for _, row in data_frame.iterrows():
        blk_id_list = re.findall(regex_pattern, row["Content"])
        blk_id_set = set(blk_id_list)
        for blk_id in blk_id_set:
            if blk_id not in data_dict:
                data_dict[blk_id] = []
            data_dict[blk_id].append(row["EventId"])
    data_df = pd.DataFrame(list(data_dict.items()), columns=column_names)
    return data_df

In [5]:
def windower(sequence, window_size):
    """
    creates an array of arrays of windows
    output array is of length: len(sequence) - window_size + 1
    """
    return np.lib.stride_tricks.sliding_window_view(sequence, window_size)

In [6]:
def sequence_padder(sequence, required_length):
    """
    right pads events sequence until max sequence length long
    """
    if len(sequence) > required_length:
        return sequence
    return np.pad(
        sequence,
        (0, required_length - len(sequence)),
        mode="constant",
        constant_values=(0),
    )

In [44]:
def resize_time_image(time_image, size):
    """
    compresses time images that had more sequences than the set max sequence length
    """
    width = size[1]
    height = size[0]
    return np.array(Image.fromarray(time_image).resize((width, height)))

In [50]:
class FeatureExtractor(object):
    """
    class for fitting and transforming the training set
    then transforming the testing set
    """

    def __init__(self):
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaModel.from_pretrained('roberta-base')
        self.model.eval()
    
    def get_roberta_features(log_contents):
    """
    Convert log contents into RoBERTa embeddings.
    """
    inputs = tokenizer(log_contents, padding=True, truncation=True, max_length=512, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

    def fit_transform(self, X_seq)
        return self.transform(X_seq)

    def transform(self, sequences):
        """
        Converts a list of text sequences into a matrix of RoBERTa embeddings.
        """
        features = []
        for sequence in sequences:
            text = ' '.join(sequence)  # Convert list of events to a single string if necessary
            embeddings = self.get_roberta_features(text)
            features.append(embeddings)
        return np.array(features)


In [46]:
import numpy as np
import pandas as pd
import time

if __name__ == "__main__":

    data_version = "_v5"

    data_version = "_roberta{}".format(data_version)

    # where the "raw" data for this file is located
    load_data_location = "./project_processed_data/"

    # where the processed data is saved
    save_location = "./project_processed_data/{}/".format(data_version)

    start = time.time()

    # Loads data
    print("loading x_train")
    x_train = pd.read_csv("{}HDFS_train.log_structured.csv".format(load_data_location))

    print("loading x_test")
    x_test = pd.read_csv("{}HDFS_test.log_structured.csv".format(load_data_location))

    print("loading y")
    y = pd.read_csv("{}anomaly_label.csv".format(load_data_location))

    # processes events into blocks
    re_pat = r"blk_-[0-9]*"
    col_names = ["BlockId", "EventSequence"]

    print("collecting events for x_train")
    events_train = collect_event_ids(x_train, re_pat, col_names)
    print("collecting events for x_test")
    events_test = collect_event_ids(x_test, re_pat, col_names)

    print("merging block frames with labels")
    events_train = events_train.merge(y, on="BlockId")
    events_test = events_test.merge(y, on="BlockId")

    print("removing blocks that are overlapped into train and test")
    overlapping_blocks = np.intersect1d(events_train["BlockId"], events_test["BlockId"])
    events_train = events_train[~events_train["BlockId"].isin(overlapping_blocks)]
    events_test = events_test[~events_test["BlockId"].isin(overlapping_blocks)]

    events_train_values = events_train["EventSequence"].values
    events_test_values = events_test["EventSequence"].values
    

loading x_train
loading x_test
loading y
collecting events for x_train
collecting events for x_test
merging block frames with labels
removing blocks that are overlapped into train and test


In [59]:
y

Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal
1,blk_7503483334202473044,Normal
2,blk_-3544583377289625738,Anomaly
3,blk_-9073992586687739851,Normal
4,blk_7854771516489510256,Normal
...,...,...
575056,blk_1019720114020043203,Normal
575057,blk_-2683116845478050414,Normal
575058,blk_5595059397348477632,Normal
575059,blk_1513937873877967730,Normal


In [58]:
# fit transform & transform
fe = FeatureExtractor()

print("fit_transform x_train")
subblocks_train = fe.fit_transform(
    events_train_values,
    length_percentile=95,
    window_size=16,
)

print("transform x_test")
subblocks_test = fe.transform(events_test_values)

print("collecting y data")
y_train = events_train[["BlockId", "Label"]]
y_test = events_test[["BlockId", "Label"]]

fit_transform x_train
final shape will be  16 43
train data shape:  (197301, 16, 43)
transform x_test
test data shape:  (60336, 16, 43)
collecting y data


In [56]:
# saving files
print("writing y to csv")
y_train.to_csv("{}y_train{}.csv".format(save_location, data_version))
y_test.to_csv("{}y_test{}.csv".format(save_location, data_version))

print("saving x to numpy object")
np.save("{}x_train{}.npy".format(save_location, data_version), subblocks_train)
np.save("{}x_test{}.npy".format(save_location, data_version), subblocks_test)

print("time taken :", time.time() - start)

writing y to csv
saving x to numpy object
time taken : 5771.833292961121
