## Import Library

In [1]:
import tarfile
import os
import gzip
import shutil
import numpy as np
import pandas as pd
import json
import re
from collections import defaultdict
from tqdm import tqdm
from logparser import Spell, Drain, Drain_bgl
from timeit import default_timer as timer

## Extract Log File

In [2]:
#Dataset Tar File
hdfs_file = "datasets/HDFS_1.tar.gz"
bgl_file = "datasets/BGL.tar.gz"
tbird_file = "datasets/tbird2.gz"

In [3]:
#Make Output Directory
_path = {"hdfs":"datasets/hdfs/", "tbird":"datasets/tbird/", "bgl":"datasets/bgl/"}

for key, path in _path.items() :
  try:
      os.mkdir(path)
  except OSError as error:
      pass

In [4]:
#Extract HDFS LogFile
start = timer()
if os.path.exists("datasets/hdfs/HDFS.log") == False:
    print("Extracting...")
    with tarfile.open(hdfs_file, 'r') as tar_ref:
        tar_ref.extractall(_path["hdfs"])
else : print("Already Extracted")
end = timer()
print("time elapsed : {:.2f}s".format(end-start)) 

Already Extracted
time elapsed : 0.00s


In [5]:
#Extract bgl LogFile
start = timer()
if os.path.exists("datasets/hdfs/HDFS.log") == False:
    print("Extracting...")
    with tarfile.open(bgl_file, 'r') as tar_ref:
        tar_ref.extractall(_path["bgl"])
else : print("Already Extracted")
end = timer()
print("time elapsed : {:.2f}s".format(end-start)) 

Already Extracted
time elapsed : 0.00s


In [6]:
#Extract tbird LogFile
start = timer()
if os.path.exists("datasets/hdfs/HDFS.log") == False:
    print("Extracting...")
    with gzip.open(tbird_file, 'rb') as f_in:
        with open(_path["tbird"] + 'tbird.log', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
else : print("Already Extracted")
end = timer()
print("time elapsed : {:.2f}s".format(end-start)) 

Already Extracted
time elapsed : 0.00s


## Parse Log File

#### Helper Function

In [4]:
def sliding_window(raw_data, para):
    """
    split logs into sliding windows/session
    :param raw_data: dataframe columns=[timestamp, label, eventid, time duration]
    :param para:{window_size: seconds, step_size: seconds}
    :return: dataframe columns=[eventids, time durations, label]
    """
    log_size = raw_data.shape[0]
    label_data, time_data = raw_data.iloc[:, 1], raw_data.iloc[:, 0]
    logkey_data, deltaT_data = raw_data.iloc[:, 2], raw_data.iloc[:, 3]
    new_data = []
    start_end_index_pair = set()

    start_time = time_data[0]
    end_time = start_time + para["window_size"]
    start_index = 0
    end_index = 0

    # get the first start, end index, end time
    for cur_time in time_data:
        if cur_time < end_time:
            end_index += 1
        else:
            break

    start_end_index_pair.add(tuple([start_index, end_index]))

    # move the start and end index until next sliding window
    num_session = 1
    while end_index < log_size:
        start_time = start_time + para['step_size']
        end_time = start_time + para["window_size"]
        for i in range(start_index, log_size):
            if time_data[i] < start_time:
                i += 1
            else:
                break
        for j in range(end_index, log_size):
            if time_data[j] < end_time:
                j += 1
            else:
                break
        start_index = i
        end_index = j

        # when start_index == end_index, there is no value in the window
        if start_index != end_index:
            start_end_index_pair.add(tuple([start_index, end_index]))

        num_session += 1
        if num_session % 1000 == 0:
            print("process {} time window".format(num_session), end='\r')

    for (start_index, end_index) in start_end_index_pair:
        dt = deltaT_data[start_index: end_index].values
        dt[0] = 0
        new_data.append([
            time_data[start_index: end_index].values,
            max(label_data[start_index:end_index]),
            logkey_data[start_index: end_index].values,
            dt
        ])

    assert len(start_end_index_pair) == len(new_data)
    print('there are %d instances (sliding windows) in this dataset\n' % len(start_end_index_pair))
    return pd.DataFrame(new_data, columns=raw_data.columns)

### - HDFS

In [7]:
#PATH
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>' #HDFS Log Format
input_dir  = _path["hdfs"]
output_dir = _path["hdfs"] + 'output/'
log_file   = "HDFS.log"
log_structured_file = output_dir + log_file + "_structured.csv"
log_templates_file = output_dir + log_file + "_templates.csv"
log_sequence_file = output_dir + "hdfs_sequence.csv"
log_content_file = output_dir + "hdfs_content.csv"
log_results_file = output_dir + "hdfs_labeled.csv"
log_results_content_file = output_dir + "hdfs_content_labeled.csv"
logkey_content_file = output_dir + "hdfs_content_logkey.csv"

In [9]:
def parser(input_dir, output_dir, log_file, log_format):
  regex = [
      r"(?<=blk_)[-\d]+", #Regex for block_id
      r'\d+\.\d+\.\d+\.\d+',  #Regex for  IP
      r"(/[-\w]+)+",  #Regex for file path
      #r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
  ]
  # hyper parameter di set berdasarkan  http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
  st = 0.5  # Similarity threshold
  depth = 5  # Depth of all leaf nodes

  parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st, rex=regex, keep_para=False)
  parser.parse(log_file)

In [10]:
parser(input_dir, output_dir, log_file, log_format)

Parsing file: datasets/hdfs/HDFS.log
Total size after encoding is 11175629 11175629
Parsing done. [Time taken: 0:17:30.011664]


In [11]:
def mapping():
    log_temp = pd.read_csv(log_templates_file)
    log_temp.sort_values(by = ["Occurrences"], ascending=False, inplace=True)
    log_temp_dict = {event: idx+1 for idx , event in enumerate(list(log_temp["EventId"])) }
    print(log_temp_dict)
    with open (output_dir + "hdfs_log_templates.json", "w") as f:
        json.dump(log_temp_dict, f)

In [12]:
mapping()

{'9b7aa7a3': 1, '2f313c72': 2, '2e1cf0aa': 3, '797b9c47': 4, 'b0023896': 5, 'bb837bbd': 6, '81358cb3': 7, '6caae5bd': 8, 'be6f070c': 9, 'd23206c6': 10, 'fa05ffa7': 11, '53c00e5f': 12, 'd7507d1e': 13, '0d168c98': 14, 'cf9b33dc': 15, 'd6115493': 16, '46f6e99a': 17, 'b46e298a': 18, 'fac2c191': 19, '4ed2a0c0': 20, '5e47c5c3': 21, '1995da3b': 22, '1d48c538': 23, 'b3ef6470': 24, 'ceedf750': 25, 'caed8b80': 26, '6f83a284': 27, '5832ad42': 28, 'c859931b': 29, '5ac8245b': 30, '190eb501': 31, 'f8ba9329': 32, '47b367ea': 33, '1ff93be5': 34, '72988c9b': 35, '78ad37b3': 36, 'ffa3fe68': 37, '60600882': 38, 'b55f27b2': 39, '9111794a': 40, 'ebe1d2fb': 41, 'a26fadbc': 42, 'f52097f4': 43, '0d527039': 44, '17ee882d': 45, 'fb314c6d': 46, '0555f7e9': 47}


In [13]:
def hdfs_content_append(log_file, window='session'):
    assert window == 'session', "Only window=session is supported for HDFS dataset."
    print("Loading", log_file)
    df = pd.read_csv(log_file, engine='c',
            na_filter=False, memory_map=True, dtype={'Date':object, "Time": object})

    with open(output_dir + "hdfs_log_templates.json", "r") as f:
        event_num = json.load(f)
    df["EventId"] = df["EventId"].apply(lambda x: event_num.get(x, -1))

    data_dict = defaultdict(list)
    for idx, row in tqdm(df.iterrows()):
        blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
        blkId_set = set(blkId_list)
        for blk_Id in blkId_set:
            data_dict[blk_Id].append(row["EventTemplate"])

    data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'ContentSequence'])
    data_df.to_csv(log_content_file, index=None)
    print("hdfs sampling done")

In [14]:
hdfs_content_append(log_structured_file)

Loading datasets/hdfs/output/HDFS.log_structured.csv


11175629it [08:58, 20770.41it/s]


hdfs sampling done


In [15]:
def hdfs_sequencing(log_file, window='session'):
    assert window == 'session', "Only window=session is supported for HDFS dataset."
    print("Loading", log_file)
    df = pd.read_csv(log_file, engine='c',
            na_filter=False, memory_map=True, dtype={'Date':object, "Time": object})

    # with open(output_dir + "hdfs_log_templates.json", "r") as f:
    #     event_num = json.load(f)
    # df["EventId"] = df["EventId"].apply(lambda x: event_num.get(x, -1))

    data_dict = defaultdict(list)
    for idx, row in tqdm(df.iterrows()):
        blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
        blkId_set = set(blkId_list)
        for blk_Id in blkId_set:
            data_dict[blk_Id].append(row["EventId"])

    data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])
    data_df.to_csv(log_sequence_file, index=None)
    print("hdfs sampling done")

In [16]:
hdfs_sequencing(log_structured_file)

Loading datasets/hdfs/output/HDFS.log_structured.csv


11175629it [08:59, 20707.33it/s]


hdfs sampling done


In [23]:
def generate_dataset(hdfs_sequence_file, results=log_results_file):
    blk_label_dict = {}
    blk_label_file = os.path.join(input_dir, "anomaly_label.csv")
    blk_df = pd.read_csv(blk_label_file)
    for _ , row in tqdm(blk_df.iterrows()):
        # blk_label_dict[row["BlockId"]] = row["Label"]
        blk_label_dict[row["BlockId"]] = 1 if row["Label"] == "Anomaly" else 0

    seq = pd.read_csv(hdfs_sequence_file)
    seq["Label"] = seq["BlockId"].apply(lambda x: blk_label_dict.get(x)) #Tambah label ke tiap sequence dari tiap blockid
    seq.to_csv(results, index=None)

In [18]:
generate_dataset(log_sequence_file)

575061it [00:33, 17311.37it/s]


In [24]:
generate_dataset(log_content_file, log_results_content_file)

575061it [00:22, 25789.79it/s]


In [19]:
df = pd.read_csv(log_results_file)
df.head()

Unnamed: 0,BlockId,EventSequence,Label
0,blk_-1608999687919862906,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
1,blk_7503483334202473044,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
2,blk_-3544583377289625738,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",1
3,blk_-9073992586687739851,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
4,blk_7854771516489510256,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0


In [20]:
df["Label"].value_counts()

Label
0    558223
1     16838
Name: count, dtype: int64

In [25]:
df = pd.read_csv(log_results_content_file)
df.head()

Unnamed: 0,BlockId,ContentSequence,Label
0,blk_-1608999687919862906,['Receiving block blk_<*> src: <*> dest: /<*>:...,0
1,blk_7503483334202473044,['Receiving block blk_<*> src: <*> dest: /<*>:...,0
2,blk_-3544583377289625738,['Receiving block blk_<*> src: <*> dest: /<*>:...,1
3,blk_-9073992586687739851,['Receiving block blk_<*> src: <*> dest: /<*>:...,0
4,blk_7854771516489510256,['Receiving block blk_<*> src: <*> dest: /<*>:...,0


In [26]:
df["Label"].value_counts()

Label
0    558223
1     16838
Name: count, dtype: int64

In [33]:
df_logkey = pd.read_csv(log_results_file)

In [34]:
df_content = pd.read_csv(log_results_content_file)

In [35]:
merged_df = pd.merge(df, df_content[["BlockId", "ContentSequence"]], on="BlockId", how="inner")
merged_df = merged_df[["BlockId", "EventSequence", "ContentSequence", "Label"]]

In [36]:
merged_df.head()

Unnamed: 0,BlockId,EventSequence,ContentSequence,Label
0,blk_-1608999687919862906,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",['Receiving block blk_<*> src: <*> dest: /<*>:...,0
1,blk_7503483334202473044,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",['Receiving block blk_<*> src: <*> dest: /<*>:...,0
2,blk_-3544583377289625738,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",['Receiving block blk_<*> src: <*> dest: /<*>:...,1
3,blk_-9073992586687739851,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",['Receiving block blk_<*> src: <*> dest: /<*>:...,0
4,blk_7854771516489510256,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",['Receiving block blk_<*> src: <*> dest: /<*>:...,0


In [38]:
merged_df.isnull().sum()

BlockId            0
EventSequence      0
ContentSequence    0
Label              0
dtype: int64

In [39]:
merged_df["Label"].value_counts()

Label
0    558223
1     16838
Name: count, dtype: int64

In [40]:
merged_df.to_csv(logkey_content_file, index=None)

### - BGL

In [61]:
#PATH and VAR
data_dir = _path["bgl"]
output_dir = _path["bgl"] + "output/"
log_file = "BGL.log"
log_structured_file  = output_dir + log_file + "_structured.csv"
log_results_file = output_dir + "bgl_time_windowed.csv"
log_results_content_file = output_dir + "bgl_time_windowed_content.csv"
logkey_content_file = output_dir + "bgl_content_logkey.csv"
window_size = 5
step_size = 1

In [28]:
def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<Label> <Id> <Date> <Code1> <Time> <Code2> <Component1> <Component2> <Level> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+', #hexadecimal
        r'\d+.\d+.\d+.\d+',
        # r'/\w+( )$'
        r'\d+'
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes
        parser = Drain_bgl.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st, rex=regex, keep_para=keep_para)
        parser.parse(log_file)
    elif parser_type == "spell":
        tau = 0.55
        parser = Spell.LogParser(indir=data_dir, outdir=output_dir, log_format=log_format, tau=tau, rex=regex, keep_para=keep_para)
        parser.parse(log_file)

In [23]:
parse_log(data_dir, output_dir, log_file, 'drain')

Parsing file: datasets/bgl/BGL.log
Total size after encoding is 4713493 4747963
Parsing done. [Time taken: 0:36:26.956195]


In [32]:
df = pd.read_csv(log_structured_file)

In [33]:
df["Label"] = df["Label"].apply(lambda x: int(x != "-"))
df['datetime'] = pd.to_datetime(df['Time'], format='%Y-%m-%d-%H.%M.%S.%f')
df['timestamp'] = df["datetime"].values.astype(np.int64) // 10 ** 9
df['deltaT'] = df['datetime'].diff() / np.timedelta64(1, 's')
df['deltaT'] = df['deltaT'].fillna(0)

In [26]:
start = timer()
bgl_df = sliding_window(df[["timestamp", "Label", "EventId", "deltaT"]],
                                para={"window_size": int(window_size)*60, "step_size": int(step_size) * 60}
                                )
end = timer()
print("time elapsed : {:.2f}s".format(end-start)) 

there are 37315 instances (sliding windows) in this dataset

time elapsed : 30.52s


In [45]:
bgl_df.to_csv(log_results_file, index = None)

In [35]:
start = timer()
bgl_df = sliding_window(df[["timestamp", "Label", "EventTemplate", "deltaT"]],
                                para={"window_size": int(window_size)*60, "step_size": int(step_size) * 60}
                                )
end = timer()
print("time elapsed : {:.2f}s".format(end-start)) 

there are 37315 instances (sliding windows) in this dataset

time elapsed : 24.23s


In [36]:
bgl_df.to_csv(log_results_content_file, index = None)

In [46]:
df = pd.read_csv(log_results_file)
df.head()

Unnamed: 0,timestamp,Label,EventId,deltaT
0,[1135617911 1135617913 1135617915 1135617915 1...,0,['3aa50e45' '3aa50e45' '3aa50e45' '3aa50e45' '...,[0. 1.601077 1.581271 0.577114 2.089764 ...
1,[1127138101 1127138101 1127138101 1127138101 1...,0,['4983ff07' '4983ff07' '4983ff07' '4983ff07' '...,[0.0000000e+00 1.3094000e-01 7.3317000e-02 1.5...
2,[1134118610 1134118610 1134118610 1134118610 1...,0,['30b3b946' '8df7ac9e' 'a450c390' 'a450c390' '...,[0. 0.109872 0.024515 0.034294 0.035576 ...
3,[1136277503 1136277503 1136277503 1136277503 1...,0,['30b3b946' '8df7ac9e' '30b3b946' '8df7ac9e' '...,[0. 0.115572 0.032737 0.016155 0. ...
4,[1120820379],0,['8a1ae52c'],[0.]


In [47]:
df["Label"].value_counts()

Label
0    34297
1     3018
Name: count, dtype: int64

In [37]:
df = pd.read_csv(log_results_content_file)
df.head()

Unnamed: 0,timestamp,Label,EventTemplate,deltaT
0,[1135617911 1135617913 1135617915 1135617915 1...,0,['instruction cache parity error corrected'\n ...,[0. 1.601077 1.581271 0.577114 2.089764 ...
1,[1127138101 1127138101 1127138101 1127138101 1...,0,['ciod: LOGIN <*> failed: No such file or dire...,[0.0000000e+00 1.3094000e-01 7.3317000e-02 1.5...
2,[1134118610 1134118610 1134118610 1134118610 1...,0,['<*> ddr error(s) detected and corrected on r...,[0. 0.109872 0.024515 0.034294 0.035576 ...
3,[1136277503 1136277503 1136277503 1136277503 1...,0,['<*> ddr error(s) detected and corrected on r...,[0. 0.115572 0.032737 0.016155 0. ...
4,[1120820379],0,['<*> Message=Invalid JtagId = ffffffff'],[0.]


In [38]:
df["Label"].value_counts()

Label
0    34297
1     3018
Name: count, dtype: int64

In [62]:
df_logkey = pd.read_csv(log_results_file)
df_content = pd.read_csv(log_results_content_file)

In [63]:
merged_df = pd.merge(df_logkey, df_content[["timestamp", "EventTemplate"]], on="timestamp", how="inner")
merged_df = merged_df[["timestamp", "EventId", "EventTemplate", "Label"]]

In [64]:
merged_df.head()

Unnamed: 0,timestamp,EventId,EventTemplate,Label
0,[1135617911 1135617913 1135617915 1135617915 1...,['3aa50e45' '3aa50e45' '3aa50e45' '3aa50e45' '...,['instruction cache parity error corrected'\n ...,0
1,[1127138101 1127138101 1127138101 1127138101 1...,['4983ff07' '4983ff07' '4983ff07' '4983ff07' '...,['ciod: LOGIN <*> failed: No such file or dire...,0
2,[1134118610 1134118610 1134118610 1134118610 1...,['30b3b946' '8df7ac9e' 'a450c390' 'a450c390' '...,['<*> ddr error(s) detected and corrected on r...,0
3,[1136277503 1136277503 1136277503 1136277503 1...,['30b3b946' '8df7ac9e' '30b3b946' '8df7ac9e' '...,['<*> ddr error(s) detected and corrected on r...,0
4,[1120820379],['8a1ae52c'],['<*> Message=Invalid JtagId = ffffffff'],0


In [65]:
merged_df.isnull().sum()

timestamp        0
EventId          0
EventTemplate    0
Label            0
dtype: int64

In [66]:
merged_df["Label"].value_counts()

Label
0    34297
1     3018
Name: count, dtype: int64

In [67]:
merged_df.to_csv(logkey_content_file, index=None)
print("Saved at : " + logkey_content_file)

Saved at : datasets/bgl/output/bgl_content_logkey.csv


### - Thunderbird

In [68]:
#PATH and VAR
raw_log = _path["tbird"] + "tbird.log"
sample_log = _path["tbird"] + "tbird_5M.log"
sample_window_size = 5*10**6
sample_step_size = 10**3
data_dir = _path["tbird"]
output_dir = _path["tbird"] + "output/"
log_file = "tbird_5M.log"
window_size = 1
step_size = 0.5
log_results_file = output_dir + "tbird_time_windowed_5M.csv"
log_results_content_file = output_dir + "tbird_time_windowed_5M_content.csv"
logkey_content_file = output_dir + "tbird_content_logkey.csv"

In [7]:
def sample_raw_data(data_file, output_file, sample_window_size, sample_step_size):
    # sample 1M by sliding window, abnormal rate is over 2%
    sample_data = []
    labels = []
    idx = 0

    # spirit dataset can start from the 2Mth line, as there are many abnormal lines gathering in the first 2M
    with open(data_file, 'r', errors='ignore') as f:
        for line in f:
            labels.append(line.split()[0] != '-')
            sample_data.append(line)

            if len(labels) == sample_window_size:
                abnormal_rate = sum(np.array(labels)) / len(labels)
                print(f"{idx + 1} lines, abnormal rate {abnormal_rate}")
                break

            idx += 1
            if idx % sample_step_size == 0:
                print(f"Process {round(idx/sample_window_size * 100,4)} % raw data", end='\r')

    with open(output_file, "w") as f:
        f.writelines(sample_data)

    print("Sampling done")

In [12]:
start = timer()
sample_raw_data(raw_log, sample_log, sample_window_size, sample_step_size)
end = timer()
print("time elapsed : {:.2f}s".format(end-start))

5000000 lines, abnormal rate 0.0452574
Sampling done
time elapsed : 18.66s


In [13]:
def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<Label> <Id> <Date> <Admin> <Month> <Day> <Time> <AdminAddr> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+',  # hexadecimal
        r'\d+\.\d+\.\d+\.\d+',
        r'(?<=Warning: we failed to resolve data source name )[\w\s]+',
        r'\d+'
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes

        # Drain is modified
        parser = Drain.LogParser(log_format,
                                 indir=input_dir,
                                 outdir=output_dir,
                                 depth=depth,
                                 st=st,
                                 rex=regex,
                                 keep_para=keep_para, maxChild=1000)
        parser.parse(log_file)

    elif parser_type == "spell":
        tau = 0.35
        parser = Spell.LogParser(indir=data_dir,
                                 outdir=output_dir,
                                 log_format=log_format,
                                 tau=tau,
                                 rex=regex,
                                 keep_para=keep_para)
        parser.parse(log_file)

In [14]:
parse_log(data_dir, output_dir, log_file, "drain")

Parsing file: datasets/tbird/tbird_5M.log
Total size after encoding is 5000000 5000000
Parsing done. [Time taken: 0:13:29.444664]


In [40]:
df = pd.read_csv(output_dir + "tbird_5M.log_structured.csv")

In [41]:
df["Label"] = df["Label"].apply(lambda x: int(x != "-"))
df['datetime'] = pd.to_datetime(df["Date"] + " " + df['Time'], format='%Y.%m.%d %H:%M:%S')
df['timestamp'] = df["datetime"].values.astype(np.int64) // 10 ** 9
df['deltaT'] = df['datetime'].diff() / np.timedelta64(1, 's')
df['deltaT'] = df['deltaT'].fillna(0)

In [17]:
start = timer()
window_df = sliding_window(df[["timestamp", "Label", "EventId", "deltaT"]], 
                           para={"window_size": float(1)*60, "step_size": float(0.5) * 60})
end = timer()
print("time elapsed : {:.2f}s".format(end-start))

there are 24708 instances (sliding windows) in this dataset

time elapsed : 23.52s


In [18]:
window_df.to_csv(log_results_file, index = None)

In [42]:
start = timer()
window_df = sliding_window(df[["timestamp", "Label", "EventTemplate", "deltaT"]], 
                           para={"window_size": float(1)*60, "step_size": float(0.5) * 60})
end = timer()
print("time elapsed : {:.2f}s".format(end-start))

there are 24708 instances (sliding windows) in this dataset

time elapsed : 33.62s


In [43]:
window_df.to_csv(log_results_content_file, index = None)

In [19]:
df = pd.read_csv(log_results_file)
df.head()

Unnamed: 0,timestamp,Label,EventId,deltaT
0,[1131813901 1131813901 1131813902 1131813902 1...,0,['e77fd980' 'e77fd980' '5ffb1ca0' 'bcf72cf0' '...,[0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. ...
1,[1131994772 1131994772 1131994772 1131994772 1...,0,['5ffb1ca0' 'bcf72cf0' '5ffb1ca0' 'bcf72cf0' '...,[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. ...
2,[1131538261 1131538262 1131538262 1131538264 1...,0,['1800ad2a' '7ececea2' '750f8375' 'e77fd980' '...,[0. 1. 0. 2. 1. 2. 0. 1. 4. 0. 1. 0. 0. 1. 1. ...
3,[1132261351 1132261351 1132261351 1132261351 1...,1,['5e3e6435' 'bcf72cf0' '5ffb1ca0' 'bcf72cf0' '...,[0. 0. 0. 0. 1. 2. 0. 0. 1. 0. 1. 0. 0. 1. 0. ...
4,[1131919651 1131919651 1131919651 1131919651 1...,1,['9e29cff3' '9e29cff3' '9e29cff3' '13fafa5b' '...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...


In [43]:
df["Label"].value_counts()

Label
0    5838
1     639
Name: count, dtype: int64

In [20]:
df["Label"].value_counts()

Label
1    13007
0    11701
Name: count, dtype: int64

In [44]:
df = pd.read_csv(log_results_content_file)
df.head()

Unnamed: 0,timestamp,Label,EventTemplate,deltaT
0,[1131813901 1131813901 1131813902 1131813902 1...,0,['/apps/x<*>_<*>/system/ganglia-<*>.<*>.<*>/sb...,[0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. ...
1,[1131994772 1131994772 1131994772 1131994772 1...,0,['Server Administrator: Instrumentation Servic...,[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. ...
2,[1131538261 1131538262 1131538262 1131538264 1...,0,['ntpd[<*>]: synchronized to <*> stratum <*>'\...,[0. 1. 0. 2. 1. 2. 0. 1. 4. 0. 1. 0. 0. 1. 1. ...
3,[1132261351 1132261351 1132261351 1132261351 1...,1,['Server Administrator: Instrumentation Servic...,[0. 0. 0. 0. 1. 2. 0. 0. 1. 0. 1. 0. 0. 1. 0. ...
4,[1131919651 1131919651 1131919651 1131919651 1...,1,['kernel: THH(<*>): <*> <*> Device in FATAL st...,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...


In [45]:
df["Label"].value_counts()

Label
1    13007
0    11701
Name: count, dtype: int64

In [69]:
df_logkey = pd.read_csv(log_results_file)
df_content = pd.read_csv(log_results_content_file)

In [70]:
merged_df = pd.merge(df_logkey, df_content[["timestamp", "EventTemplate"]], on="timestamp", how="inner")
merged_df = merged_df[["timestamp", "EventId", "EventTemplate", "Label"]]

In [72]:
merged_df.head()

Unnamed: 0,timestamp,EventId,EventTemplate,Label
0,[1131813901 1131813901 1131813902 1131813902 1...,['e77fd980' 'e77fd980' '5ffb1ca0' 'bcf72cf0' '...,['/apps/x<*>_<*>/system/ganglia-<*>.<*>.<*>/sb...,0
1,[1131994772 1131994772 1131994772 1131994772 1...,['5ffb1ca0' 'bcf72cf0' '5ffb1ca0' 'bcf72cf0' '...,['Server Administrator: Instrumentation Servic...,0
2,[1131538261 1131538262 1131538262 1131538264 1...,['1800ad2a' '7ececea2' '750f8375' 'e77fd980' '...,['ntpd[<*>]: synchronized to <*> stratum <*>'\...,0
3,[1132261351 1132261351 1132261351 1132261351 1...,['5e3e6435' 'bcf72cf0' '5ffb1ca0' 'bcf72cf0' '...,['Server Administrator: Instrumentation Servic...,1
4,[1131919651 1131919651 1131919651 1131919651 1...,['9e29cff3' '9e29cff3' '9e29cff3' '13fafa5b' '...,['kernel: THH(<*>): <*> <*> Device in FATAL st...,1


In [73]:
merged_df.isnull().sum()

timestamp        0
EventId          0
EventTemplate    0
Label            0
dtype: int64

In [75]:
merged_df["Label"].value_counts()

Label
1    13007
0    11701
Name: count, dtype: int64

In [76]:
merged_df.to_csv(logkey_content_file, index=None)
print("Saved at : " + logkey_content_file)

Saved at : datasets/tbird/output/tbird_content_logkey.csv


### SANDBOX

In [42]:
df = pd.read_csv(log_results_file)

In [43]:
df.head()

Unnamed: 0,timestamp,Label,EventId,deltaT
0,[1135617911 1135617913 1135617915 1135617915 1...,0,['3aa50e45' '3aa50e45' '3aa50e45' '3aa50e45' '...,[0. 1.601077 1.581271 0.577114 2.089764 ...
1,[1127138101 1127138101 1127138101 1127138101 1...,0,['4983ff07' '4983ff07' '4983ff07' '4983ff07' '...,[0.0000000e+00 1.3094000e-01 7.3317000e-02 1.5...
2,[1134118610 1134118610 1134118610 1134118610 1...,0,['30b3b946' '8df7ac9e' 'a450c390' 'a450c390' '...,[0. 0.109872 0.024515 0.034294 0.035576 ...
3,[1136277503 1136277503 1136277503 1136277503 1...,0,['30b3b946' '8df7ac9e' '30b3b946' '8df7ac9e' '...,[0. 0.115572 0.032737 0.016155 0. ...
4,[1120820379],0,['8a1ae52c'],[0.]


In [44]:
df_content = pd.read_csv(log_results_content_file)

In [45]:
df_content.head()

Unnamed: 0,timestamp,Label,EventTemplate,deltaT
0,[1135617911 1135617913 1135617915 1135617915 1...,0,['instruction cache parity error corrected'\n ...,[0. 1.601077 1.581271 0.577114 2.089764 ...
1,[1127138101 1127138101 1127138101 1127138101 1...,0,['ciod: LOGIN <*> failed: No such file or dire...,[0.0000000e+00 1.3094000e-01 7.3317000e-02 1.5...
2,[1134118610 1134118610 1134118610 1134118610 1...,0,['<*> ddr error(s) detected and corrected on r...,[0. 0.109872 0.024515 0.034294 0.035576 ...
3,[1136277503 1136277503 1136277503 1136277503 1...,0,['<*> ddr error(s) detected and corrected on r...,[0. 0.115572 0.032737 0.016155 0. ...
4,[1120820379],0,['<*> Message=Invalid JtagId = ffffffff'],[0.]


In [46]:
merged_df = pd.merge(df, df_content[["timestamp", "EventTemplate"]], on="timestamp", how="inner")
merged_df = merged_df[["timestamp", "EventId", "EventTemplate", "Label"]]

In [47]:
merged_df.head()

Unnamed: 0,timestamp,EventId,EventTemplate,Label
0,[1135617911 1135617913 1135617915 1135617915 1...,['3aa50e45' '3aa50e45' '3aa50e45' '3aa50e45' '...,['instruction cache parity error corrected'\n ...,0
1,[1127138101 1127138101 1127138101 1127138101 1...,['4983ff07' '4983ff07' '4983ff07' '4983ff07' '...,['ciod: LOGIN <*> failed: No such file or dire...,0
2,[1134118610 1134118610 1134118610 1134118610 1...,['30b3b946' '8df7ac9e' 'a450c390' 'a450c390' '...,['<*> ddr error(s) detected and corrected on r...,0
3,[1136277503 1136277503 1136277503 1136277503 1...,['30b3b946' '8df7ac9e' '30b3b946' '8df7ac9e' '...,['<*> ddr error(s) detected and corrected on r...,0
4,[1120820379],['8a1ae52c'],['<*> Message=Invalid JtagId = ffffffff'],0


In [32]:
merged_df.isnull().sum()

BlockId            0
EventSequence      0
ContentSequence    0
Label              0
dtype: int64