## Import Libraries

In [1]:
import pandas as pd
import os

## Paths

In [2]:
# Input/output locations for each log corpus.
# Every *_output_dir ends with "/" because downstream code builds paths by concatenation.

# --- HDFS ---
hdfs_output_dir = "datasets/hdfs/output/"
hdfs_file = "hdfs_labeled.csv"
hdfs_content_file = "hdfs_content_labeled.csv"
hdfs_dataset = hdfs_output_dir + hdfs_file
hdfs_template = hdfs_output_dir + "HDFS.log_templates.csv"
hdfs_dataset_dir = hdfs_output_dir + "dataset/"

# --- BGL ---
bgl_output_dir = "datasets/bgl/output/"
bgl_file = "bgl_time_windowed.csv"
bgl_content_file = "bgl_time_windowed_content.csv"
bgl_dataset = bgl_output_dir + bgl_file
bgl_template = bgl_output_dir + "BGL.log_templates.csv"
bgl_dataset_dir = bgl_output_dir + "dataset/"

# --- Thunderbird ---
tbird_output_dir = "datasets/tbird/output/"
tbird_file = "tbird_time_windowed_5M.csv"
tbird_content_file = "tbird_time_windowed_5M_content.csv"
# Single definition derived from tbird_file (the former second, hard-coded assignment
# produced the same value and could silently drift from tbird_file).
tbird_dataset = tbird_output_dir + tbird_file
tbird_template = tbird_output_dir + "tbird_5M.log_templates.csv"
tbird_dataset_dir = tbird_output_dir + "dataset/"

In [3]:
# Create the per-corpus "dataset/" folders that receive the generated splits.
# makedirs(exist_ok=True) makes the cell safe to re-run and also creates missing
# parent folders, while real failures (e.g. permission errors) are raised instead
# of being silently swallowed.
output_path = [hdfs_output_dir, bgl_output_dir, tbird_output_dir]
for path in output_path:
    os.makedirs(path + "dataset/", exist_ok=True)

## Generate Train, Test, and Eval Datasets

In [4]:
def _draw_split(pool, n_normal, n_anomaly, seed=20):
    """Sample `n_normal` rows with label 0 and `n_anomaly` rows with label 1 from `pool`.

    The original index labels are kept (no re-indexing) so the caller can remove
    the drawn rows from `pool` before drawing the next split.
    """
    normal_seq = pool[pool["label"] == 0].sample(n_normal, random_state=seed)
    anomaly_seq = pool[pool["label"] == 1].sample(n_anomaly, random_state=seed)
    return pd.concat([normal_seq, anomaly_seq], ignore_index=False, sort=False)


def _report_and_save(split_df, title, output_dir, prefix, log_file):
    """Print the label counts of `split_df` and write it to `output_dir` as `prefix + log_file`."""
    print("\n===== " + title + " =====")
    print(split_df['label'].value_counts())
    # os.path.join accepts directories with or without a trailing separator.
    target = os.path.join(output_dir, prefix + log_file)
    split_df.to_csv(target, index=False)
    print('saved to : ' + target)


def generate_train_test(log_dir, log_file, output_dir, mode=None, content=False, n=5000, test_n=1000):
    """Build disjoint train / eval / test CSV files from one labeled log CSV.

    The input is reduced to two columns named ``text`` and ``label``. The source
    of ``text`` depends on the arguments:

    * ``mode == 'hdfs'``: ``ContentSequence`` if ``content`` else ``EventSequence``
    * any other ``mode``: ``EventTemplate`` if ``content`` else ``EventId``

    ``label`` always comes from the ``Label`` column.

    Splits are drawn with a fixed seed (``random_state=20``), each from the rows
    left over by the previous one:

    * train: ``n`` rows with label 0 and ``test_n`` rows with label 1
    * eval:  ``test_n`` rows of each label
    * test:  ``test_n`` rows of each label

    Parameters
    ----------
    log_dir : str
        Directory containing ``log_file`` (trailing separator optional).
    log_file : str
        Name of the input CSV; also used as the suffix of the output file names.
    output_dir : str
        Existing directory that receives ``train_<log_file>``, ``eval_<log_file>``
        and ``test_<log_file>`` (trailing separator optional).
    mode : str or None
        ``'hdfs'`` selects the HDFS column names; anything else selects the
        ``EventId`` / ``EventTemplate`` columns.
    content : bool
        Selects the content/template column instead of the event-ID column.
    n : int
        Number of label-0 rows in the training split.
    test_n : int
        Number of label-1 rows in the training split, and of rows per label in
        the eval and test splits.

    Returns
    -------
    None
        Results are written to disk and summarized on stdout.

    Raises
    ------
    KeyError
        If the selected text column or ``Label`` is missing from the CSV.
    ValueError
        If the CSV has fewer than ``n + 2 * test_n`` label-0 rows or fewer than
        ``3 * test_n`` label-1 rows. Checked before any file is written so a
        failure never leaves a partial set of split files behind.
    """
    if mode == 'hdfs':
        text_col = 'ContentSequence' if content else 'EventSequence'
    else:
        text_col = 'EventTemplate' if content else 'EventId'

    df = pd.read_csv(os.path.join(log_dir, log_file))
    # One non-inplace rename on the selected columns; avoids mutating a slice.
    df = df[[text_col, 'Label']].rename(columns={text_col: 'text', 'Label': 'label'})
    print("===== Original Dataset =====")
    print(df['label'].value_counts())

    # Total rows each label must provide across the three successive draws.
    required = {0: n + 2 * test_n, 1: 3 * test_n}
    for label_value, needed in required.items():
        available = int((df['label'] == label_value).sum())
        if available < needed:
            raise ValueError(
                "not enough rows with label {}: need {}, found {}".format(
                    label_value, needed, available
                )
            )

    # train df
    train_df = _draw_split(df, n, test_n)
    _report_and_save(train_df, "Training Dataset", output_dir, 'train_', log_file)

    # eval df: drawn only from rows not used for training
    df_eval = df.drop(train_df.index)
    eval_df = _draw_split(df_eval, test_n, test_n)
    _report_and_save(eval_df, "Evaluate Dataset", output_dir, 'eval_', log_file)

    # test df: drawn only from rows not used for training or evaluation
    df_test = df_eval.drop(eval_df.index)
    test_df = _draw_split(df_test, test_n, test_n)
    _report_and_save(test_df, "Testing Dataset", output_dir, 'test_', log_file)
    
    

### HDFS

In [5]:
# HDFS splits built from the `EventSequence` column (mode='hdfs', content=False),
# using the default sizes n=5000 and test_n=1000.
generate_train_test(
    log_dir=hdfs_output_dir,
    log_file=hdfs_file,
    output_dir=hdfs_dataset_dir,
    mode='hdfs',
)

===== Original Dataset =====
label
0    558223
1     16838
Name: count, dtype: int64

===== Training Dataset =====
label
0    5000
1    1000
Name: count, dtype: int64
saved to : datasets/hdfs/output/dataset/train_hdfs_labeled.csv

===== Evaluate Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/hdfs/output/dataset/eval_hdfs_labeled.csv

===== Testing Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/hdfs/output/dataset/test_hdfs_labeled.csv


In [6]:
# HDFS splits built from the `ContentSequence` column (mode='hdfs', content=True).
generate_train_test(
    log_dir=hdfs_output_dir,
    log_file=hdfs_content_file,
    output_dir=hdfs_dataset_dir,
    mode='hdfs',
    content=True,
)

===== Original Dataset =====
label
0    558223
1     16838
Name: count, dtype: int64

===== Training Dataset =====
label
0    5000
1    1000
Name: count, dtype: int64
saved to : datasets/hdfs/output/dataset/train_hdfs_content_labeled.csv

===== Evaluate Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/hdfs/output/dataset/eval_hdfs_content_labeled.csv

===== Testing Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/hdfs/output/dataset/test_hdfs_content_labeled.csv


### BGL

In [7]:
# BGL splits built from the `EventId` column (default mode, content=False).
generate_train_test(
    log_dir=bgl_output_dir,
    log_file=bgl_file,
    output_dir=bgl_dataset_dir,
)

===== Original Dataset =====
label
0    34297
1     3018
Name: count, dtype: int64

===== Training Dataset =====
label
0    5000
1    1000
Name: count, dtype: int64
saved to : datasets/bgl/output/dataset/train_bgl_time_windowed.csv

===== Evaluate Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/bgl/output/dataset/eval_bgl_time_windowed.csv

===== Testing Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/bgl/output/dataset/test_bgl_time_windowed.csv


In [8]:
# BGL splits built from the `EventTemplate` column (default mode, content=True).
generate_train_test(
    log_dir=bgl_output_dir,
    log_file=bgl_content_file,
    output_dir=bgl_dataset_dir,
    content=True,
)

===== Original Dataset =====
label
0    34297
1     3018
Name: count, dtype: int64

===== Training Dataset =====
label
0    5000
1    1000
Name: count, dtype: int64
saved to : datasets/bgl/output/dataset/train_bgl_time_windowed_content.csv

===== Evaluate Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/bgl/output/dataset/eval_bgl_time_windowed_content.csv

===== Testing Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/bgl/output/dataset/test_bgl_time_windowed_content.csv


### Thunderbird

In [9]:
# Thunderbird splits built from the `EventId` column (default mode, content=False).
generate_train_test(
    log_dir=tbird_output_dir,
    log_file=tbird_file,
    output_dir=tbird_dataset_dir,
)

===== Original Dataset =====
label
1    13007
0    11701
Name: count, dtype: int64

===== Training Dataset =====
label
0    5000
1    1000
Name: count, dtype: int64
saved to : datasets/tbird/output/dataset/train_tbird_time_windowed_5M.csv

===== Evaluate Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/tbird/output/dataset/eval_tbird_time_windowed_5M.csv

===== Testing Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/tbird/output/dataset/test_tbird_time_windowed_5M.csv


In [10]:
# Thunderbird splits built from the `EventTemplate` column (default mode, content=True).
generate_train_test(
    log_dir=tbird_output_dir,
    log_file=tbird_content_file,
    output_dir=tbird_dataset_dir,
    content=True,
)

===== Original Dataset =====
label
1    13007
0    11701
Name: count, dtype: int64

===== Training Dataset =====
label
0    5000
1    1000
Name: count, dtype: int64
saved to : datasets/tbird/output/dataset/train_tbird_time_windowed_5M_content.csv

===== Evaluate Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/tbird/output/dataset/eval_tbird_time_windowed_5M_content.csv

===== Testing Dataset =====
label
0    1000
1    1000
Name: count, dtype: int64
saved to : datasets/tbird/output/dataset/test_tbird_time_windowed_5M_content.csv


## SANDBOX

### SANDBOX 1: Generate Train, Eval, and Test Datasets Step by Step (HDFS)

In [4]:
# Load the CSV referenced by `hdfs_dataset` and show how many rows carry each `Label` value.
df = pd.read_csv(hdfs_dataset)
df["Label"].value_counts()

Label
0    558223
1     16838
Name: count, dtype: int64

In [5]:
# Discard the `BlockId` column; it is not part of the exported splits.
df = df.drop(columns=["BlockId"])
df.head()

Unnamed: 0,EventSequence,Label
0,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
1,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
2,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",1
3,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
4,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0


In [6]:
# Rename both columns to the `text` / `label` names in a single call.
df.rename(columns={'EventSequence': 'text', 'Label': 'label'}, inplace=True)
df.head()

Unnamed: 0,text,label
0,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
1,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
2,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",1
3,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
4,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0


#### Training Dataset

In [7]:
# Training normals: 5000 rows with label 0, sampled with a fixed seed.
normal_seq = df.loc[df["label"].eq(0)].sample(n=5000, random_state=1)
normal_seq

Unnamed: 0,text,label
302956,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
270626,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
309653,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
554173,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
305876,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
...,...,...
380008,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
508484,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
370913,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
485153,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0


In [8]:
# Training anomalies: 1000 rows with label 1, sampled with a fixed seed.
anomaly_seq = df.loc[df["label"].eq(1)].sample(n=1000, random_state=1)
anomaly_seq

Unnamed: 0,text,label
446751,"['9b7aa7a3', '9b7aa7a3', '81358cb3', 'd6115493']",1
374508,"['9b7aa7a3', '81358cb3']",1
14285,"['81358cb3', '9b7aa7a3']",1
435425,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
314894,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",1
...,...,...
393226,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",1
440391,"['9b7aa7a3', '9b7aa7a3', '81358cb3', 'd6115493']",1
359151,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",1
223908,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",1


In [9]:
# Stack normals and anomalies; the original index labels are kept (concat defaults)
# so these rows can later be dropped from `df` by index.
hdfs_df = pd.concat([normal_seq, anomaly_seq])
hdfs_df

Unnamed: 0,text,label
302956,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
270626,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
309653,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
554173,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
305876,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
...,...,...
393226,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",1
440391,"['9b7aa7a3', '9b7aa7a3', '81358cb3', 'd6115493']",1
359151,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",1
223908,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",1


In [10]:
# Label counts of the combined training sample.
hdfs_df["label"].value_counts()

label
0    5000
1    1000
Name: count, dtype: int64

In [11]:
# Write the training sample without the index column.
hdfs_df.to_csv(hdfs_output_dir + "dataset/hdfs_dataset_training.csv", index=False)

#### Evaluation Dataset

In [12]:
# Remaining pool for evaluation: every row of `df` whose index is not in the training sample.
df_eval = df.drop(index=hdfs_df.index)
df_eval["label"].value_counts()

label
0    553223
1     15838
Name: count, dtype: int64

In [13]:
# Evaluation normals: 1000 rows with label 0 from the remaining pool.
normal_eval = df_eval.loc[df_eval["label"].eq(0)].sample(n=1000, random_state=1)
normal_eval

Unnamed: 0,text,label
161678,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",0
539711,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
38127,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",0
424086,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
242162,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",0
...,...,...
463056,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
29838,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
17471,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
395077,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0


In [14]:
# Evaluation anomalies: 1000 rows with label 1 from the remaining pool.
anomaly_eval = df_eval.loc[df_eval["label"].eq(1)].sample(n=1000, random_state=1)
anomaly_eval

Unnamed: 0,text,label
139968,"['9b7aa7a3', '81358cb3', '9b7aa7a3', 'd6115493']",1
195383,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
150642,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",1
21575,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
19226,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",1
...,...,...
224799,"['9b7aa7a3', '81358cb3']",1
365867,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
320036,"['81358cb3', '9b7aa7a3', '9b7aa7a3', 'd6115493']",1
453834,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1


In [15]:
# Stack evaluation normals and anomalies, keeping the original index labels (concat defaults).
hdfs_eval = pd.concat([normal_eval, anomaly_eval])
hdfs_eval

Unnamed: 0,text,label
161678,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",0
539711,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
38127,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",0
424086,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
242162,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",0
...,...,...
224799,"['9b7aa7a3', '81358cb3']",1
365867,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
320036,"['81358cb3', '9b7aa7a3', '9b7aa7a3', 'd6115493']",1
453834,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1


In [16]:
# Label counts of the combined evaluation sample.
hdfs_eval["label"].value_counts()

label
0    1000
1    1000
Name: count, dtype: int64

In [17]:
# Write the evaluation sample without the index column.
hdfs_eval.to_csv(hdfs_output_dir + "dataset/hdfs_dataset_evaluate.csv", index=False)

#### Test Dataset

In [18]:
# Remaining pool for testing: the evaluation pool minus the rows sampled for evaluation.
df_test = df_eval.drop(index=hdfs_eval.index)
df_test["label"].value_counts()

label
0    552223
1     14838
Name: count, dtype: int64

In [19]:
# Test normals: 1000 rows with label 0 from the remaining pool.
normal_test = df_test.loc[df_test["label"].eq(0)].sample(n=1000, random_state=1)
normal_test

Unnamed: 0,text,label
317296,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
318352,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
157056,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
465104,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
348242,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
...,...,...
432333,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
349605,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
541168,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
306591,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",0


In [20]:
# Test anomalies: 1000 rows with label 1 from the remaining pool.
anomaly_test = df_test.loc[df_test["label"].eq(1)].sample(n=1000, random_state=1)
anomaly_test

Unnamed: 0,text,label
132340,"['81358cb3', '9b7aa7a3', '9b7aa7a3', 'd6115493']",1
18906,"['9b7aa7a3', '81358cb3']",1
438434,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",1
83924,"['81358cb3', '9b7aa7a3', '9b7aa7a3', 'd6115493']",1
478666,"['9b7aa7a3', '81358cb3']",1
...,...,...
11499,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",1
254986,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
358185,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
73889,"['81358cb3', '9b7aa7a3', '9b7aa7a3', 'd6115493']",1


In [21]:
# Stack test normals and anomalies, keeping the original index labels (concat defaults).
hdfs_test = pd.concat([normal_test, anomaly_test])
hdfs_test

Unnamed: 0,text,label
317296,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
318352,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3...",0
157056,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
465104,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3...",0
348242,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",0
...,...,...
11499,"['81358cb3', '9b7aa7a3', '9b7aa7a3', '9b7aa7a3...",1
254986,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
358185,"['9b7aa7a3', '9b7aa7a3', '9b7aa7a3', '81358cb3...",1
73889,"['81358cb3', '9b7aa7a3', '9b7aa7a3', 'd6115493']",1


In [22]:
# Label counts of the combined test sample.
hdfs_test["label"].value_counts()

label
0    1000
1    1000
Name: count, dtype: int64

In [23]:
# Write the test sample without the index column.
hdfs_test.to_csv(hdfs_output_dir + "dataset/hdfs_dataset_test.csv", index=False)