In [None]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import RegexpTokenizer

import os
import sys

sys.path.append("../../data_preprocess/")

import pickle
import re
from utils import json_pretty_dump, word2VecContinueLearning, trainWord2VecModelType2, tokenizeData, convertWord2Vec, text_cleansing, parse_datetime, trainWord2VecModel, parse_month
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG so NumPy-driven randomness is repeatable across runs.
seed = 42
np.random.seed(seed)

# Per-host directory names under Drain_result/ (same order as `webs`).
spec_dir = [
    "AIT_auth_mail.cup.com",
    "AIT_auth_mail.insect.com",
    "AIT_auth_mail.onion.com",
    "AIT_auth_mail.spiral.com",
]

# Host names used for the label directory and the output directory.
webs = [
    "mail.cup.com",
    "mail.insect.com",
    "mail.onion.com",
    "mail.spiral.com",
]

# This notebook only processes the first host of each list.
target_spec = spec_dir[0]
target_web = webs[0]

# Input locations and split settings.
params = dict(
    log_file=f"../../Drain_result/{target_spec}/auth.log_structured.csv",
    template_file=f"../../Drain_result/{target_spec}/auth.log_templates.csv",
    label_file=f"../../AIT-LDS-v1_1/labels/{target_web}/auth.log",
    test_ratio=0.2,
    train_anomaly_ratio=0.0,
    train_word2Vec=True,
)

# The output folder name encodes the train anomaly ratio, e.g. "ait_auth_0.0_tar".
data_name = f'ait_auth_{params["train_anomaly_ratio"]}_tar'

# Final output directory: <processed root>/<host>/<data_name>; created if missing.
data_dir = os.path.join(f"../processed/ait_auth_preprocessed/{target_web}", data_name)
os.makedirs(data_dir, exist_ok=True)

In [None]:
# Read the structured log and the label file with identical reader options.
# na_filter=False keeps empty fields as "" instead of converting them to NaN.
csv_options = {"engine": "c", "na_filter": False, "memory_map": True}
struct_log = pd.read_csv(params["log_file"], **csv_options)
label_file = pd.read_csv(params["label_file"], **csv_options)


In [None]:
def _to_binary_labels(raw_labels):
    """Map a raw label column to a list of 0/1 ints.

    A value counts as normal (0) when it equals the number 0 or the string "0";
    every other value becomes 1.
    """
    is_normal = (raw_labels == 0) | (raw_labels == "0")
    return (~is_normal).astype(int).tolist()


# Labels are attached to the log by position, so both files must have the
# same number of rows; fail early with a readable message otherwise.
if len(label_file) != len(struct_log):
    raise ValueError(
        f"label file has {len(label_file)} rows but the structured log has "
        f"{len(struct_log)} rows; labels cannot be aligned by position"
    )

# Vectorised replacement for a row-by-row iterrows() loop; the results are kept
# as plain lists so the assignment below stays positional (index-agnostic).
time_labels = _to_binary_labels(label_file["time_label"])
line_labels = _to_binary_labels(label_file["line_label"])
struct_log["time_label"] = time_labels
struct_log["line_label"] = line_labels

In [None]:
# Three independent, initially empty containers (train tokens, test tokens,
# event vectors). None of the cells shown in this notebook fills them.
eventTemplateTokenTrain, eventTemplateTokenTest, eventVectors = [], [], []

In [None]:
# Replace every "Month" value with parse_month(value) in a single pass.
# A per-row `.loc` write inside iterrows() mutates the frame while it is being
# iterated, is orders of magnitude slower, and writes values of a different
# type into the existing column one cell at a time; Series.map builds the new
# column at once.
# NOTE(review): the resulting dtype is inferred from what parse_month returns
# (presumably an integer month number - confirm against utils.parse_month).
struct_log["Month"] = struct_log["Month"].map(parse_month)

In [None]:
# Random split of the labelled log; reuse the notebook-wide `seed` instead of
# repeating the literal so the split follows the configured seed.
train_data, test_data = train_test_split(
    struct_log, test_size=params["test_ratio"], random_state=seed
)

# Keep only rows where both labels are 0 in the training split.
# NOTE(review): params["train_anomaly_ratio"] is not consulted here; dropping
# every labelled row matches its current value of 0.0 only.
is_normal = (train_data["time_label"] == 0) & (train_data["line_label"] == 0)

# Explicit copies make both splits independent frames, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
train_data = train_data.loc[is_normal].copy()
test_data = test_data.copy()

In [None]:
# Order both splits by the same key columns (Month, then Day, then Time).
sort_keys = ["Month", "Day", "Time"]
train_data = train_data.sort_values(by=sort_keys)
test_data = test_data.sort_values(by=sort_keys)

In [None]:
# Run text_cleansing over each template string and store the result in a new
# column; the same step is applied to the train split first, then the test split.
for split_frame in (train_data, test_data):
    split_frame["EventTemplateIdent_cleansed"] = split_frame.EventTemplateIdent.map(text_cleansing)

In [None]:
# A token is either one uppercase letter followed by lowercase letters, or,
# failing that, a run of word characters.
tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|\w+')

# Tokenise the cleansed template of every row; one token list per row, assigned
# by position, for the train split and then the test split.
for split_frame in (train_data, test_data):
    split_frame["Token"] = list(map(tokenizer.tokenize, split_frame["EventTemplateIdent_cleansed"]))

In [None]:
# Train Word2Vec on the training tokens only when the config flag asks for it.
# The "train_word2Vec" entry in `params` was previously never read; it defaults
# to True here so the existing behaviour is unchanged.
# NOTE(review): the return value is discarded - presumably the helper persists
# the model under the name "auth_word2Vec"; confirm in utils.
if params.get("train_word2Vec", True):
    trainWord2VecModelType2(train_data["Token"], "auth_word2Vec")

In [None]:
# Persist both splits as pickles inside data_dir (train first, then test).
for file_name, split_frame in (("train_set.pkl", train_data), ("test_set.pkl", test_data)):
    with open(os.path.join(data_dir, file_name), "wb") as fw:
        pickle.dump(split_frame, fw)