In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import RegexpTokenizer

import os
import sys

sys.path.append("../../data_preprocess/")

import pickle
import re
from utils import json_pretty_dump, word2VecContinueLearning, trainWord2VecModelType2, tokenizeData, convertWord2Vec, text_cleansing, parse_datetime
# Reproducibility: fix the state of NumPy's global random generator.
seed = 42
np.random.seed(seed)

# Directory that receives the pickled artefacts written at the end of the
# notebook; created up front (including parents) if it does not exist yet.
data_dir = "../processed_type2/thunderbird_preprocessed_type2"
os.makedirs(data_dir, exist_ok=True)

# CSV inputs loaded by the next cell: the structured log lines and the log
# templates (as the file names indicate).
params = dict(
    struct_file="../../Drain_result/thunderbird_10M.log_structured.csv",
    template_file="../../Drain_result/thunderbird_10M.log_templates.csv",
)

In [None]:
# Read both input tables named in `params` (structured log first, then templates).
struct_log, template_log = (
    pd.read_csv(params[key]) for key in ("struct_file", "template_file")
)

In [None]:
# Binarise the label column: a raw label of "-" becomes 0, any other value 1.
# The dtype guard keeps this cell idempotent: once the column holds integers,
# comparing them with "-" again would turn every row into 1.
if not pd.api.types.is_integer_dtype(struct_log["Label"]):
    struct_log["Label"] = struct_log["Label"].ne("-").astype("int64")

# Order rows by date, then time; the split-date lookup in the next cell takes
# the first labelled row in this order.
struct_log = struct_log.sort_values(["Date", "Time"])

In [None]:
# Date of the first row (in the current row order) whose Label is 1; the next
# cell uses it as the train/test boundary.
# For a per-day count of such rows, inspect
# struct_log[struct_log["Label"] == 1].Date.value_counts().sort_index().
anomaly_mask = struct_log["Label"] == 1
split_date = struct_log.loc[anomaly_mask, "Date"].values[0]

In [None]:
# Date-based split: rows dated before `split_date` form the training set, rows
# dated on or after it form the test set.
# Explicit copies make both frames independent of `struct_log`, so the later
# cell that adds a "Token" column writes to real frames instead of to
# selections of `struct_log` (which raises SettingWithCopyWarning).
train_set = struct_log[struct_log["Date"] < split_date].copy()
test_set = struct_log[struct_log["Date"] >= split_date].copy()

# Distinct event ids seen in each split; used to select the matching templates.
eventId_train = train_set["EventId"].unique()
eventId_test = test_set["EventId"].unique()

In [None]:
# For each split, keep the templates whose EventId occurs in that split,
# renumber the rows from 0 (the previous index is kept as an "index" column),
# and add a cleansed copy of the template text produced by `text_cleansing`.
template_log_train = (
    template_log.loc[template_log["EventId"].isin(eventId_train)]
    .reset_index()
    .assign(
        EventTemplateIdent_cleansed=lambda frame: frame["EventTemplateIdent"].map(text_cleansing)
    )
)
template_log_test = (
    template_log.loc[template_log["EventId"].isin(eventId_test)]
    .reset_index()
    .assign(
        EventTemplateIdent_cleansed=lambda frame: frame["EventTemplateIdent"].map(text_cleansing)
    )
)

In [None]:
# Cleansed template text of each split as a plain Python list, one entry per
# template row, with values cast to str first.
cleansed_column = "EventTemplateIdent_cleansed"
template_log_train_list = template_log_train[cleansed_column].astype("str").to_list()
template_log_test_list = template_log_test[cleansed_column].astype("str").to_list()

In [None]:
# Token pattern: at each position a capitalised word ([A-Z][a-z]+) is tried
# first, otherwise any run of \w characters is taken.
tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|\w+')


def _tokenize_and_attach(sentences, template_frame, log_frame):
    """Tokenise template sentences and attach the tokens to both frames.

    Adds an "EventTemplateIdent_token" column to ``template_frame`` and a
    "Token" column to ``log_frame``; both frames are modified in place.

    Returns:
        A pair ``(token_lists, tokens_by_event)``: the list of token lists
        (one per sentence) and the dict mapping EventId to its token list.
    """
    token_lists = [tokenizer.tokenize(sentence) for sentence in sentences]
    # The Series gets a default 0..n-1 index and is aligned with the frame's
    # index on assignment.
    template_frame["EventTemplateIdent_token"] = pd.Series(token_lists)

    id_token_pairs = template_frame[["EventId", "EventTemplateIdent_token"]].values
    tokens_by_event = {
        event_id: event_tokens
        for event_id, event_tokens in tqdm(id_token_pairs, desc="Mapping ID & token")
    }
    # Plain dict indexing: an EventId without a template raises KeyError.
    log_frame["Token"] = log_frame["EventId"].map(
        lambda event_id: tokens_by_event[event_id]
    )
    return token_lists, tokens_by_event


token_train_list, map_token_train = _tokenize_and_attach(
    template_log_train_list, template_log_train, train_set
)
token_test_list, map_token_test = _tokenize_and_attach(
    template_log_test_list, template_log_test, test_set
)

In [None]:
# Hand only the training-split template tokens to the project helper.
# NOTE(review): the second argument looks like the name under which the
# helper stores the model -- confirm against utils.trainWord2VecModelType2.
word2vec_model_name = "thunderbird_word2Vec"
trainWord2VecModelType2(token_train_list, word2vec_model_name)

In [None]:
# Persist the four result frames as pickle files inside `data_dir`, in the
# order listed (dicts preserve insertion order).
pickle_targets = {
    "train_set.pkl": train_set,
    "test_set.pkl": test_set,
    "template_train_set.pkl": template_log_train,
    "template_test_set.pkl": template_log_test,
}
for file_name, frame in pickle_targets.items():
    with open(os.path.join(data_dir, file_name), "wb") as handle:
        pickle.dump(frame, handle)