In [7]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import RegexpTokenizer

import os
import sys

sys.path.append("../../data_preprocess/")

import pickle
import re
from utils import json_pretty_dump, word2VecContinueLearning, trainWord2VecModelType2, tokenizeData, convertWord2Vec, text_cleansing, parse_datetime, parse_month
# Fix the NumPy global RNG so repeated runs are reproducible.
seed = 42
np.random.seed(seed)

# Hosts for which preprocessed mail logs and label directories are expected to exist.
label_dir = [
    "mail.cup.com",
    "mail.insect.com",
    "mail.onion.com",
    "mail.spiral.com",
]

# Host processed by this run. Choose any entry of `label_dir`; every path built in
# this cell is derived from it, so this is the only line to edit here.
# The Word2Vec model name passed to trainWord2VecModelType2 further down is still
# hard-coded and has to be changed separately.
target = label_dir[0]

# Output directory for the pickled train/test sets of the chosen host.
data_dir = f"../processed_type2/AIT_preprocessed_type2/mail/{target}"

# Input files: Drain structured log, Drain templates, and the per-line labels.
params = {
    "struct_file": f"../../Drain_result/AIT_mail_{target}/mail.log_structured.csv",
    "template_file": f"../../Drain_result/AIT_mail_{target}/mail.log_templates.csv",
    # NOTE(review): this points at the apache2 access-log labels although the
    # structured file comes from mail.log -- confirm this is the intended label file.
    "label_file": f"../../AIT-LDS-v1_1/labels/{target}/apache2/{target}-access.log",
}

os.makedirs(data_dir, exist_ok=True)

In [8]:
# Read the three CSV inputs configured in `params`.
struct_log = pd.read_csv(params["struct_file"])
template_log = pd.read_csv(params["template_file"])
labels = pd.read_csv(params["label_file"])

# Copy both label columns onto the structured log. pandas aligns the values on
# the row index, so the two files are assumed to list lines in the same order
# -- TODO confirm they have the same number of rows.
for label_column in ("time_label", "line_label"):
    struct_log[label_column] = labels[label_column]

In [9]:
# Convert every Month value with parse_month in a single vectorised pass.
# This replaces a per-row iterrows()/.loc assignment, which performs one
# expensive indexed write per log line. pandas infers the resulting dtype
# from the values parse_month returns.
struct_log["Month"] = struct_log["Month"].map(parse_month)

In [10]:
# Order the log lines chronologically: by month, then day, then time of day.
struct_log = struct_log.sort_values(by=["Month", "Day", "Time"])

In [11]:
# A row is considered labelled when neither label equals "0". The comparison is
# done on the string form so it gives the same answer whether read_csv parsed
# the label values as integers or as strings; comparing integers against the
# string "0" directly would mark every row as labelled.
has_line_label = struct_log["line_label"].astype(str) != "0"
has_time_label = struct_log["time_label"].astype(str) != "0"
labelled_rows = struct_log[has_line_label & has_time_label]

# Without any labelled row there is no split point; fail with an explicit
# message instead of the bare IndexError that .iloc[0] would raise.
if labelled_rows.empty:
    raise ValueError(
        "No row with non-zero line_label and time_label was found; "
        "cannot determine the train/test split point."
    )

# First labelled row in the current row order of struct_log; it is used as the
# train/test boundary.
split_date = labelled_rows.iloc[0]

In [12]:
# Columns the split frames are expected to carry, in output order.
split_columns = ["LineId", "Month", "Day", "Time", "Type", "Job", "Access", "Content", "EventId", "EventTemplate", "EventTemplateIdent", "ParameterList", "time_label", "line_label"]

# Start from empty frames so both names exist even if no split is performed.
train_set = pd.DataFrame(columns=split_columns)
test_set = pd.DataFrame(columns=split_columns)

# split_date is expected to be a single row of struct_log, so its length is
# the number of fields it carries.
if len(split_date) > 0:
    # Lexicographic (Month, Day, Time) comparison against the split row,
    # evaluated for all rows at once instead of concatenating one row at a time
    # (which copies the growing frame on every iteration).
    same_month = struct_log["Month"] == split_date["Month"]
    same_day = struct_log["Day"] == split_date["Day"]
    is_before_split = (
        (struct_log["Month"] < split_date["Month"])
        | (same_month & (struct_log["Day"] < split_date["Day"]))
        | (same_month & same_day & (struct_log["Time"] < split_date["Time"]))
    )

    # Declared columns first, followed by any further struct_log columns,
    # which is the column order the row-wise concat produced.
    output_columns = split_columns + [
        column for column in struct_log.columns if column not in split_columns
    ]

    # Rows strictly before the split row go to training; the split row itself
    # and everything not strictly before it go to testing. Row order is kept
    # and both frames get a fresh 0..n-1 index.
    train_set = (
        struct_log.loc[is_before_split]
        .reindex(columns=output_columns)
        .reset_index(drop=True)
    )
    test_set = (
        struct_log.loc[~is_before_split]
        .reindex(columns=output_columns)
        .reset_index(drop=True)
    )
            

In [13]:
# Show the training split first, then the test split.
for split_frame in (train_set, test_set):
    print(split_frame)

        LineId Month Day      Time    Type      Job        Access  \
0            1     2  29  00:00:12  mail-0  dovecot    imap-login   
1            2     2  29  00:00:12  mail-0  dovecot   imap(karri)   
2            3     2  29  00:00:14  mail-0  dovecot    imap-login   
3            4     2  29  00:00:14  mail-0  dovecot   imap(karri)   
4            5     2  29  00:00:14  mail-0  dovecot   imap(karri)   
...        ...   ...  ..       ...     ...      ...           ...   
115438  115439     3   5  15:31:32    mail  dovecot  imap(bertie)   
115439  115440     3   5  15:31:32    mail  dovecot  imap(bertie)   
115440  115441     3   5  15:31:39    mail  dovecot    imap-login   
115441  115442     3   5  15:31:39    mail  dovecot   imap(karri)   
115442  115443     3   5  15:31:39    mail  dovecot   imap(karri)   

                                                  Content EventId  \
0       Login: user=<karri>, method=PLAIN, rip=127.0.0...      E1   
1                                

In [14]:
# Distinct event ids occurring in each split, in order of first appearance.
eventId_train, eventId_test = (
    frame["EventId"].unique() for frame in (train_set, test_set)
)

In [15]:
# Keep only the templates whose EventId occurs in the respective split.
# Explicit copies are taken so that adding columns below writes to independent
# frames rather than to a filtered slice of template_log (which triggers
# SettingWithCopyWarning).
template_log_train = template_log[template_log["EventId"].isin(eventId_train)].copy()
template_log_test = template_log[template_log["EventId"].isin(eventId_test)].copy()

# Cleansed version of each training template, produced by text_cleansing.
template_log_train["EventTemplateIdent_cleansed"] = template_log_train["EventTemplateIdent"].map(text_cleansing)

In [16]:
# Cleansed training templates as a plain list of strings, in row order.
cleansed_templates = template_log_train["EventTemplateIdent_cleansed"]
template_log_train_list = cleansed_templates.astype('str').tolist()

In [17]:
# Split each cleansed template into tokens: a capitalised word
# (one upper-case letter followed by lower-case letters) or any run of word characters.
tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|\w+')
token_train_list = [tokenizer.tokenize(sen) for sen in template_log_train_list]

# token_train_list is in the row order of template_log_train, so attach it with
# that frame's own index. A bare pd.Series(...) would carry a 0..n-1 index and
# be aligned by label, leaving NaN or mismatched tokens whenever the filtered
# templates are not exactly the first n rows of template_log.
template_log_train["EventTemplateIdent_token"] = pd.Series(
    token_train_list, index=template_log_train.index, dtype=object
)

# EventId -> token list lookup, keeping the first row for an EventId if it
# appears more than once.
tokens_by_event = {}
for event_id, tokens in zip(
    template_log_train["EventId"], template_log_train["EventTemplateIdent_token"]
):
    tokens_by_event.setdefault(event_id, tokens)

# Attach the token list of its template to every training log line. A lookup of
# an EventId without a template raises KeyError rather than passing silently.
train_set["Token"] = train_set["EventId"].map(
    lambda event_id: tokens_by_event[event_id]
)

In [19]:
# Name handed to trainWord2VecModelType2 for this host's model. To process another
# host, replace mail.cup.com with one of
# [mail.cup.com, mail.insect.com, mail.onion.com, mail.spiral.com],
# i.e. AIT_mail_<Target>_word2Vec.
word2vec_model_name = "AIT_mail_mail.cup.com_word2Vec"
trainWord2VecModelType2(token_train_list, word2vec_model_name)

start train word2Vec model. . . . .
finish train word2Vec model . . . . . ^^


In [20]:
# Objects to persist under data_dir, keyed by output file name.
pickle_targets = {
    "train_set.pkl": train_set,
    "test_set.pkl": test_set,
    "template_train_set.pkl": template_log_train,
    "template_test_set.pkl": template_log_test,
}

# Write each object to its own pickle file, in the order listed above.
for file_name, payload in pickle_targets.items():
    with open(os.path.join(data_dir, file_name), "wb") as fw:
        pickle.dump(payload, fw)