In [1]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import RegexpTokenizer

import os
import sys

sys.path.append("../../data_preprocess/")

import pickle
import re
from utils import json_pretty_dump, word2VecContinueLearning, trainWord2VecModelType2, tokenizeData, convertWord2Vec, text_cleansing, parse_datetime, parse_month
# Fix the NumPy global RNG so repeated runs are reproducible.
seed = 42
np.random.seed(seed)

# Hosts that can be processed by this notebook.
label_dir = [
    "mail.cup.com",
    "mail.insect.com",
    "mail.onion.com",
    "mail.spiral.com",
]

# Single point of selection: change this index to process another host.
# Every path below is derived from `target`, so inputs and outputs cannot
# end up pointing at different hosts.
target = label_dir[0]

# Directory that receives the pickled train/test artefacts for `target`.
data_dir = f"../processed_type2/AIT_preprocessed_type2/auth/{target}"

# Input files for `target`: the Drain-structured auth log, its templates,
# and the label file.
# NOTE(review): label_file points at the apache2 access-log labels although
# the structured log is auth.log, and the labels are attached to the log rows
# by position further down -- confirm this is the intended label file.
params = {
    "struct_file": f"../../Drain_result/AIT_auth_{target}/auth.log_structured.csv",
    "template_file": f"../../Drain_result/AIT_auth_{target}/auth.log_templates.csv",
    "label_file": f"../../AIT-LDS-v1_1/labels/{target}/apache2/{target}-access.log",
}

os.makedirs(data_dir, exist_ok=True)

In [2]:
# Read the three inputs configured in `params`.
struct_log = pd.read_csv(params["struct_file"])
template_log = pd.read_csv(params["template_file"])
labels = pd.read_csv(params["label_file"])

# Attach both label columns to the structured log (pandas aligns them on
# the row index of the two frames).
for label_column in ("time_label", "line_label"):
    struct_log[label_column] = labels[label_column]

In [3]:
# Convert every value of the "Month" column with parse_month in one pass.
# This replaces a row-by-row `.loc` write performed while iterating the same
# frame with iterrows(), which is slow and modifies the object being iterated.
# NOTE(review): parse_month comes from utils; it presumably maps a month name
# to its number -- confirm. Re-running this cell feeds already-converted
# values back into parse_month, exactly as the original loop did.
struct_log["Month"] = struct_log["Month"].map(parse_month)

In [4]:
# Order the log rows by month, then day, then time of day (in place).
chronological_keys = ["Month", "Day", "Time"]
struct_log.sort_values(by=chronological_keys, inplace=True)

In [5]:
# Select the rows whose line label AND time label both differ from "0".
# The labels are compared as strings on purpose: if read_csv parsed a label
# column as integers, comparing it with the string "0" would be True for
# every row and the first log line would always be picked.
has_line_label = struct_log["line_label"].astype(str) != "0"
has_time_label = struct_log["time_label"].astype(str) != "0"
split_date = struct_log[has_line_label & has_time_label]

# Keep only the first such row (in the current row order) as the split point.
# Raises IndexError when no row is labelled in both columns.
split_date = split_date.iloc[0]
    

Unnamed: 0,LineId,Month,Day,Time,Type,Job,Pam_unix,Content,EventId,EventTemplate,EventTemplateIdent,ParameterList,time_label,line_label
0,1,2,29,00:09:01,mail-0,CRON[32002],pam_unix(cron:session),session opened for user root by (uid=0),E1,session opened for user root by (uid=<*>),session opened for user root by (uid=<Numbers>),['0'],0,0
1,2,2,29,00:09:01,mail-0,CRON[32002],pam_unix(cron:session),session closed for user root,E2,session closed for user root,session closed for user root,[],0,0
2,3,2,29,00:17:01,mail-0,CRON[32209],pam_unix(cron:session),session opened for user root by (uid=0),E1,session opened for user root by (uid=<*>),session opened for user root by (uid=<Numbers>),['0'],0,0
3,4,2,29,00:17:01,mail-0,CRON[32209],pam_unix(cron:session),session closed for user root,E2,session closed for user root,session closed for user root,[],0,0
4,5,2,29,00:39:01,mail-0,CRON[32448],pam_unix(cron:session),session opened for user root by (uid=0),E1,session opened for user root by (uid=<*>),session opened for user root by (uid=<Numbers>),['0'],0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1219,1220,3,6,04:09:01,mail,CRON[9881],pam_unix(cron:session),session closed for user root,E2,session closed for user root,session closed for user root,[],0,0
1220,1221,3,6,04:17:01,mail,CRON[9957],pam_unix(cron:session),session opened for user root by (uid=0),E1,session opened for user root by (uid=<*>),session opened for user root by (uid=<Numbers>),['0'],0,0
1221,1222,3,6,04:17:01,mail,CRON[9957],pam_unix(cron:session),session closed for user root,E2,session closed for user root,session closed for user root,[],0,0
1222,1223,3,6,04:39:01,mail,CRON[10003],pam_unix(cron:session),session opened for user root by (uid=0),E1,session opened for user root by (uid=<*>),session opened for user root by (uid=<Numbers>),['0'],0,0


In [6]:
# Columns the train/test frames expose first, in this order.
split_columns = ["LineId", "Month", "Day", "Time", "Type", "Job", "Pam_unix", "Content", "EventId", "EventTemplate", "EventTemplateIdent", "ParameterList", "time_label", "line_label"]
train_set = pd.DataFrame(columns=split_columns)
test_set = pd.DataFrame(columns=split_columns)

# split_date is a single row (a Series), so its length is its number of
# fields; the guard is kept so an empty split_date leaves both sets empty.
if len(split_date) > 0:
    split_month = split_date["Month"]
    split_day = split_date["Day"]
    split_time = split_date["Time"]

    # A row goes to the training set when its (Month, Day, Time) is strictly
    # earlier than the split row's; every other row goes to the test set.
    # Computed as one boolean mask instead of growing both frames row by row
    # with pd.concat, which copied the accumulated frame on every iteration.
    same_month = struct_log["Month"] == split_month
    same_day = same_month & (struct_log["Day"] == split_day)
    is_before_split = (
        (struct_log["Month"] < split_month)
        | (same_month & (struct_log["Day"] < split_day))
        | (same_day & (struct_log["Time"] < split_time))
    )

    # Listed columns first, then any additional columns of struct_log.
    ordered_columns = split_columns + [
        column for column in struct_log.columns if column not in split_columns
    ]
    train_set = (
        struct_log[is_before_split]
        .reindex(columns=ordered_columns)
        .reset_index(drop=True)
    )
    test_set = (
        struct_log[~is_before_split]
        .reindex(columns=ordered_columns)
        .reset_index(drop=True)
    )
            

In [8]:
# Distinct event ids that occur on each side of the split.
eventId_train = train_set["EventId"].unique()
eventId_test = test_set["EventId"].unique()

In [9]:
# Templates whose event id occurs in the training / test rows.
# `.copy()` makes each subset an independent frame, so the column assignment
# below writes to template_log_train itself rather than to a slice of
# template_log (which triggers SettingWithCopyWarning).
template_log_train = template_log[template_log["EventId"].isin(eventId_train)].copy()
template_log_test = template_log[template_log["EventId"].isin(eventId_test)].copy()

# Store the result of text_cleansing (from utils) for every training template.
template_log_train["EventTemplateIdent_cleansed"] = template_log_train["EventTemplateIdent"].map(text_cleansing)

In [10]:
# Cleansed training templates as a plain list of strings.
cleansed_templates = template_log_train["EventTemplateIdent_cleansed"]
template_log_train_list = cleansed_templates.astype('str').tolist()

In [11]:
# Split each cleansed template into tokens: a capitalised word
# ([A-Z][a-z]+) or, failing that, a run of word characters (\w+).
tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|\w+')
token_train_list = [tokenizer.tokenize(sen) for sen in template_log_train_list]

# Attach the tokens to their template rows. The Series must carry the frame's
# own index: template_log_train is a filtered subset, so a default 0..k-1
# index would be aligned by label and put NaN or another template's tokens
# into rows whose index label is not its position.
template_log_train["EventTemplateIdent_token"] = pd.Series(
    token_train_list, index=template_log_train.index
)

# Look-up table from event id to tokens; the first template wins if an event
# id appears more than once, matching the previous `.values[0]` behaviour.
token_by_event_id = {}
for event_id, tokens in zip(template_log_train["EventId"], token_train_list):
    token_by_event_id.setdefault(event_id, tokens)

# Give every training row the tokens of its template. An event id without a
# template raises KeyError instead of being silently skipped.
train_set["Token"] = train_set["EventId"].map(
    lambda event_id: token_by_event_id[event_id]
)

In [12]:
# Train the word2vec model on the training-template tokens.
# The name passed along is AIT_auth_<Target>_word2Vec, where <Target> is the
# last path component of data_dir (one of mail.cup.com, mail.insect.com,
# mail.onion.com, mail.spiral.com). Deriving it from data_dir keeps the name
# in step with the selected host, so another host's tokens are never passed
# under the mail.cup.com name.
# NOTE(review): the second argument is presumably the name the model is saved
# under -- confirm against utils.trainWord2VecModelType2.
word2vec_target = os.path.basename(os.path.normpath(data_dir))
trainWord2VecModelType2(token_train_list, f"AIT_auth_{word2vec_target}_word2Vec")

start train word2Vec model. . . . .
finish train word2Vec model . . . . . ^^


In [13]:
# Pickle the four result frames into data_dir, one file per frame.
pickle_targets = {
    "train_set.pkl": train_set,
    "test_set.pkl": test_set,
    "template_train_set.pkl": template_log_train,
    "template_test_set.pkl": template_log_test,
}
for file_name, frame in pickle_targets.items():
    with open(os.path.join(data_dir, file_name), "wb") as fw:
        pickle.dump(frame, fw)