In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import RegexpTokenizer
import os

# Single seed shared by the notebook: it seeds NumPy's global RNG here and is
# reused further down as the `random_state` argument of train_test_split.
seed = 42
np.random.seed(seed)

Preprocess Data (Split by portion)

In [None]:
import re

def text_cleansing(text):
    """Normalise a log template string in two regex passes.

    Pass 1 deletes single punctuation characters (anything that is neither a
    word character nor whitespace) that sit at a non-word-boundary position,
    except where a ``<word>`` placeholder starts at that position and is
    itself followed by a non-boundary.
    Pass 2 replaces every run of characters other than word characters,
    ``<`` and ``>`` with one space.

    Examples:
        "a, b"                -> "a b"
        "a <*> b"             -> "a b"
        "error: <NUM> failed" -> "error <NUM> failed"

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Pass 1: drop stray punctuation while leaving <word> placeholders intact.
    without_stray_punct = re.sub(r'\B(?!<\w+>\B)[^\w\s]', '', text)
    # Pass 2: squash whatever separators remain into single spaces.
    return re.sub(r'[^\w<>]+', ' ', without_stray_punct)

# Load the structured log lines and the template table.
struct_log = pd.read_csv("./output/BGL/BGL.log_structured.csv")
template_log = pd.read_csv("./output/BGL/BGL.log_templates.csv")

# Binarise the label column: "-" becomes 0, every other value becomes 1
# (presumably "-" marks a normal line in this dataset -- confirm).
struct_log["Label"] = struct_log["Label"].ne("-").astype("int64")
struct_log = struct_log.sort_values("Time")

# The split point is the Date of the first Label == 1 row in Time order;
# everything strictly before that Date is training data.
split_date = struct_log[struct_log.Label == 1].Date.values[0]

trainset = struct_log[struct_log.Date < split_date]
testset = struct_log[struct_log.Date >= split_date]
eventid_train = trainset.EventId.unique()
eventid_test = testset.EventId.unique()

# .copy() makes the training subset an independent frame, so adding the
# cleansed column below does not write into a slice of `template_log`
# (which would raise SettingWithCopyWarning).
template_log_train = template_log[template_log["EventId"].isin(eventid_train)].copy()
template_log_test = template_log[template_log["EventId"].isin(eventid_test)]
template_log_train["EventTemplateIdent_cleansed"] = template_log_train.EventTemplateIdent.map(text_cleansing)

# Tokenise each cleansed template into its runs of word characters.
template_log_train_list = template_log_train["EventTemplateIdent_cleansed"].astype('str').tolist()
tokenizer = RegexpTokenizer(r'\w+')
token_train_list = [ tokenizer.tokenize(sen) for sen in template_log_train_list ]

Preprocess Data (Split by time series)

In [None]:
from sklearn.model_selection import train_test_split

def text_cleansing(text):
    """Normalise a log template string in two regex passes.

    Pass 1 deletes single punctuation characters (anything that is neither a
    word character nor whitespace) that sit at a non-word-boundary position,
    except where a ``<word>`` placeholder starts at that position and is
    itself followed by a non-boundary.
    Pass 2 replaces every run of characters other than word characters,
    ``<`` and ``>`` with one space.

    Examples:
        "a, b"                -> "a b"
        "a <*> b"             -> "a b"
        "error: <NUM> failed" -> "error <NUM> failed"

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Imported locally so this cell runs on its own: the notebook's only
    # other `import re` lives in the alternative preprocessing cell.
    import re

    regex_except_token = r'\B(?!<\w+>\B)[^\w\s]'
    regex_expect_words = r'[^\w<>]+'
    output = re.sub(regex_except_token, '', text)
    output = re.sub(regex_expect_words, ' ', output)
    return output

# Load the structured log lines and the template table.
struct_log = pd.read_csv("./output/BGL/BGL.log_structured.csv")
template_log = pd.read_csv("./output/BGL/BGL.log_templates.csv")

test_ratio = 0.4
# Binarise the label column: "-" becomes 0, every other value becomes 1
# (presumably "-" marks a normal line in this dataset -- confirm).
struct_log["Label"] = struct_log["Label"].ne("-").astype("int64")
struct_log = struct_log.sort_values("Time")

# shuffle=False keeps the Time ordering, so the last `test_ratio` share of
# the rows becomes the test set.
trainset, testset = train_test_split(struct_log, test_size=test_ratio, random_state=seed, shuffle=False)
# Only Label == 0 rows are kept for training.
trainset = trainset[trainset["Label"] == 0]
eventid_train = trainset.EventId.unique()
eventid_test = testset.EventId.unique()

# .copy() makes the training subset an independent frame, so adding the
# cleansed column below does not write into a slice of `template_log`
# (which would raise SettingWithCopyWarning).
template_log_train = template_log[template_log["EventId"].isin(eventid_train)].copy()
template_log_test = template_log[template_log["EventId"].isin(eventid_test)]
template_log_train["EventTemplateIdent_cleansed"] = template_log_train.EventTemplateIdent.map(text_cleansing)

# Tokenise each cleansed template into its runs of word characters.
template_log_train_list = template_log_train["EventTemplateIdent_cleansed"].astype('str').tolist()
tokenizer = RegexpTokenizer(r'\w+')
token_train_list = [ tokenizer.tokenize(sen) for sen in template_log_train_list ]

In [3]:
# Load the pretrained GoogleNews vectors from the binary word2vec file.
model = KeyedVectors.load_word2vec_format(
    "../models/GoogleNews-vectors-negative300.bin",
    binary = True
    )

# New 300-dimensional Word2Vec model; its vocabulary is first built from the
# tokenised training templates.
# NOTE(review): `size=` here and `.vocab` / `.iter` elsewhere look like the
# pre-4.0 gensim API -- confirm the pinned gensim version.
embedder = Word2Vec(size=300, min_count=1)
embedder.build_vocab(token_train_list)
# Read corpus_count before the vocabulary update below so the value reflects
# the training templates only (presumably the update would change it -- verify).
total_examples = embedder.corpus_count
# Extend the vocabulary with every key of the pretrained model, passed in as
# one single "sentence".
embedder.build_vocab([list(model.vocab.keys())], update=True)

In [None]:
# Merge vectors from the GoogleNews binary file into the embedder.
# NOTE(review): `lockf` is left at its default; per the gensim docs the
# default of 0.0 prevents imported vectors from being updated by later
# training -- confirm this is intended for a fine-tuning run (lockf=1.0
# allows updates).
embedder.intersect_word2vec_format("../models/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# Train on the tokenised training templates, using the sentence count captured
# right after the first build_vocab call and the model's own `iter` value as
# the number of epochs.
embedder.train(token_train_list, total_examples=total_examples, epochs=embedder.iter)

In [None]:
# Export the vectors in plain-text word2vec format. The export goes through
# the model's KeyedVectors (`.wv`) because the model-level
# `save_word2vec_format` is deprecated in gensim in favour of this call.
embedder.wv.save_word2vec_format("../models/BGL-fine-tune-embedder.txt", binary=False)