In [17]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import RegexpTokenizer

import os
import pickle
from tqdm import tqdm

seed = 42
np.random.seed(seed)

Preprocess Data (Splited by portion)

In [2]:
import re

def text_cleansing(text):
    regex_except_token = r'\B(?!<\w+>\B)[^\w\s]'
    regex_expect_words = r'[^\w<>]+'
    output = re.sub(regex_except_token, '', text)
    output = re.sub(regex_expect_words, ' ', output)
    return output

struct_log = pd.read_csv("./output/BGL/BGL.log_structured.csv")
template_log = pd.read_csv("./output/BGL/BGL.log_templates.csv")

struct_log["Label"] = struct_log["Label"].apply(lambda x: int(x != "-"))
struct_log.sort_values("Time", inplace=True)
struct_log[struct_log["Label"] == 1].Date.value_counts().sort_index()
split_date = struct_log[struct_log.Label == 1].Date.values[0]

trainset = struct_log[struct_log.Date < split_date]
testset = struct_log[struct_log.Date >= split_date]
eventid_train = trainset.EventId.unique()
eventid_test = testset.EventId.unique()

template_log_train = template_log[template_log["EventId"].isin(eventid_train)]
template_log_test = template_log[template_log["EventId"].isin(eventid_test)]
template_log_train["EventTemplateIdent_cleansed"] = template_log_train.EventTemplateIdent.map(text_cleansing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  template_log_train["EventTemplateIdent_cleansed"] = template_log_train.EventTemplateIdent.map(text_cleansing)


In [19]:
template_log_train_list = template_log_train["EventTemplateIdent_cleansed"].astype('str').tolist()
tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|\w+')
token_train_list = [ tokenizer.tokenize(sen) for sen in template_log_train_list ]

template_log_train["EventTemplateIdent_token"] = pd.Series(token_train_list)
map_token_train = { row[0]: row[1] \
    for row in tqdm(
        template_log_train[["EventId", "EventTemplateIdent_token"]].values,
        desc="Mapping ID & token"
        ) }
trainset["Token"] = trainset.EventId.map(lambda id: map_token_train[id])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  template_log_train["EventTemplateIdent_token"] = pd.Series(token_train_list)
Mapping ID & token: 100%|██████████| 15/15 [00:00<00:00, 86659.17it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainset["Token"] = trainset.EventId.map(lambda id: map_token_train[id])


In [20]:
template_log_train.to_pickle("./output/BGL/processed/log_template.trainset.pkl")
trainset.to_pickle("./output/BGL/processed/log_structured.trainset.pkl")
testset.to_pickle("./output/BGL/processed/log_structured.testset.pkl")
template_log_test.to_pickle("./output/BGL/processed/log_structured.testset.pkl")

Preprocess Data (Splited by timeseries)

In [None]:
from sklearn.model_selection import train_test_split

def text_cleansing(text):
    regex_except_token = r'\B(?!<\w+>\B)[^\w\s]'
    regex_expect_words = r'[^\w<>]+'
    output = re.sub(regex_except_token, '', text)
    output = re.sub(regex_expect_words, ' ', output)
    return output

struct_log = pd.read_csv("./output/BGL/BGL.log_structured.csv")
template_log = pd.read_csv("./output/BGL/BGL.log_templates.csv")

test_ratio = 0.4
struct_log["Label"] = struct_log["Label"].apply(lambda x: int(x != "-"))
struct_log.sort_values("Time", inplace=True)
struct_log[struct_log["Label"] == 1].Date.value_counts().sort_index()

trainset, testset = train_test_split(struct_log, test_size=0.4, random_state=seed, shuffle=False)
trainset = trainset[trainset["Label"] == 0]
eventid_train = trainset.EventId.unique()
eventid_test = testset.EventId.unique()

template_log_train = template_log[template_log["EventId"].isin(eventid_train)]
template_log_test = template_log[template_log["EventId"].isin(eventid_test)]
template_log_train["EventTemplateIdent_cleansed"] = template_log_train.EventTemplateIdent.map(text_cleansing)

In [None]:
template_log_train_list = template_log_train["EventTemplateIdent_cleansed"].astype('str').tolist()

tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|\w+')
token_train_list = [ tokenizer.tokenize(sen) for sen in template_log_train_list ]

template_log_train["EventTemplateIdent_token"] = pd.Series(token_train_list)
map_token_train = { row[0]: row[1] \
    for row in tqdm(
        template_log_train[["EventId", "EventTemplateIdent_token"]].values,
        desc="Mapping ID & token"
        ) }
trainset["Token"] = trainset.EventId.map(lambda id: map_token_train[id])

In [None]:
template_log_train.to_pickle("./output/BGL/processed/log_template.trainset.pkl")
trainset.to_pickle("./output/BGL/processed/log_structured.trainset.pkl")
testset.to_pickle("./output/BGL/processed/log_structured.testset.pkl")
template_log_test.to_pickle("./output/BGL/processed/log_structured.testset.pkl")

In [21]:
# Download "GoogleNews-vectors-negative300.bin" from:
# https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300

model = KeyedVectors.load_word2vec_format(
    "../models/GoogleNews-vectors-negative300.bin",
    binary = True
    )

embedder = Word2Vec(size=300, min_count=1)
embedder.build_vocab(token_train_list)
total_examples = embedder.corpus_count
embedder.build_vocab([list(model.vocab.keys())], update=True)



In [173]:
embedder.intersect_word2vec_format("../models/GoogleNews-vectors-negative300.bin", binary=True)

In [22]:
embedder.train(token_train_list, total_examples=total_examples, epochs=embedder.iter)

  embedder.train(token_train_list, total_examples=total_examples, epochs=embedder.iter)


(303, 735)

In [23]:
embedder.wv.save_word2vec_format("../models/BGL-fine-tune-embedder.txt", binary=False)