In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import RegexpTokenizer

import os
import pickle
from tqdm import tqdm

seed = 42
np.random.seed(seed)

Preprocess Data (Splited by time-series)

In [3]:
import re

#######
def text_cleansing(text):
    regex_except_token = r'\B(?!<\w+>\B)[^\w\s]'
    regex_expect_words = r'[^\w<>]+'
    output = re.sub(regex_except_token, '', text)
    output = re.sub(regex_expect_words, ' ', output)
    return output

csv_dir = "/content/drive/MyDrive/Colab Notebooks/Drain_result/"

struct_log = pd.read_csv( csv_dir + "BGL.log_structured.csv")
template_log = pd.read_csv( csv_dir + "BGL.log_templates.csv")

struct_log["Label"] = struct_log["Label"].apply(lambda x: int(x != "-"))
struct_log.sort_values("Time", inplace=True)
struct_log[struct_log["Label"] == 1].Date.value_counts().sort_index()
split_date = struct_log[struct_log.Label == 1].Date.values[0]

trainset = struct_log[struct_log.Date < split_date]
testset = struct_log[struct_log.Date >= split_date] 
eventid_train = trainset.EventId.unique() 
eventid_test = testset.EventId.unique() 

In [4]:
template_log_train = template_log[template_log["EventId"].isin(eventid_train)]
template_log_test = template_log[template_log["EventId"].isin(eventid_test)]

In [5]:
template_log_train["EventTemplate_cleansed"] = template_log_train.EventTemplate.map(text_cleansing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  template_log_train["EventTemplate_cleansed"] = template_log_train.EventTemplate.map(text_cleansing)


In [6]:
template_log_train_list = template_log_train["EventTemplate_cleansed"].astype('str').tolist()
tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|\w+')
token_train_list = [ tokenizer.tokenize(sen) for sen in template_log_train_list ]

template_log_train["EventTemplate_token"] = pd.Series(token_train_list)
map_token_train = { row[0]: row[1] \
    for row in tqdm(
        template_log_train[["EventId", "EventTemplate_token"]].values,
        desc="Mapping ID & token"
        ) }
trainset["Token"] = trainset.EventId.map(lambda id: map_token_train[id])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  template_log_train["EventTemplate_token"] = pd.Series(token_train_list)
Mapping ID & token: 100%|██████████| 15/15 [00:00<00:00, 80763.23it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainset["Token"] = trainset.EventId.map(lambda id: map_token_train[id])


In [7]:
output_dir = "/content/drive/MyDrive/Colab Notebooks/BGL_preprocessed_type2/"

if not os.path.exists(output_dir + "/processed_<*>/"):
  os.mkdir(output_dir + "/processed_<*>")

output_dir = output_dir + "/processed_<*>"

In [8]:
template_log_train.to_pickle(output_dir + "/log_template.trainset.pkl")
trainset.to_pickle(output_dir + "/log_structured.trainset.pkl")

In [9]:
template_log_test["EventTemplate_cleansed"] = template_log_test.EventTemplate.map(text_cleansing)
template_log_test_list = template_log_test["EventTemplate_cleansed"].astype('str').tolist()

token_test_list = [ tokenizer.tokenize(sen) for sen in template_log_test_list ]

template_log_test["EventTemplate_token"] = pd.Series(token_test_list)
map_token_test = { row[0]: row[1] \
    for row in tqdm(
        template_log_test[["EventId", "EventTemplate_token"]].values,
        desc="Mapping ID & token"
        ) }
testset["Token"] = testset.EventId.map(lambda id: map_token_test[id])

Mapping ID & token: 100%|██████████| 1126/1126 [00:00<00:00, 620113.75it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testset["Token"] = testset.EventId.map(lambda id: map_token_test[id])


In [10]:
testset.to_pickle(output_dir + "/log_structured.testset.pkl")
template_log_test.to_pickle(output_dir + "/log_template.testset.pkl")

Preprocess Data (Splited by portion)

In [None]:
# from sklearn.model_selection import train_test_split

# struct_log = pd.read_csv("./output/BGL/BGL.log_structured.csv")
# template_log = pd.read_csv("./output/BGL/BGL.log_templates.csv")

# test_ratio = 0.4
# struct_log["Label"] = struct_log["Label"].apply(lambda x: int(x != "-"))
# struct_log.sort_values("Time", inplace=True)
# struct_log[struct_log["Label"] == 1].Date.value_counts().sort_index()

# trainset, testset = train_test_split(struct_log, test_size=0.4, random_state=seed, shuffle=False)
# trainset = trainset[trainset["Label"] == 0]
# eventid_train = trainset.EventId.unique()
# eventid_test = testset.EventId.unique()

# template_log_train = template_log[template_log["EventId"].isin(eventid_train)]
# template_log_test = template_log[template_log["EventId"].isin(eventid_test)]
# template_log_train["EventTemplateIdent_cleansed"] = template_log_train.EventTemplateIdent.map(text_cleansing)

In [None]:
# template_log_train_list = template_log_train["EventTemplateIdent_cleansed"].astype('str').tolist()

# tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|\w+')
# token_train_list = [ tokenizer.tokenize(sen) for sen in template_log_train_list ]

# template_log_train["EventTemplateIdent_token"] = pd.Series(token_train_list)
# map_token_train = { row[0]: row[1] \
#     for row in tqdm(
#         template_log_train[["EventId", "EventTemplateIdent_token"]].values,
#         desc="Mapping ID & token"
#         ) }
# trainset["Token"] = trainset.EventId.map(lambda id: map_token_train[id])

In [None]:
# template_log_train.to_pickle("./output/BGL/processed/log_template.trainset.pkl")
# trainset.to_pickle("./output/BGL/processed/log_structured.trainset.pkl")
# testset.to_pickle("./output/BGL/processed/log_structured.testset.pkl")
# template_log_test.to_pickle("./output/BGL/processed/log_structured.testset.pkl")

In [None]:
import gensim.downloader as api

info = api.info()
for model_name, model_data in sorted(info['models'].items()):
    print(
        '%s (%d records): %s' % (
            model_name,
            model_data.get('num_records', -1),
            model_data['description'][:40] + '...',
        )
    )

__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-news-300 (3000000 records): Pre-trai

In [None]:
# Download "GoogleNews-vectors-negative300.bin" from:
# https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300
# or load model from API
model_path = api.load("word2vec-google-news-300", return_path=True)
if os.path.exists(model_path):
  os.system("gzip -d {}".format(model_path))
model_file = os.path.dirname(model_path) + "/word2vec-google-news-300"



In [None]:
model = KeyedVectors.load_word2vec_format(model_file, binary = True)

embedder = Word2Vec(size=300, min_count=1)
# embedder = Word2Vec(vector_size=300, min_count=1)

embedder.build_vocab(token_train_list)
total_examples = embedder.corpus_count

embedder.build_vocab([list(model.vocab.keys())], update=True)
# embedder.build_vocab([list(model.key_to_index.keys())], update=True)

In [None]:
# embedder.wv.vectors_lockf = np.ones(len(embedder.wv), dtype=np.float32)
embedder.intersect_word2vec_format(model_file, binary=True)

In [None]:
embedder.train(token_train_list, total_examples=total_examples, epochs=10)



(1149, 1230)

In [None]:
fine_tune_files = "/content/drive/MyDrive/Colab Notebooks/word2Vec/BGL-word2vec-fine-tune-embedder-no-parameter-labeling.txt"
embedder.wv.save_word2vec_format(fine_tune_files, binary=False)