In [1]:
%%python --version

Python 3.10.11


In [2]:
# Mount Google Drive into the Colab VM; every input/output path used by this
# notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%env PYTHONHASHSEED=42

env: PYTHONHASHSEED=42


In [4]:
import gensim.downloader as api

# List every model in the gensim-data catalogue: name, record count
# (-1 when the entry has no 'num_records' field) and the first 40 characters
# of its description.
info = api.info()
for model_name in sorted(info['models']):
    model_data = info['models'][model_name]
    record_count = model_data.get('num_records', -1)
    short_description = model_data['description'][:40] + '...'
    print('%s (%d records): %s' % (model_name, record_count, short_description))

__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-news-300 (3000000 records): Pre-trai

In [5]:
# ---- Inputs: parsed-log pickles -------------------------------------------
input_dir = "/content/drive/MyDrive/Colab Notebooks/Parsed_Log/"
dataset_prefix = "BGL."

# Fold number; None drops the "cv<N>_" infix from every file name.
cv = 1
cv_prefix = "" if cv is None else f"cv{cv}_"

# Every input file name shares the same "<dir><dataset><fold>" stem.
_input_stem = f"{input_dir}{dataset_prefix}{cv_prefix}"

pkl_train_structured_path = f"{_input_stem}train.log_ident_structured.pkl"
pkl_train_template_path = f"{_input_stem}train.log_ident_templates.pkl"
pkl_train_2ndtemplate_path = f"{_input_stem}train.2nd_log_ident_templates.pkl"

pkl_test_structured_path = f"{_input_stem}test.log_ident_structured.pkl"
pkl_test_template_path = f"{_input_stem}test.log_ident_templates.pkl"
pkl_test_2ndtemplate_path = f"{_input_stem}test.2nd_log_ident_templates.pkl"

# Sentinel tokens wrapped around every tokenised line.
start_line_token = "<startline>"
end_line_token = "<endline>"

# Switches that select which template column/file is used further down.
is_labeling = True
use_2ndstep = False

# ---- Outputs: preprocessed pickles ----------------------------------------
output_dir = "/content/drive/MyDrive/Colab Notebooks/preprocessed/"

# The variant tag encodes the two switches above in the output file names.
if not is_labeling:
    _variant_tag = "nolabeling."
elif use_2ndstep:
    _variant_tag = "2ndstep."
else:
    _variant_tag = ""
output_prefix = "withlinetoken.supervised.40-60." + _variant_tag

_output_stem = f"{output_dir}{dataset_prefix}{output_prefix}{cv_prefix}"
output_template_train_pkl = f"{_output_stem}template.trainset.pkl"
output_structured_train_pkl = f"{_output_stem}structured.trainset.pkl"
output_template_test_pkl = f"{_output_stem}template.testset.pkl"
output_structured_test_pkl = f"{_output_stem}structured.testset.pkl"

# ---- Embedding model --------------------------------------------------------
model_name = "word2vec-google-news-300"
embedder_dir = "/content/drive/MyDrive/Colab Notebooks/word2Vec/"
# Text-format file the fine-tuned vectors are written to.
fine_tune_files = f"{embedder_dir}{dataset_prefix}{output_prefix}{cv_prefix}{model_name}.txt"

In [6]:
import pandas as pd
import numpy as np

from gensim.models import KeyedVectors
if model_name.startswith("word2vec"):
    from gensim.models import Word2Vec
elif model_name.startswith("fasttext"):
    from gensim.models import FastText

from nltk.tokenize import RegexpTokenizer

import gc
import os
import pickle
from tqdm import tqdm
tqdm.pandas()

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
seed = 42
np.random.seed(seed)

In [7]:
# Fetch the pretrained vectors and make sure an uncompressed copy is on disk.
#
# Download "GoogleNews-vectors-negative300.bin" from:
# https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300
# or load model from API
import subprocess

# With return_path=True the downloader hands back the path of the archive
# instead of loading the model into memory.
model_path = api.load(model_name, return_path=True)
# `gzip -d` writes the decompressed file next to the archive, named after the model.
model_file = os.path.join(os.path.dirname(model_path), model_name)

# The archive is gone once it has been decompressed, so this step is skipped
# on re-runs.
if os.path.exists(model_path):
    # Argument list instead of a shell string: safe for paths with spaces or
    # shell metacharacters. check=True surfaces a failed decompression instead
    # of silently continuing. -f overwrites a stale decompressed copy rather
    # than refusing.
    subprocess.run(["gzip", "-d", "-f", model_path], check=True)

if not os.path.exists(model_file):
    raise FileNotFoundError(
        "Decompressed model file not found: {}".format(model_file)
    )

In [8]:
import re

#######
def text_cleansing(text):
    """Normalise a log template string before tokenisation.

    Two regex passes are applied in order:

    1. Delete every character that is neither a word character nor whitespace
       and sits at a non-word-boundary position, unless a ``<word>`` token
       (itself followed by a non-word-boundary) starts at that position.
    2. Replace every remaining run of characters other than word characters,
       ``<`` and ``>`` with a single space.

    Args:
        text: The raw template string.

    Returns:
        The cleansed string.
    """
    stray_punctuation = re.compile(r'\B(?!<\w+>\B)[^\w\s]')
    separator_run = re.compile(r'[^\w<>]+')

    without_stray = stray_punctuation.sub('', text)
    return separator_run.sub(' ', without_stray)

In [9]:
# Load the structured train/test log frames produced by the parsing step.
# NOTE: read_pickle unpickles the file, so only point it at files you trust.
trainset = pd.read_pickle(pkl_train_structured_path)
testset = pd.read_pickle(pkl_test_structured_path)

In [10]:
# Distinct event ids seen in each split.
eventid_train = trainset.EventId.unique()
eventid_test = testset.EventId.unique()

# Choose the template pickles and the template column to read from:
#   labeling + 2nd step -> 2nd-step template files, "LabeledTemplate"
#   labeling only       -> regular template files,  "EventTemplateIdent"
#   no labeling         -> regular template files,  "EventTemplate"
if is_labeling and use_2ndstep:
    _train_template_src = pkl_train_2ndtemplate_path
    _test_template_src = pkl_test_2ndtemplate_path
    template_log_col_name = "LabeledTemplate"
else:
    _train_template_src = pkl_train_template_path
    _test_template_src = pkl_test_template_path
    template_log_col_name = "EventTemplateIdent" if is_labeling else "EventTemplate"

template_log_train = pd.read_pickle(_train_template_src)
template_log_test = pd.read_pickle(_test_template_src)

In [11]:
# Cleanse every training template, then tokenise it wrapped in the
# start/end line sentinels.
trainset["Template_cleansed"] = trainset[template_log_col_name].progress_apply(text_cleansing)

# A token is either a capitalised word (one upper-case letter followed by
# lower-case letters) or any run of word characters / angle brackets.
tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|[\w<>]+')
trainset["Token"] = trainset["Template_cleansed"].progress_apply(
    lambda sen: tokenizer.tokenize(start_line_token + " " + sen + " " + end_line_token)
    )

# Binarise the label: '-' -> 0, anything else -> 1.
# Guarded so the cell is idempotent: once the column is numeric, re-running the
# mapping would turn every row into 1, because 0 != '-'.
if not pd.api.types.is_integer_dtype(trainset["Label"]):
    trainset["Label"] = trainset["Label"].ne("-").astype("int64")
trainset["Label"].value_counts()

100%|██████████| 3142329/3142329 [00:19<00:00, 161567.34it/s]
100%|██████████| 3142329/3142329 [00:22<00:00, 139362.23it/s]


0    2888232
1     254097
Name: Label, dtype: int64

In [12]:
# Persist the training templates and the preprocessed training frame.
# to_pickle does not create missing parent folders, so make sure the output
# directory exists before writing.
os.makedirs(output_dir, exist_ok=True)
template_log_train.to_pickle(output_template_train_pkl)
trainset.to_pickle(output_structured_train_pkl)
print("save pkl: structured, template = ({}, {})".format(len(trainset), len(template_log_train)))

save pkl: structured, template = (3142329, 8641)


In [13]:
# Apply the same cleansing and tokenisation to the test split, reusing the
# `tokenizer` built for the training split.
testset["Template_cleansed"] = testset[template_log_col_name].progress_apply(text_cleansing)
testset["Token"] = testset["Template_cleansed"].progress_apply(
    lambda sen: tokenizer.tokenize(start_line_token + " " + sen + " " + end_line_token)
    )

# Binarise the label: '-' -> 0, anything else -> 1.
# Guarded so the cell is idempotent: once the column is numeric, re-running the
# mapping would turn every row into 1, because 0 != '-'.
if not pd.api.types.is_integer_dtype(testset["Label"]):
    testset["Label"] = testset["Label"].ne("-").astype("int64")
testset["Label"].value_counts()

100%|██████████| 1571164/1571164 [00:10<00:00, 152482.07it/s]
100%|██████████| 1571164/1571164 [00:09<00:00, 162606.44it/s]


0    1543808
1      27356
Name: Label, dtype: int64

In [14]:
# Persist the preprocessed test frame and the test templates.
# to_pickle does not create missing parent folders, so make sure the output
# directory exists before writing.
os.makedirs(output_dir, exist_ok=True)
testset.to_pickle(output_structured_test_pkl)
template_log_test.to_pickle(output_template_test_pkl)

print("save pkl: structured, template = ({}, {})".format(len(testset), len(template_log_test)))

save pkl: structured, template = (1571164, 5657)


In [15]:
# Keep only the tokenised training lines; this is the corpus the embedder's
# vocabulary is built from and that it is trained on.
tokens = trainset["Token"].values

In [16]:
# Drop the frame/template names that are no longer needed (everything has been
# pickled and `tokens` is kept separately) and trigger a garbage collection.
del trainset, template_log_train, testset, template_log_test
gc.collect()

0

In [17]:
# Load the decompressed pretrained vectors (binary word2vec format); their
# vocabulary is used below to extend the embedder's vocabulary.
model = KeyedVectors.load_word2vec_format(model_file, binary = True)

In [None]:
# Create the trainable embedder matching the pretrained model family.
# workers=1 together with a fixed seed keeps training single-threaded and
# seeded. The vector size is taken from the loaded pretrained vectors so both
# stay aligned.
if model_name.startswith("word2vec"):
    embedder = Word2Vec(vector_size=model.vector_size, min_count=1, workers=1, seed=seed)
elif model_name.startswith("fasttext"):
    # gensim 4 renamed `size` to `vector_size`; this notebook already uses the
    # gensim-4 API elsewhere (`key_to_index`, Word2Vec(vector_size=...)).
    embedder = FastText(vector_size=model.vector_size, min_count=1, workers=1, seed=seed)
else:
    # Without this branch `embedder` would stay undefined and the cell would
    # fail later with an unrelated NameError.
    raise ValueError(
        "Unsupported model_name {!r}: expected a 'word2vec*' or 'fasttext*' model".format(model_name)
    )

# First pass: vocabulary from the tokenised training logs. corpus_count is
# captured here, before the vocabulary is extended, so that `total_examples`
# reflects the log corpus only.
embedder.build_vocab(tokens)
total_examples = embedder.corpus_count

# Second pass: add every word of the pretrained model as one extra "sentence"
# so those words exist in the embedder's vocabulary.
embedder.build_vocab([list(model.key_to_index.keys())], update=True)

In [None]:
# Give every vocabulary entry a lock factor of 1.0, then intersect the
# pretrained vectors from `model_file` into the embedder.
# NOTE(review): in gensim, intersect_word2vec_format's `lockf` argument appears
# to default to 0.0, which would freeze the imported vectors during the
# training step — confirm that this is intended for fine-tuning.
embedder.wv.vectors_lockf = np.ones(len(embedder.wv), dtype=np.float32)
embedder.wv.intersect_word2vec_format(model_file, binary=True)

In [None]:
# Train on the tokenised training logs for 10 epochs; `total_examples` is the
# corpus count recorded right after the first build_vocab call.
embedder.train(tokens, total_examples=total_examples, epochs=10)

(207946073, 310376100)

In [None]:
# Write the fine-tuned vectors in text word2vec format.
# Create the destination folder first: a missing directory would otherwise
# raise here and discard the result of the training run above.
os.makedirs(os.path.dirname(fine_tune_files), exist_ok=True)
embedder.wv.save_word2vec_format(fine_tune_files, binary=False)

In [None]:
# Display the path the fine-tuned vectors were written to.
fine_tune_files