In [1]:
%%python --version

Python 3.10.11


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%env PYTHONHASHSEED=42

env: PYTHONHASHSEED=42


In [4]:
import gensim.downloader as api

info = api.info()
for model_name, model_data in sorted(info['models'].items()):
    print(
        '%s (%d records): %s' % (
            model_name,
            model_data.get('num_records', -1),
            model_data['description'][:40] + '...',
        )
    )

__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-news-300 (3000000 records): Pre-trai

In [5]:
test_set_prefixes = ["spiral.com", "onion.com", "insect.com", "cup.com"]

In [6]:
import os

# input
input_dir = "/content/drive/MyDrive/Colab Notebooks/Parsed_Log/AIT"
dataset_prefix = "AIT."

ti = 3
test_prefix = test_set_prefixes[ti]

pkl_train_structured_path = list()
pkl_train_template_path = list()

pkl_test_structured_path = list()
pkl_test_template_path = list()

for (root,dirs,files) in os.walk(f"{input_dir}", topdown=True):
  for f in files:
    if f.startswith(f"{dataset_prefix}{test_prefix}"):
      if f.endswith("log_ident_structured.pkl"):
        pkl_test_structured_path.append(f"{root}/{f}")
      elif f.endswith("log_ident_templates.pkl"):
        pkl_test_template_path.append(f"{root}/{f}")
        
    else:
      if f.endswith("log_ident_structured.pkl"):
        pkl_train_structured_path.append(f"{root}/{f}")
      elif f.endswith("log_ident_templates.pkl"):
        pkl_train_template_path.append(f"{root}/{f}")

In [7]:
start_line_token = "<startline>"
end_line_token = "<endline>"

is_labeling = False

# output
output_dir = "/content/drive/MyDrive/Colab Notebooks/preprocessed/AIT/"
output_prefix = "supervised." + ( "nolabeling." if not is_labeling else "" )

output_structured_train_pkl = output_dir + dataset_prefix + output_prefix + test_prefix + "-as-testset." + "structured.trainset.pkl"
output_template_train_pkl = output_dir + dataset_prefix + output_prefix + test_prefix + "-as-testset." + "template.trainset.pkl"

output_structured_test_pkl = output_dir + dataset_prefix + output_prefix + test_prefix + "-as-testset." + "structured.testset.pkl"
output_template_test_pkl = output_dir + dataset_prefix + output_prefix + test_prefix + "-as-testset." + "template.testset.pkl"

model_name = "word2vec-google-news-300"
embedder_dir = "/content/drive/MyDrive/Colab Notebooks/word2Vec/"
fine_tune_files = embedder_dir + dataset_prefix + output_prefix + test_prefix + "-as-testset." + model_name + ".txt"

In [8]:
import pandas as pd
import numpy as np

from gensim.models import KeyedVectors
if model_name.startswith("word2vec"):
    from gensim.models import Word2Vec
elif model_name.startswith("fasttext"):
    from gensim.models import FastText

from nltk.tokenize import RegexpTokenizer

import gc
import os
import pickle
from tqdm import tqdm
tqdm.pandas()

seed = 42
np.random.seed(seed)

In [9]:
# Download "GoogleNews-vectors-negative300.bin" from:
# https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300
# or load model from API
model_path = api.load(model_name, return_path=True)
if os.path.exists(model_path):
  os.system("gzip -d {}".format(model_path))
model_file = os.path.dirname(model_path) + "/{}".format(model_name)

In [10]:
import re

#######
def text_cleansing(text):
    regex_except_token = r'\B(?!<\w+>\B)[^\w\s]'
    regex_expect_words = r'[^\w<>]+'
    output = re.sub(regex_except_token, '', text)
    output = re.sub(regex_expect_words, ' ', output)
    return output

In [11]:
trainset = dict()
for path in pkl_train_structured_path:
  fname = path.split("/")[-1]
  dsname = fname.split("-")[0][4:]
  ltname = fname.split("-")[-1][:-25]

  if dsname not in trainset:
    trainset[dsname] = dict()

  print(f"trainset: ({dsname}, {ltname}): {fname}")
  trainset[dsname][ltname] = pd.read_pickle(path)

trainset: (spiral.com, auth): AIT.spiral.com-auth.log_ident_structured.pkl
trainset: (spiral.com, daemon): AIT.spiral.com-daemon.log_ident_structured.pkl
trainset: (onion.com, daemon): AIT.onion.com-daemon.log_ident_structured.pkl
trainset: (spiral.com, user): AIT.spiral.com-user.log_ident_structured.pkl
trainset: (spiral.com, mail): AIT.spiral.com-mail.log_ident_structured.pkl
trainset: (spiral.com, access): AIT.spiral.com-mail.spiral.com-access.log_ident_structured.pkl
trainset: (onion.com, auth): AIT.onion.com-auth.log_ident_structured.pkl
trainset: (onion.com, user): AIT.onion.com-user.log_ident_structured.pkl
trainset: (onion.com, access): AIT.onion.com-mail.onion.com-access.log_ident_structured.pkl
trainset: (onion.com, mail): AIT.onion.com-mail.log_ident_structured.pkl
trainset: (insect.com, auth): AIT.insect.com-auth.log_ident_structured.pkl
trainset: (insect.com, daemon): AIT.insect.com-daemon.log_ident_structured.pkl
trainset: (insect.com, mail): AIT.insect.com-mail.log_ident

In [12]:
testset = dict()
for path in pkl_test_structured_path:
  fname = path.split("/")[-1]
  ltname = fname.split("-")[-1][:-24]

  print(f"testset: ({ltname}): {fname}")
  testset[ltname] = pd.read_pickle(path)

testset: (daemon.): AIT.cup.com-daemon.log_ident_structured.pkl
testset: (auth.): AIT.cup.com-auth.log_ident_structured.pkl
testset: (user.): AIT.cup.com-user.log_ident_structured.pkl
testset: (mail.): AIT.cup.com-mail.log_ident_structured.pkl
testset: (access.): AIT.cup.com-mail.cup.com-access.log_ident_structured.pkl


In [13]:
# eventid_train = trainset.EventId.unique() 
# eventid_test = testset.EventId.unique() 

template_log_col_name = "EventTemplateIdent" if is_labeling else "EventTemplate"

template_log_train = dict()
for path in pkl_train_template_path:
  fname = path.split("/")[-1]
  dsname = fname.split("-")[0][4:]
  ltname = fname.split("-")[-1][:-24]

  if dsname not in template_log_train:
    template_log_train[dsname] = dict()

  print(f"template trainset: ({dsname}, {ltname}): {fname}")
  template_log_train[dsname][ltname] = pd.read_pickle(path)

template_log_test = dict()
for path in pkl_test_template_path:
  fname = path.split("/")[-1]
  ltname = fname.split("-")[-1][:-24]

  print(f"template testset: ({ltname}): {fname}")
  template_log_test[ltname] = pd.read_pickle(path)

template trainset: (spiral.com, auth): AIT.spiral.com-auth.log_ident_templates.pkl
template trainset: (spiral.com, daemon): AIT.spiral.com-daemon.log_ident_templates.pkl
template trainset: (spiral.com, access): AIT.spiral.com-mail.spiral.com-access.log_ident_templates.pkl
template trainset: (spiral.com, user): AIT.spiral.com-user.log_ident_templates.pkl
template trainset: (spiral.com, mail): AIT.spiral.com-mail.log_ident_templates.pkl
template trainset: (onion.com, daemon): AIT.onion.com-daemon.log_ident_templates.pkl
template trainset: (onion.com, auth): AIT.onion.com-auth.log_ident_templates.pkl
template trainset: (onion.com, access): AIT.onion.com-mail.onion.com-access.log_ident_templates.pkl
template trainset: (onion.com, user): AIT.onion.com-user.log_ident_templates.pkl
template trainset: (onion.com, mail): AIT.onion.com-mail.log_ident_templates.pkl
template trainset: (insect.com, auth): AIT.insect.com-auth.log_ident_templates.pkl
template trainset: (insect.com, mail): AIT.insect.

In [14]:
tokenizer = RegexpTokenizer(r'[A-Z][a-z]+|[\w<>]+')

for w in trainset:
  for k in trainset[w]:
    print(f"{w}, {k}:")
    trainset[w][k]["Template_cleansed"] = trainset[w][k][template_log_col_name].progress_apply(text_cleansing)
    trainset[w][k]["Token"] = trainset[w][k]["Template_cleansed"].progress_apply(
        lambda sen: tokenizer.tokenize(start_line_token + " " + sen + " " + end_line_token)
        )
    trainset[w][k]["Label"] = trainset[w][k]["time_label"] | trainset[w][k]["line_label"]

spiral.com, auth:


100%|██████████| 1202/1202 [00:00<00:00, 39903.39it/s]
100%|██████████| 1202/1202 [00:00<00:00, 80825.21it/s]


spiral.com, daemon:


100%|██████████| 905/905 [00:00<00:00, 79904.12it/s]
100%|██████████| 905/905 [00:00<00:00, 102025.13it/s]


spiral.com, user:


100%|██████████| 23270/23270 [00:00<00:00, 31170.47it/s]
100%|██████████| 23270/23270 [00:00<00:00, 45163.55it/s]


spiral.com, mail:


100%|██████████| 59766/59766 [00:01<00:00, 54329.85it/s]
100%|██████████| 59766/59766 [00:00<00:00, 85058.74it/s]


spiral.com, access:


100%|██████████| 100445/100445 [00:03<00:00, 31301.06it/s]
100%|██████████| 100445/100445 [00:01<00:00, 54875.67it/s]


onion.com, daemon:


100%|██████████| 2133/2133 [00:00<00:00, 60629.65it/s]
100%|██████████| 2133/2133 [00:00<00:00, 143715.77it/s]


onion.com, auth:


100%|██████████| 1050/1050 [00:00<00:00, 110526.01it/s]
100%|██████████| 1050/1050 [00:00<00:00, 118470.41it/s]


onion.com, user:


100%|██████████| 16112/16112 [00:00<00:00, 27450.70it/s]
100%|██████████| 16112/16112 [00:00<00:00, 42375.74it/s]


onion.com, access:


100%|██████████| 81963/81963 [00:02<00:00, 28440.95it/s]
100%|██████████| 81963/81963 [00:00<00:00, 82899.91it/s]


onion.com, mail:


100%|██████████| 67873/67873 [00:01<00:00, 53927.80it/s]
100%|██████████| 67873/67873 [00:00<00:00, 91177.31it/s]


insect.com, auth:


100%|██████████| 1134/1134 [00:00<00:00, 54335.20it/s]
100%|██████████| 1134/1134 [00:00<00:00, 86093.85it/s]


insect.com, daemon:


100%|██████████| 2153/2153 [00:00<00:00, 40149.46it/s]
100%|██████████| 2153/2153 [00:00<00:00, 83649.08it/s]


insect.com, mail:


100%|██████████| 110016/110016 [00:02<00:00, 54074.38it/s]
100%|██████████| 110016/110016 [00:02<00:00, 52150.88it/s]


insect.com, access:


100%|██████████| 169340/169340 [00:06<00:00, 24681.00it/s]
100%|██████████| 169340/169340 [00:01<00:00, 108343.64it/s]


insect.com, user:


100%|██████████| 37868/37868 [00:01<00:00, 27221.07it/s]
100%|██████████| 37868/37868 [00:00<00:00, 56330.91it/s]


In [15]:
with open(output_structured_train_pkl, 'wb') as trainset_pkl:
    pickle.dump(trainset, trainset_pkl, protocol=pickle.HIGHEST_PROTOCOL)
print(f"save: {output_structured_train_pkl}")

with open(output_template_train_pkl, 'wb') as template_train_pkl:
    pickle.dump(template_log_train, template_train_pkl, protocol=pickle.HIGHEST_PROTOCOL)
print(f"save: {output_template_train_pkl}")

save: /content/drive/MyDrive/Colab Notebooks/preprocessed/AIT/AIT.supervised.nolabeling.cup.com-as-testset.structured.trainset.pkl
save: /content/drive/MyDrive/Colab Notebooks/preprocessed/AIT/AIT.supervised.nolabeling.cup.com-as-testset.template.trainset.pkl


In [16]:
for k in testset:
  testset[k]["Template_cleansed"] = testset[k][template_log_col_name].progress_apply(text_cleansing)
  testset[k]["Token"] = testset[k]["Template_cleansed"].progress_apply(
      lambda sen: tokenizer.tokenize(start_line_token + " " + sen + " " + end_line_token)
      )
  testset[k]["Label"] = testset[k]["time_label"] | testset[k]["line_label"]

100%|██████████| 930/930 [00:00<00:00, 91244.51it/s]
100%|██████████| 930/930 [00:00<00:00, 103466.92it/s]
100%|██████████| 1224/1224 [00:00<00:00, 75131.76it/s]
100%|██████████| 1224/1224 [00:00<00:00, 153864.06it/s]
100%|██████████| 34329/34329 [00:01<00:00, 28590.45it/s]
100%|██████████| 34329/34329 [00:00<00:00, 53742.46it/s]
100%|██████████| 122813/122813 [00:02<00:00, 58395.81it/s]
100%|██████████| 122813/122813 [00:01<00:00, 99279.02it/s] 
100%|██████████| 148534/148534 [00:05<00:00, 26312.67it/s]
100%|██████████| 148534/148534 [00:03<00:00, 47950.56it/s]


In [17]:
with open(output_structured_test_pkl, 'wb') as testset_pkl:
    pickle.dump(testset, testset_pkl, protocol=pickle.HIGHEST_PROTOCOL)
print(f"save: {output_structured_test_pkl}")

with open(output_template_test_pkl, 'wb') as template_test_pkl:
    pickle.dump(template_log_test, template_test_pkl, protocol=pickle.HIGHEST_PROTOCOL)
print(f"save: {output_template_test_pkl}")

save: /content/drive/MyDrive/Colab Notebooks/preprocessed/AIT/AIT.supervised.nolabeling.cup.com-as-testset.structured.testset.pkl
save: /content/drive/MyDrive/Colab Notebooks/preprocessed/AIT/AIT.supervised.nolabeling.cup.com-as-testset.template.testset.pkl


In [18]:
tokens = None
for w in trainset:
  for k in trainset[w]:
    if tokens is None: tokens = trainset[w][k]["Token"].values
    else: tokens = np.append(tokens, trainset[w][k]["Token"].values)

In [19]:
tokens.shape

(675230,)

In [20]:
del trainset, template_log_train, testset, template_log_test
gc.collect()

0

In [21]:
model = KeyedVectors.load_word2vec_format(model_file, binary = True)

In [22]:
if model_name.startswith("word2vec"):
    embedder = Word2Vec(vector_size=300, min_count=1, workers=1, seed=42)
elif model_name.startswith("fasttext"):
    embedder = FastText(size=300, min_count=1, workers=1, seed=42)
# embedder = Word2Vec(vector_size=300, min_count=1)

embedder.build_vocab(tokens)
total_examples = embedder.corpus_count

# embedder.build_vocab([list(model.vocab.keys())], update=True)
embedder.build_vocab([list(model.key_to_index.keys())], update=True)

In [23]:
embedder.wv.vectors_lockf = np.ones(len(embedder.wv), dtype=np.float32)
embedder.wv.intersect_word2vec_format(model_file, binary=True)

In [24]:
embedder.train(tokens, total_examples=total_examples, epochs=10)

(85928510, 104328280)

In [25]:
embedder.wv.save_word2vec_format(fine_tune_files, binary=False)

In [26]:
fine_tune_files

'/content/drive/MyDrive/Colab Notebooks/word2Vec/AIT.supervised.nolabeling.cup.com-as-testset.word2vec-google-news-300.txt'