## Import Libraries

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from timeit import default_timer as timer

In [2]:
# Per-dataset locations. For each dataset:
#   *_data_dir     - directory that holds the input CSV files
#   *_file         - CSV passed to generate_tokenizer with content=False
#   *_content_file - CSV passed to generate_tokenizer with content=True
#   *_output_dir   - directory the trained tokenizer is saved to

# HDFS
hdfs_data_dir = "datasets/hdfs/output/"
hdfs_output_dir = "tokenizer/hdfs"
hdfs_file = "hdfs_labeled.csv"
hdfs_content_file = "hdfs_content_labeled.csv"

# BGL
bgl_data_dir = "datasets/bgl/output/"
bgl_output_dir = "tokenizer/bgl"
bgl_file = "bgl_time_windowed.csv"
bgl_content_file = "bgl_time_windowed_content.csv"

# Thunderbird
tbird_data_dir = "datasets/tbird/output/"
tbird_output_dir = "tokenizer/tbird"
tbird_file = "tbird_time_windowed_5M.csv"
tbird_content_file = "tbird_time_windowed_5M_content.csv"

## Generate Tokenizer

In [3]:
def generate_tokenizer(log_dir, log_file, output_dir, token, mode=None, content=False, vocab_size = 50265):
    """Train a new tokenizer on one text column of a log CSV and save it.

    The CSV found at ``log_dir``/``log_file`` is loaded, one column is picked
    according to ``mode`` and ``content``, and that column is fed in batches
    of 1000 rows to ``token.train_new_from_iterator``. The trained tokenizer
    is written with ``save_pretrained(output_dir)``; the output location and
    the elapsed wall-clock time are printed.

    Column selection:
        mode == 'hdfs', content=False -> "EventSequence"
        mode == 'hdfs', content=True  -> "ContentSequence"
        any other mode, content=False -> "EventId"
        any other mode, content=True  -> "EventTemplate"

    Args:
        log_dir: Directory containing the CSV (trailing separator optional).
        log_file: File name of the CSV inside ``log_dir``.
        output_dir: Directory passed to ``save_pretrained``.
        token: Object exposing ``train_new_from_iterator``; used as the
            starting point for the new tokenizer.
        mode: ``'hdfs'`` selects the HDFS column names; anything else selects
            the ``EventId`` / ``EventTemplate`` names.
        content: Pick the content/template column instead of the ID column.
        vocab_size: Vocabulary size requested from the trainer.

    Raises:
        KeyError: If the selected column is not present in the CSV.
    """
    import os  # local import so this cell does not depend on a changed import cell

    start = timer()

    # Decide the source column once instead of repeating select+rename per branch.
    if mode == 'hdfs':
        source_column = "ContentSequence" if content else "EventSequence"
    else:
        source_column = "EventTemplate" if content else "EventId"

    # os.path.join works whether or not log_dir ends with a path separator.
    df = pd.read_csv(os.path.join(log_dir, log_file))
    if source_column not in df.columns:
        raise KeyError(
            "column '{}' not found in {}; available columns: {}".format(
                source_column, log_file, list(df.columns)
            )
        )

    # Build a fresh single-column frame (no inplace rename on a selection).
    # Rows with a missing value carry no text to train on, so they are dropped;
    # the index is reset so the frame keeps a plain 0..n-1 index.
    text_df = (
        df[[source_column]]
        .rename(columns={source_column: 'text'})
        .dropna(subset=['text'])
        .reset_index(drop=True)
    )

    dataset = Dataset.from_pandas(text_df)
    batch_size = 1000

    def batch_iterator():
        # Lazily yield lists of strings so the whole corpus is never
        # materialised as one Python list.
        for i in range(0, len(dataset), batch_size):
            yield dataset[i : i + batch_size]["text"]

    tokenizer = token.train_new_from_iterator(batch_iterator(), vocab_size=vocab_size)
    tokenizer.save_pretrained(output_dir)
    print("tokenizer saved at : " + output_dir)
    end = timer()
    print("time elapsed : {:.2f}s".format(end-start))

In [4]:
# Base tokenizer loaded from the "facebook/bart-base" checkpoint; it is passed as
# the `token` argument of generate_tokenizer in the cells below.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")



### HDFS

In [5]:
# HDFS, ID sequences: mode='hdfs' with the default content=False.
generate_tokenizer(
    log_dir=hdfs_data_dir,
    log_file=hdfs_file,
    output_dir=hdfs_output_dir,
    token=tokenizer,
    mode='hdfs',
)

tokenizer saved at : tokenizer/hdfs
time elapsed : 45.70s


In [6]:
# HDFS, content sequences: saved to a "content" sub-directory of the HDFS output.
generate_tokenizer(
    log_dir=hdfs_data_dir,
    log_file=hdfs_content_file,
    output_dir=hdfs_output_dir + "/content",
    token=tokenizer,
    mode='hdfs',
    content=True,
)

tokenizer saved at : tokenizer/hdfs/content
time elapsed : 110.74s


### BGL

In [7]:
# BGL, event IDs: default mode and content=False.
generate_tokenizer(
    log_dir=bgl_data_dir,
    log_file=bgl_file,
    output_dir=bgl_output_dir,
    token=tokenizer,
)

tokenizer saved at : tokenizer/bgl
time elapsed : 10.88s


In [8]:
# BGL, event templates: saved to a "content" sub-directory of the BGL output.
generate_tokenizer(
    log_dir=bgl_data_dir,
    log_file=bgl_content_file,
    output_dir=bgl_output_dir + "/content",
    token=tokenizer,
    content=True,
)

tokenizer saved at : tokenizer/bgl/content
time elapsed : 32.81s


### Thunderbird

In [9]:
# Thunderbird, event IDs: default mode and content=False.
generate_tokenizer(
    log_dir=tbird_data_dir,
    log_file=tbird_file,
    output_dir=tbird_output_dir,
    token=tokenizer,
)

tokenizer saved at : tokenizer/tbird
time elapsed : 25.12s


In [10]:
# Thunderbird, event templates: saved to a "content" sub-directory of the Thunderbird output.
generate_tokenizer(
    log_dir=tbird_data_dir,
    log_file=tbird_content_file,
    output_dir=tbird_output_dir + "/content",
    token=tokenizer,
    content=True,
)

tokenizer saved at : tokenizer/tbird/content
time elapsed : 139.66s


## Sandbox

In [3]:
# Load the labeled HDFS CSV. The literal path equals hdfs_data_dir + hdfs_file
# from the configuration cell.
df = pd.read_csv("datasets/hdfs/output/hdfs_labeled.csv")

In [3]:
# Remove the BlockId and Label columns in a single call.
df = df.drop(columns=["BlockId", "Label"])

In [4]:
# Keep only the EventSequence column; the list selector keeps the result a DataFrame.
df = df[["EventSequence"]]

In [5]:
# Rename the column to "text", the name the batching cells below read.
# The result is reassigned instead of using inplace=True: an in-place rename on a
# frame obtained by column selection can trigger pandas' SettingWithCopyWarning.
df = df.rename(columns={'EventSequence': 'text'})

In [6]:
# Preview the first rows of the single-column frame.
df.head()

Unnamed: 0,text
0,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3..."
1,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3..."
2,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3..."
3,"['9b7aa7a3', '81358cb3', '9b7aa7a3', '9b7aa7a3..."
4,"['9b7aa7a3', '9b7aa7a3', '81358cb3', '9b7aa7a3..."


In [6]:
# Wrap the frame in a datasets.Dataset so it can be sliced by row range below.
dataset = Dataset.from_pandas(df)

In [7]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['text'],
    num_rows: 575061
})

In [8]:
# Number of rows per training batch.
batch_size = 1000
# Eager variant: builds every batch of "text" values up front as a list of lists.
# NOTE(review): none of the cells visible here read all_texts; training below is
# fed from batch_iterator() instead.
all_texts = [dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size)]

In [9]:
def batch_iterator():
    """Lazily yield the "text" values of ``dataset`` in chunks of ``batch_size`` rows.

    Reads the notebook-level ``dataset`` and ``batch_size``; the final chunk
    may be shorter than ``batch_size``.
    """
    chunk_starts = range(0, len(dataset), batch_size)
    for first_row in chunk_starts:
        rows = dataset[first_row : first_row + batch_size]
        yield rows["text"]

In [10]:
# Load the "facebook/bart-base" tokenizer used as the starting point for training.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")



In [11]:
# Check that a fast tokenizer was loaded; train_new_from_iterator is provided by
# the fast tokenizer class.
tokenizer.is_fast

True

In [13]:
# Train a new tokenizer from the batched "text" values, requesting a vocabulary of 50265.
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(),vocab_size = 50265)

In [14]:
# Spot-check: encode the "text" value of the row at index 1 with the new tokenizer.
new_tokenizer(dataset[1]["text"])

{'input_ids': [0, 284, 29, 70, 27, 263, 27, 69, 23, 261, 262, 29, 70, 27, 263, 27, 69, 23, 261, 262, 286, 285, 23, 261, 262, 29, 70, 27, 263, 27, 69, 23, 261, 262, 22, 73, 21, 268, 20, 263, 261, 262, 271, 70, 29, 71, 269, 261, 262, 22, 73, 21, 268, 20, 263, 261, 262, 271, 70, 29, 71, 269, 261, 262, 22, 73, 21, 268, 20, 263, 261, 262, 271, 70, 29, 71, 269, 261, 262, 22, 74, 267, 71, 266, 261, 262, 22, 74, 267, 71, 266, 261, 262, 22, 74, 267, 71, 266, 261, 262, 26, 289, 25, 287, 261, 262, 72, 295, 71, 26, 261, 262, 72, 295, 71, 26, 261, 262, 264, 280, 279, 261, 262, 264, 280, 279, 261, 262, 264, 280, 279, 261, 262, 70, 277, 261, 262, 70, 277, 261, 262, 70, 277, 281, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [15]:
# Save the trained tokenizer files. The literal path equals hdfs_output_dir, so this
# writes to the same directory as the HDFS generate_tokenizer call above.
new_tokenizer.save_pretrained("tokenizer/hdfs")

('tokenizer/hdfs\\tokenizer_config.json',
 'tokenizer/hdfs\\special_tokens_map.json',
 'tokenizer/hdfs\\vocab.json',
 'tokenizer/hdfs\\merges.txt',
 'tokenizer/hdfs\\added_tokens.json',
 'tokenizer/hdfs\\tokenizer.json')