In [2]:
import tensorflow as tf
import numpy as np
import logging
logging.getLogger('tensorflow').disabled = True

tf.compat.v1.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()

2023-01-06 17:03:37.714412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
def batch_iter(data, batch_size, num_epochs, shuffle=False):
    """
    Generates a batch iterator for a dataset.
    """
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
            
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [51]:
import datetime
import os
import time

from tqdm import tqdm

class TextCNN():
    
    def __init__(
        self,
        session,
        num_classes,
        vocab_size,
        embedding_size, 
        filter_sizes, 
        num_filters,
        sequence_length=None,
        l2_reg_lambda=0.0,
        seed=42,
        ) -> None:
        
        self.sess = session
        if seed is not None: tf.random.set_seed(seed)
        self.input_words = tf.compat.v1.placeholder(tf.dtypes.int32, shape=(None, sequence_length), name="input_words_idx")
        self.input_labels = tf.compat.v1.placeholder(tf.dtypes.float32, shape=(None, num_classes), name="input_label")
        self.dropout_keep_prob = tf.compat.v1.placeholder(tf.dtypes.float32, name="dropout_keep_prob")
        
        l2_loss = tf.constant(0.0, dtype=tf.dtypes.float32)
        
        with tf.device('/cpu:0'), tf.name_scope("embedding"): 
            self.embedding_dictionary = tf.Variable(
                tf.random.uniform(shape=(vocab_size, embedding_size), minval=-1., maxval=1.),
                trainable=True,
                name="embedding_dictionary"
            )
            self.embedded_words = tf.nn.embedding_lookup(self.embedding_dictionary, self.input_words)
            self.embedded_words_expanded = tf.expand_dims(self.embedded_words, -1)
            
        pooled_output = list()
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope(f"conv-maxpool-{filter_size}"):
                filter_shape = (filter_size, embedding_size, 1, num_filters)
                kernels = tf.Variable(
                    tf.random.truncated_normal(filter_shape, stddev=0.1),
                    name=f"kernels-{filter_size}"
                )
                bias = tf.Variable(
                    tf.constant(0.1, shape=(num_filters,)),
                    name=f"bias-kernels-{filter_size}"
                )
                conv = tf.nn.conv2d(
                    self.embedded_words_expanded,
                    filters=kernels,
                    strides=(1, 1, 1, 1),
                    padding="VALID",
                    name=f"conv-{filter_size}"
                )
                
                feature_maps = tf.nn.relu(
                    tf.nn.bias_add(conv, bias),
                    name=f"relu-conv-{filter_size}"
                )
                
                if sequence_length is not None:
                    pooled = tf.nn.max_pool(
                        feature_maps,
                        ksize=[1, sequence_length - filter_size + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name="pool"
                    )
                else:
                    pooled = tf.math.reduce_max(
                        feature_maps,
                        axis=1,
                        keepdims=True,
                        name=f"global-max-pooling-conv-{filter_size}"
                    )
                
                pooled_output.append(pooled)
        
        total_num_filters = num_filters * len(filter_sizes)
        self.pooled_feature = tf.concat(pooled_output, axis=3)
        self.pooled_feature_flat = tf.reshape(self.pooled_feature, shape=(-1, total_num_filters))
        
        with tf.name_scope("dropout"):
            self.feature_drop = tf.nn.dropout(self.pooled_feature_flat, self.dropout_keep_prob)
            
        with tf.name_scope("output"):
            weight = tf.compat.v1.get_variable(
                "fc-weight",
                shape=(total_num_filters, num_classes),
                initializer=tf.initializers.glorot_uniform()
            )
            bias = tf.Variable(tf.constant(0.1, shape=(num_classes,), name="fc-bias"))
            l2_loss += tf.nn.l2_loss(weight)
            l2_loss += tf.nn.l2_loss(bias)
            self.fc_output = tf.nn.bias_add(tf.matmul(self.feature_drop, weight, name="fc-weight"), bias=bias, name="fc-bias")
            self.predictions = tf.argmax(self.fc_output, axis=1, name="predictions")
            
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.fc_output, labels=self.input_labels)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
            
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_labels, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
            
    def fit(self, X, y,
            num_epochs=100, 
            batch_size=1, 
            checkpoint_every=None, 
            # keep_summaries=True,
            eval_X = None,
            eval_y = None,
            ):

            assert not ((eval_X is None) ^ (eval_y is None)),\
                "evaluation set must be either None or not None "
                
            if eval_X is None: eval_X = X
            if eval_y is None: eval_y = y
            
            self.global_step = tf.Variable(0, name="global_step", trainable=False)
            # self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
            self.optimizer = tf.compat.v1.train.AdamOptimizer(1e-3)
            grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.train_ops = self.optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)
            
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print(f"Writing to {out_dir}")
            
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            self.train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
            
            with self.train_summary_writer.as_default():
                
                # if keep_summaries:
                grad_summaries = []
                for g, v in grads_and_vars:
                    if g is not None:
                        grad_hist_summary = tf.compat.v1.summary.histogram(f"{v.name}/grad/hist", g)
                        sparsity_summary = tf.compat.v1.summary.scalar(f"{v.name}/grad/sparsity", tf.nn.zero_fraction(g))
                        grad_summaries.append(grad_hist_summary)
                        grad_summaries.append(sparsity_summary)
                grad_summaries_merged = tf.compat.v1.summary.merge(grad_summaries)
                
                loss_summary = tf.compat.v1.summary.scalar("loss", self.loss)
                acc_summary = tf.compat.v1.summary.scalar("accuracy", self.accuracy)
                self.train_summary_op = tf.compat.v1.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
                    
                checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
                checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

                self.sess.run(tf.compat.v1.initialize_all_variables())
                
                batches = batch_iter(list(zip(X, y)), batch_size=batch_size, num_epochs=num_epochs)
                iter_num = (int((len(X)-1)/batch_size) + 1) * num_epochs
                if checkpoint_every is None: checkpoint_every = iter_num - 1
                
                batch_summary = list()
                for batch in tqdm(batches, total=iter_num, desc=f"traing (batch_size={batch_size}, max_epochs={num_epochs}) :"):
                    x_batch, y_batch = zip(*batch)
                    
                    feed_dict = {
                        self.input_words: x_batch,
                        self.input_labels: y_batch,
                        self.dropout_keep_prob: 0.5
                    }
                    _, summary, current_step = self.sess.run([
                                                    self.train_ops,
                                                    self.train_summary_op,
                                                    self.global_step,
                                                    ], feed_dict=feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    batch_summary.append(summary)
                    if (current_step + 1) % checkpoint_every == 0:
                        path = saver.save(self.sess, checkpoint_prefix, global_step=current_step)
                        print(f"Saved model checkpoint to {path}")
                        
                        preds, loss, acc = self.predict(eval_X, eval_y)
                        print(f"{time_str}:{current_step}:: evaluation result: loss = {loss}, acc = {acc}")
                        
                return batch_summary
    
    def predict(self, X, y=None):
        feed_dict = {
                    self.input_words: X,
                    self.dropout_keep_prob: 1
                }
        pipeline = [self.predictions]
        
        if y is not None:
            feed_dict[self.input_labels] = y
            pipeline.extend([self.loss, self.accuracy])

            preds, loss, acc = self.sess.run(pipeline, feed_dict=feed_dict)
            return preds, loss, acc
        else:
            preds = self.sess.run(pipeline, feed_dict=feed_dict)    
            return preds, None, None
        

In [52]:
from typing import NamedTuple

class DataPreprocessor(object):
    
    def __init__(self, record_unknow=False) -> None:
        if record_unknow: self.unknow_words = dict()
    
    def load_word2vec_format(self, 
                             file_path, 
                             unknow_token=None, 
                             unknow_repr=None):
        with open(file_path, "r") as f:
            vec = dict()
            for l in f.readlines():
                data = l.split()
                vec[data[0]] = np.array(data[1:], dtype=np.float32)
            
            f.close()
                
        embedding_size = vec.pop(list(vec.keys())[0])
        if unknow_token is not None:
            try:
                unknow_vec = vec[unknow_token]
            except AttributeError:
                print(f"there's not '{unknow_token}' in dictionary.")
                if unknow_repr:
                    assert len(unknow_repr) == embedding_size, \
                        f"unknown represent vector must same shape with embedding size (expected {embedding_size}, got {len(unknow_repr)})"
                else:
                    unknow_repr = [ 0 for _ in range(embedding_size) ]
                vec[unknow_token] = unknow_repr
            self.unknow_token = unknow_token
        else: self.unknow_token = None
        
        vocab_size = len(vec.keys())    
        self.word_vectors = np.array(list(vec.values()))
        self.words_indices = { word: i for i, word in enumerate(vec.keys()) }
        self.embedding_shape = NamedTuple(vocab_size=vocab_size, embedding_size=embedding_size)
    
    def indice_encode(self, line):
        record_unknow = hasattr(self, "unknow_words")
        words = line.split()
        encoded = list()
        for word in words:
            try:
                i = self.words_indices[word]
            except AttributeError:
                if record_unknow:
                    if word in self.unknow_words.keys(): self.unknow_words[word] += 1
                    else: self.unknow_words[word] = 0
                
                if self.unknow_token is None: continue
                else: i = self.words_indices[self.unknow_token]
            encoded.append(i)
            
        return encoded
    
    def indice_padding(self, batch, padding_token, padding_size=0):
        padding_idx = self.words_indices[padding_token]
        seq_length = np.array([ len(inst) for inst in batch ])
        max_length = seq_length.max()
        padding_size = max(padding_size, max_length)
        
        padding_batch = np.array([ np.pad(inst, (0, padding_size - len(inst)), constant_values=padding_idx) for inst in batch ])
        return padding_batch

In [53]:
input_size = 10000
rand_array = [ 
            [np.random.randint(0, 20) for _ in range(20)] \
                for _ in range(input_size)
            ]
# rand_array = [[11, 6, 5, 2, 4, 12], [4, 6, 6, 16, 0, 9], [5, 19, 10, 5, 1, 14]]
rand_label = np.array([ [1., 0.] \
    # if np.random.randint(0, 2) == 0 else [0., 1.] \
        for _ in range(input_size) ])

In [54]:
eval_size = int(input_size * 0.2)
eval_arr = [ 
            [np.random.randint(0, 20) for _ in range(20)] \
                for _ in range(eval_size)
            ]
# rand_array = [[11, 6, 5, 2, 4, 12], [4, 6, 6, 16, 0, 9], [5, 19, 10, 5, 1, 14]]
eval_label = np.array([ [1., 0.] \
    if np.random.randint(0, 2) == 0 else [0., 1.] \
        for _ in range(eval_size) ])

In [57]:
graph = tf.Graph()
with graph.as_default():
    allow_soft_placement=True
    log_device_placement=False
    
    session_conf = tf.compat.v1.ConfigProto(
    allow_soft_placement=allow_soft_placement,
    log_device_placement=log_device_placement)
    sess = tf.compat.v1.Session(config=session_conf)
    
    with sess.as_default():
        
        model = TextCNN(
            session=sess,
            num_classes=2,
            vocab_size=20,
            embedding_size=6,
            filter_sizes=[2, 3],
            num_filters=2,
        )
        
        sum = model.fit(
            X=rand_array, 
            y=rand_label,
            batch_size=10,
            num_epochs=5,
            checkpoint_every=1000,
            # keep_summaries=True,
            eval_X=eval_arr,
            eval_y=eval_label,
            )

Writing to /Users/62-409/Documents/Log_Detection/final-project-log-anomaly/ipynb/runs/1673002518


traing (batch_size=10, max_epochs=5) ::  22%|██▏       | 1076/5000 [00:02<00:10, 390.35it/s]

Saved model checkpoint to /Users/62-409/Documents/Log_Detection/final-project-log-anomaly/ipynb/runs/1673002518/checkpoints/model-999
2023-01-06T17:55:21.543900:999:: evaluation result: loss = 0.8319565653800964, acc = 0.49950000643730164


traing (batch_size=10, max_epochs=5) ::  42%|████▏     | 2083/5000 [00:04<00:06, 456.98it/s]

Saved model checkpoint to /Users/62-409/Documents/Log_Detection/final-project-log-anomaly/ipynb/runs/1673002518/checkpoints/model-1999
2023-01-06T17:55:23.646555:1999:: evaluation result: loss = 1.0630583763122559, acc = 0.49950000643730164


traing (batch_size=10, max_epochs=5) ::  61%|██████    | 3055/5000 [00:06<00:04, 395.18it/s]

Saved model checkpoint to /Users/62-409/Documents/Log_Detection/final-project-log-anomaly/ipynb/runs/1673002518/checkpoints/model-2999
2023-01-06T17:55:25.624709:2999:: evaluation result: loss = 1.3203043937683105, acc = 0.49950000643730164


traing (batch_size=10, max_epochs=5) ::  81%|████████  | 4056/5000 [00:08<00:02, 364.97it/s]

Saved model checkpoint to /Users/62-409/Documents/Log_Detection/final-project-log-anomaly/ipynb/runs/1673002518/checkpoints/model-3999
2023-01-06T17:55:27.741025:3999:: evaluation result: loss = 1.582446575164795, acc = 0.49950000643730164


traing (batch_size=10, max_epochs=5) :: 100%|██████████| 5000/5000 [00:11<00:00, 452.75it/s]

Saved model checkpoint to /Users/62-409/Documents/Log_Detection/final-project-log-anomaly/ipynb/runs/1673002518/checkpoints/model-4999
2023-01-06T17:55:30.015259:4999:: evaluation result: loss = 1.8297275304794312, acc = 0.49950000643730164



