# In [1]:
import functools
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

# In [79]:
def _assert_is_fit(func):
    """Decorate a ``LogParser`` method so it refuses to run before ``fit``.

    The wrapper reads the instance's private ``__is_fit`` flag (stored under
    the name-mangled attribute ``_LogParser__is_fit``) and only forwards the
    call to ``func`` when that flag is truthy.

    Args:
        func: The unbound method to guard. It is called as
            ``func(self, *args, **kwargs)``.

    Returns:
        A wrapper with the same name and docstring as ``func``.

    Raises:
        AssertionError: From the wrapper, when the instance has not been
            fit (or the flag is missing altogether).
    """
    @functools.wraps(func)
    def inner(self, *args, **kwargs):
        # Explicit raise instead of ``assert`` so the guard survives
        # ``python -O``; AssertionError is kept for existing callers.
        # ``getattr`` treats a missing flag as "not fit".
        if not getattr(self, "_LogParser__is_fit", False):
            raise AssertionError("model has not fit.")
        # ``func`` is the plain function captured by the decorator, not an
        # attribute of the instance, so ``self`` must be passed explicitly.
        return func(self, *args, **kwargs)
    return inner

class LogParser(object):
    """Split raw log lines into named fields according to a log format.

    Usage is two-step: call ``fit`` with a format string such as
    ``"<Date> <Time> <Level>: <Content>"`` to build the line regex, then call
    ``parse`` (or ``log_to_dataframe``) with a log file path to get a
    ``pandas.DataFrame`` holding one row per matching line.
    """

    def __init__(self, output_dir="./output") -> None:
        """Store ``output_dir`` and mark the parser as not yet fit."""
        self.output_dir = output_dir
        self.__is_fit = False

    def generate_logformat_regex(self, logformat):
        """Build the regular expression used to split log messages.

        Every ``<Name>`` placeholder in ``logformat`` becomes a non-greedy
        named group ``(?P<Name>.*?)``. The text between placeholders is kept
        as-is (so it is interpreted as regex syntax), except that each run of
        spaces is replaced by ``\\s+``.

        Args:
            logformat: Format string containing ``<Name>`` placeholders.

        Returns:
            A ``(headers, regex)`` tuple: the placeholder names in order of
            appearance, and the compiled pattern anchored with ``^``/``$``.
        """
        headers = []
        pattern = ''

        # Splitting on a capturing group alternates literal text (even
        # indices) with the captured ``<Name>`` placeholders (odd indices).
        for index, piece in enumerate(re.split(r'(<[^<>]+>)', logformat)):
            if index % 2 == 0:
                # Raw string: the replacement template ``\\s+`` expands to
                # the regex token ``\s+``.
                pattern += re.sub(' +', r'\\s+', piece)
            else:
                header = piece.strip('<').strip('>')
                pattern += '(?P<%s>.*?)' % header  # such as (?P<Date>.*?)
                headers.append(header)

        return headers, re.compile('^' + pattern + '$')

    @_assert_is_fit
    def log_to_dataframe(self, log_file):
        """Read ``log_file`` and return its matching lines as a dataframe.

        Each line is stripped and matched against the regex built by ``fit``.
        Lines that do not match are reported on stdout (in red) and skipped.

        Args:
            log_file: Path of the log file to read.

        Returns:
            A ``pandas.DataFrame`` with a 1-based ``LineId`` column followed
            by one column per header. ``LineId`` counts matched lines only,
            so it is not the line number in the file when lines were skipped.
        """
        log_messages = []
        with open(log_file, 'r') as fin:
            for line in tqdm(fin.readlines(), desc="log file reading :"):
                match = self.regex_log_format.search(line.strip())
                if match is None:
                    # Reset the colour afterwards so the red does not bleed
                    # into subsequent terminal output.
                    print(f"\u001b[31mERROR-LINE: {line}\u001b[0m")
                    continue
                log_messages.append(match.groupdict())

        # Passing ``columns`` keeps the header columns present even when no
        # line matched at all.
        logdf = pd.DataFrame(log_messages, columns=self.headers)
        logdf.insert(0, "LineId", range(1, len(log_messages) + 1))
        return logdf

    @_assert_is_fit
    def parse(self, log_file):
        """Parse ``log_file`` and return the result of ``log_to_dataframe``."""
        return self.log_to_dataframe(log_file)

    def fit(self, log_format):
        """Compile ``log_format`` and mark the parser as ready.

        Args:
            log_format: Format string with ``<Name>`` placeholders; see
                ``generate_logformat_regex``.

        Returns:
            ``self``, so calls can be chained.
        """
        self.log_format = log_format
        self.headers, self.regex_log_format = self.generate_logformat_regex(log_format)
        self.__is_fit = True
        return self