In [4]:
import pandas as pd
import re
import multiprocessing as mp
from pyparsing import (
    Word, Literal, Combine, Optional, Regex, Suppress, restOfLine, ParserElement, StringEnd, MatchFirst, alphas, alphanums
)

# Grammar for one structured log record, e.g.
#   <08:16:53.307 **ERR** DB_ADAP_MGR 5034:5989 0:0>...rest of the record...
#
# Only the space character is skipped between tokens. This call has to come
# before the elements below are constructed, because pyparsing applies the
# default whitespace set to elements at creation time.
ParserElement.setDefaultWhitespaceChars(" ")
# Angle brackets delimit blocks; Suppress matches them but keeps them out of the results.
langle  = Suppress("<")
rangle  = Suppress(">")

# --- Header block: <timestamp log_level tag pid> ---
# Exactly 12 characters drawn from digits, ':' and '.' (the shape of HH:MM:SS.mmm).
timestamp = Word("0123456789:.", exact=12).setResultsName("timestamp")
# Upper-case letters and '*', so decorated levels such as "**ERR**" match as one token.
log_level = Word("ABCDEFGHIJKLMNOPQRSTUVWXYZ*").setResultsName("log_level")
# Starts with a letter, continues with letters, digits or '_'.
tag = Word(alphas, alphanums + "_").setResultsName("tag")
# A run of digits and ':' characters.
pid_part = Word("0123456789:")
# Two pid_part runs separated by a single literal space, merged by Combine into
# one string (e.g. "5034:5989 0:0") and stored under "PID".
pid = Combine(pid_part + Literal(" ") + pid_part).setResultsName("PID")
first_block = langle + timestamp + log_level + tag + pid + rangle

# --- Structured tail: [<id>] <module> [[function]] message ---
# Optional all-digit block in angle brackets.
# NOTE(review): a leading "<digits>" block is always taken as optional_id; if no
# second "<...>" block follows, the structured tail fails as a whole and the
# record presumably falls through to message_multiline -- confirm this is intended.
optional_id_block = Optional(langle + Regex(r"\d+").setResultsName("optional_id") + rangle)
# Mandatory block in angle brackets: anything up to the next '>'.
module = langle + Regex(r"[^>]+").setResultsName("module") + rangle
# Optional block in square brackets: anything up to the next ']'.
function = Optional(Suppress("[") + Regex(r"[^\]]+").setResultsName("function") + Suppress("]"))
# Whatever remains on the current line is stored as "message".
message_basic = Optional(restOfLine.setResultsName("message"))

tail_alternative = optional_id_block + module + function + message_basic
# Fallback tail: everything after the header, newlines included (re.DOTALL), as "message".
message_multiline = Regex(r".*", re.DOTALL).setResultsName("message")

# MatchFirst tries the alternatives in list order: the structured tail first,
# then the catch-all message.
tail = MatchFirst([tail_alternative, message_multiline])
# A full record is the header, one tail alternative, then end of input.
log_parser = first_block + tail + StringEnd()

def parse_line(line):
    """Parse one raw log line into its named fields.

    Surrounding whitespace is removed once, and the stripped text is used both
    for the timestamp check and for parsing. Previously the check ran on the
    raw line while the parser ran on the stripped line, so a record preceded
    by whitespace was misclassified as a plain message.

    Args:
        line: One line of the log file, with or without a trailing newline.

    Returns:
        For a line that does not start with ``<HH:MM:SS.mmm``: a dict with the
        keys ``timestamp``, ``log_level``, ``tag``, ``PID``, ``optional_id``,
        ``module`` and ``function`` set to ``""`` and ``message`` set to the
        stripped line.
        Otherwise: the result of ``log_parser.parseString`` on the stripped
        line. Both return kinds are read with ``.get(name, default)`` by the
        callers in this notebook.

    Raises:
        Whatever ``log_parser.parseString`` raises when a line starts with a
        timestamp but does not match the grammar.
    """
    text = line.strip()
    # Lines without the "<HH:MM:SS.mmm" prefix are not structured records
    # (e.g. continuation lines); keep their text as a bare message.
    if not re.match(r"^<\d{2}:\d{2}:\d{2}\.\d{3}", text):
        return {
            "timestamp": "",
            "log_level": "",
            "tag": "",
            "PID": "",
            "optional_id": "",
            "module": "",
            "function": "",
            "message": text,
        }
    return log_parser.parseString(text, parseAll=True)

def add_to_dataframe(df, result):
    """Append one parsed record to ``df`` in place as a new row.

    Args:
        df: DataFrame with eight columns, in the order timestamp, log level,
            tag, PID, optional id, module, function, message.
        result: Mapping-like object supporting ``.get(name, default)``;
            missing fields are written as empty strings.
    """
    field_order = (
        "timestamp",
        "log_level",
        "tag",
        "PID",
        "optional_id",
        "module",
        "function",
        "message",
    )
    # The new row is labelled with the current row count.
    next_label = len(df)
    df.loc[next_label] = [result.get(field, "") for field in field_order]

def parse_worker(lines):
    """Parse a batch of raw log lines, skipping the ones that fail.

    Args:
        lines: Iterable of raw log lines.

    Returns:
        List with one ``parse_line`` result per successfully parsed line, in
        input order. A line whose parsing raises is reported via ``print``
        and left out of the list.
    """
    parsed_records = []
    for raw_line in lines:
        try:
            record = parse_line(raw_line)
        except Exception as exc:
            print(f"Error: {exc} in line: {raw_line}")
        else:
            parsed_records.append(record)
    return parsed_records

In [62]:
# Paths of the log captures, relative to the notebook's working directory.
logfile_1, logfile_2, logfile_3, logfile_4 = (
    "./IMS.log_20250409081706.log",
    "./IMS.log_20250409081724.log",
    "./IMS.log_20250409082908.log",
    "./IMS.log_20250409085500.log",
)

# dataframe = pd.DataFrame(columns=["Timestamp", "Level", "Tag", "PID", "Optional-ID", "Module", "Function", "Message"])
# i = 0

In [None]:
# Parse logfile_1 in parallel and collect the per-line results in file order.
if __name__ == "__main__":
    with open(logfile_1, "r") as log:
        all_lines = log.readlines()

    n_workers = mp.cpu_count()

    # Split into contiguous slices. Strided slices (all_lines[i::n_workers])
    # would make the flattened result list chunk-by-chunk rather than
    # line-by-line, scrambling the log order and detaching continuation lines
    # from the record they belong to.
    # Ceiling division without importing math; max(1, ...) keeps the range
    # step valid for an empty file.
    chunk_size = max(1, -(-len(all_lines) // n_workers))
    chunks = [
        all_lines[start:start + chunk_size]
        for start in range(0, len(all_lines), chunk_size)
    ]

    # NOTE(review): with a "spawn" start method, worker processes may not be
    # able to import functions defined in notebook cells -- confirm on the
    # target platform.
    with mp.Pool(n_workers) as pool:
        # Pool.map returns results in the order of the input chunks.
        results = pool.map(parse_worker, chunks)

    # Flatten the per-chunk lists; with contiguous chunks this is file order
    # (minus any lines parse_worker skipped).
    all_results = [item for sublist in results for item in sublist]

In [None]:
# (DataFrame column, parser result name) pairs, in column order.
_column_fields = (
    ("Timestamp", "timestamp"),
    ("Log-Level", "log_level"),
    ("Tag", "tag"),
    ("PID", "PID"),
    ("Optional-ID", "optional_id"),
    ("Module", "module"),
    ("Function", "function"),
    ("Message", "message"),
)

# Build every row first and create the DataFrame once, instead of growing it
# row by row. Fields missing from a result become empty strings.
data = []
for result in all_results:
    row = {column: result.get(field, "") for column, field in _column_fields}
    data.append(row)

dataframe = pd.DataFrame(data)

In [5]:
# Sample record used as a manual smoke test of the parser.
log_line = "<08:16:53.307 **ERR** DB_ADAP_MGR 5034:5989 0:0>[operation_callback-DbCouchBaseSdk3.cpp-3010]:procedure = [asbcGetSyncData] cbType = [GET] errorCode = [0x12d] mappedErrorCode = [0xd] errorType = [0] errorString = [LCB_ERR_DOCUMENT_NOT_FOUND (301): Document is not found],KEY = [2477-0-614-2987-5-198115328-1743608335351178,asbcData]"

# Report a parse failure instead of letting the cell raise.
try:
    result = parse_line(log_line)
except Exception as exc:
    print(f"Error: {exc} in line: {log_line}")