In [7]:
import datetime 
import os

# Directory that get_all_log_entries() lists when looking for '*.log' files.
rootPath = '../../dataset-10/'
# Prefixes that mark the first line of a log entry; any line that does not
# start with one of these is treated as a continuation of the previous entry.
loggingLevels = ['TRACE', 'DEBUG', 'INFO', 'WARN', 'ERROR']
    
def get_all_log_entries(file_prefix):
    """Parse every matching log file under ``rootPath`` into one list.

    A file matches when its name starts with ``file_prefix`` and ends with
    ``.log``.

    Args:
        file_prefix: Required leading part of the file name.

    Returns:
        A list with the entry dicts of all matching files, file by file.
    """
    all_log_entries = []
    # os.listdir() yields names in arbitrary order; sorting keeps the order
    # of the combined result reproducible across runs and platforms.
    for filename in sorted(os.listdir(rootPath)):
        if filename.startswith(file_prefix) and filename.endswith('.log'):
            # os.path.join also works when rootPath has no trailing separator.
            filepath = os.path.join(rootPath, filename)
            all_log_entries.extend(process_file(filepath))
    return all_log_entries
    
def process_file(filepath):
    """Read one log file and return its parsed entries.

    Args:
        filepath: Path of the log file to read.

    Returns:
        A list of entry dicts, in the order the entries appear in the file.
    """
    print('Processing ' + filepath)
    logEntries = []
    # Filled by process_entry() while parsing; it is not returned from here.
    parsedDates = []
    # Read-only mode is enough (the file is never written), and the context
    # manager closes the handle even when parsing a line raises.
    with open(filepath, "r") as f:
        for line in f:
            process_entry(filepath, line, logEntries, parsedDates)
    return logEntries
    
def process_entry(filepath, line, logEntries, parsedDates):
    """Turn one raw line into a new entry or a continuation of the last one.

    A line that starts with one of ``loggingLevels`` opens a new entry and is
    split on whitespace into: level, date, time, thread, logger, one further
    field that is not used, and the message. Any other line is appended to
    the message of the most recent entry.

    Args:
        filepath: Path of the file the line came from (forwarded to
            process_thread_name).
        line: The raw line, including its line terminator.
        logEntries: List of entry dicts; extended or updated in place.
        parsedDates: List forwarded to process_date.
    """
    if not line.startswith(tuple(loggingLevels)):
        append_line_to_last_msg(line, logEntries)
        return

    fields = line.split(None, 6)
    # A short line (e.g. an entry with an empty message) yields fewer than
    # seven fields. Pad with '' so the lookups below cannot raise IndexError;
    # the helper functions already skip fields that are ''.
    fields.extend([''] * (7 - len(fields)))

    newEntry = dict(log_level=fields[0])
    process_date(newEntry, parsedDates, fields)
    process_thread_name(filepath, newEntry, fields)
    process_logger_name(newEntry, fields)
    newEntry['msg'] = fields[6]
    logEntries.append(newEntry)
        
def append_line_to_last_msg(line, logEntries):
    """Append ``line`` to the ``'msg'`` of the last entry, if there is one.

    Args:
        line: Text to append.
        logEntries: List of entry dicts; the last one is updated in place.
            Nothing happens when the list is empty.
    """
    if not logEntries:
        return
    last_entry = logEntries[-1]
    last_entry['msg'] = last_entry['msg'] + line

def process_logger_name(newEntry, splittedLine):
    """Store the last dot-separated part of field 4 as ``'logger_name'``.

    Args:
        newEntry: Entry dict, updated in place.
        splittedLine: Fields of the log line; index 4 holds the logger name.
            Nothing is stored when that last part is empty.
    """
    # Everything after the final '.', or the whole field when it has no '.'.
    short_name = splittedLine[4].rpartition('.')[2]
    if short_name:
        newEntry['logger_name'] = short_name
    
def process_thread_name(filepath, newEntry, splittedLine):
    """Extract the thread name (and possibly a user id) from field 3.

    When ``filepath`` contains ``'timepot'`` the field is split on
    ``'][uid:'``: the first part, with every ``'['`` removed, is the thread
    name, and the second part, with every ``']'`` removed, is stored as
    ``'user_id'`` if it consists only of digits. For any other file the
    first and last character of the field are dropped and the remainder is
    the thread name.

    Args:
        filepath: Path of the file the line came from.
        newEntry: Entry dict, updated in place.
        splittedLine: Fields of the log line; index 3 holds the thread field.
    """
    raw_field = splittedLine[3]
    if raw_field == '':
        return

    if 'timepot' not in filepath:
        plain_name = raw_field[1:-1]
        if plain_name:
            newEntry['thread_name'] = plain_name
        return

    pieces = raw_field.split('][uid:')
    name_part = pieces[0].replace('[', '')
    if name_part:
        newEntry['thread_name'] = name_part
    if len(pieces) > 1:
        uid_part = pieces[1].replace(']', '')
        if uid_part.isdigit():
            newEntry['user_id'] = uid_part
                
def process_date(newEntry, parsedDates, splittedLine):
    """Parse the date and time fields and store them as ``'log_date'``.

    Args:
        newEntry: Entry dict, updated in place.
        parsedDates: List that receives every successfully parsed datetime.
        splittedLine: Fields of the log line; index 1 is the date
            (``YYYY-MM-DD``) and index 2 the time (``HH:MM:SS.ffffff``).
            Nothing is stored when either field is ''.

    Raises:
        ValueError: If the two fields do not match the expected format.
    """
    date_part = splittedLine[1]
    time_part = splittedLine[2]
    if date_part == '' or time_part == '':
        return
    # Join with an explicit separator: gluing the fields together makes
    # '%d%H' ambiguous, e.g. '2020-01-1' + '12:00:00.0' would be read as
    # day 11, hour 2 instead of day 1, hour 12.
    parsedDate = datetime.datetime.strptime(
        date_part + ' ' + time_part, '%Y-%m-%d %H:%M:%S.%f'
    )
    parsedDates.append(parsedDate)
    newEntry['log_date'] = parsedDate