/
db_log_parser.py
131 lines (110 loc) · 4.73 KB
/
db_log_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
from abc import ABC, abstractmethod
from calendar import timegm
from enum import Enum
import glob
import re
import time
NO_COL_FAMILY = 'DB_WIDE'
class DataSource(ABC):
class Type(Enum):
LOG = 1
DB_OPTIONS = 2
TIME_SERIES = 3
def __init__(self, type):
self.type = type
@abstractmethod
def check_and_trigger_conditions(self, conditions):
pass
class Log:
@staticmethod
def is_new_log(log_line):
# The assumption is that a new log will start with a date printed in
# the below regex format.
date_regex = '\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}\.\d{6}'
return re.match(date_regex, log_line)
def __init__(self, log_line, column_families):
token_list = log_line.strip().split()
self.time = token_list[0]
self.context = token_list[1]
self.message = " ".join(token_list[2:])
self.column_family = None
# example log for 'default' column family:
# "2018/07/25-17:29:05.176080 7f969de68700 [db/compaction_job.cc:1634]
# [default] [JOB 3] Compacting 24@0 + 16@1 files to L1, score 6.00\n"
for col_fam in column_families:
search_for_str = '\[' + col_fam + '\]'
if re.search(search_for_str, self.message):
self.column_family = col_fam
break
if not self.column_family:
self.column_family = NO_COL_FAMILY
def get_human_readable_time(self):
# example from a log line: '2018/07/25-11:25:45.782710'
return self.time
def get_column_family(self):
return self.column_family
def get_context(self):
return self.context
def get_message(self):
return self.message
def append_message(self, remaining_log):
self.message = self.message + '\n' + remaining_log.strip()
def get_timestamp(self):
# example: '2018/07/25-11:25:45.782710' will be converted to the GMT
# Unix timestamp 1532517945 (note: this method assumes that self.time
# is in GMT)
hr_time = self.time + 'GMT'
timestamp = timegm(time.strptime(hr_time, "%Y/%m/%d-%H:%M:%S.%f%Z"))
return timestamp
def __repr__(self):
return (
'time: ' + self.time + '; context: ' + self.context +
'; col_fam: ' + self.column_family +
'; message: ' + self.message
)
class DatabaseLogs(DataSource):
def __init__(self, logs_path_prefix, column_families):
super().__init__(DataSource.Type.LOG)
self.logs_path_prefix = logs_path_prefix
self.column_families = column_families
def trigger_conditions_for_log(self, conditions, log):
# For a LogCondition object, trigger is:
# Dict[column_family_name, List[Log]]. This explains why the condition
# was triggered and for which column families.
for cond in conditions:
if re.search(cond.regex, log.get_message(), re.IGNORECASE):
trigger = cond.get_trigger()
if not trigger:
trigger = {}
if log.get_column_family() not in trigger:
trigger[log.get_column_family()] = []
trigger[log.get_column_family()].append(log)
cond.set_trigger(trigger)
def check_and_trigger_conditions(self, conditions):
for file_name in glob.glob(self.logs_path_prefix + '*'):
# TODO(poojam23): find a way to distinguish between log files
# - generated in the current experiment but are labeled 'old'
# because they LOGs exceeded the file size limit AND
# - generated in some previous experiment that are also labeled
# 'old' and were not deleted for some reason
if re.search('old', file_name, re.IGNORECASE):
continue
with open(file_name, 'r') as db_logs:
new_log = None
for line in db_logs:
if Log.is_new_log(line):
if new_log:
self.trigger_conditions_for_log(
conditions, new_log
)
new_log = Log(line, self.column_families)
else:
# To account for logs split into multiple lines
new_log.append_message(line)
# Check for the last log in the file.
if new_log:
self.trigger_conditions_for_log(conditions, new_log)