#!/usr/bin/python
# -*- coding: utf8 -*-
"""
@author xuruiqi(fanfank@github.com)
@site https://github.com/fanfank/timecat
@date 20160106
@desc timecat is a command line tool that
saves disk I/O when you need to print the
log lines that fall within a given time span.
It uses binary search to locate the start
and end positions directly, then outputs
the content between them.
"""
__version__ = "2.0.0"
import argparse
import copy
import datetime
import os
import re
import time
import traceback
import sys
SLEEP_DURATION = 0.001 # 1ms
MAX_LINE_LENGTH = 1024 * 1024 * 128 # 128MB
enable_color = False
enable_verbose = False
scan_line_num = 0
binary_loop_num = 0
DATETIME_FORMAT_LIST = [
# order matters: more specific formats must be tried first
# tag each format's fields with YEAR, MONTH, DAY ...
{
"desc": "e.g. 2016-01-02 20:13:14.666",
"regex": "(?P<YEAR>\d{4})\D(?P<MONTH>\d{2})\D(?P<DAY>\d{2})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})\.(?P<MICROSECOND>\d{1,3})",
"direct_compare": True,
},
{
"desc": "e.g. 2016-01-02 20:13:14",
"regex": "(?P<YEAR>\d{4})\D(?P<MONTH>\d{2})\D(?P<DAY>\d{2})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})",
"direct_compare": True,
},
{
"desc" : "syslog. e.g. Jan 2 20:13:14",
"regex": "(?P<MONTH>[a-zA-Z]{3})\D(?P<DAY>[ \d]{1}\d{1})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})",
"direct_compare": False,
},
{
"desc" : "e.g. 2016/Jan/02 20:13:14.666",
"regex": "(?P<DAY>\d{2})\D(?P<MONTH>[a-zA-Z]{3})\D(?P<YEAR>\d{4})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})\.(?P<MICROSECOND>\d{1,3})",
"direct_compare": False,
},
{
"desc" : "e.g. 02-Jan-2016 20:13:14",
"regex": "(?P<DAY>\d{2})\D(?P<MONTH>[a-zA-Z]{3})\D(?P<YEAR>\d{4})\D(?P<HOUR>\d{2})\D(?P<MINUTE>\d{2})\D(?P<SECOND>\d{2})",
"direct_compare": False,
},
{
"desc" : "datetime without seperator. e.g. 20160102201314",
#"regex": "\d{14}",
"regex": "(?P<YEAR>\d{4})(?P<MONTH>\d{2})(?P<DAY>\d{2})(?P<HOUR>\d{2})(?P<MINUTE>\d{2})(?P<SECOND>\d{2})",
"direct_compare": True,
},
{
"desc" : "timestamp in microseconds",
"regex": "(?P<MICROTS>\d{13})",
"direct_compare": True,
},
{
"desc" : "timestamp in seconds",
"regex": "(?P<TS>\d{10})",
"direct_compare": True,
},
{
"desc" : "only hour, minute and seconds",
"regex": "(?P<HOUR>\d{2}):(?P<MINUTE>\d{2}):(?P<SECOND>\d{2})",
"direct_compare": True,
},
{
"desc" : "only hour and minutes",
"regex": "(?P<HOUR>\d{2}):(?P<MINUTE>\d{2})",
"direct_compare": True,
},
]
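# Illustrative consequence of the ordering above (hedged example): for a line
# containing "2016-01-02 20:13:14", the second entry matches the full datetime;
# if the "only hour and minutes" entry were tried first, re.search would grab
# just "20:13" and the date part would be ignored.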
MONTH_DICT = {
"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10",
"Nov": "11", "Dec": "12",
"01" : "01", "02" : "02", "03" : "03", "04" : "04", "05" : "05",
"06" : "06", "07" : "07", "08" : "08", "09" : "09", "10" : "10",
"11" : "11", "12" : "12",
"1" : "01", "2" : "02", "3" : "03", "4" : "04", "5" : "05",
"6" : "06", "7" : "07", "8" : "08", "9" : "09",
" 1" : "01", " 2" : "02", " 3" : "03", " 4" : "04", " 5" : "05",
" 6" : "06", " 7" : "07", " 8" : "08", " 9" : "09",
}
DAY_DICT = {
"01": "01", "02": "02", "03": "03", "04": "04", "05": "05",
"06": "06", "07": "07", "08": "08", "09": "09",
" 1": "01", " 2": "02", " 3": "03", " 4": "04", " 5": "05",
" 6": "06", " 7": "07", " 8": "08", " 9": "09",
"10": "10", "11": "11", "12": "12", "13": "13", "14": "14",
"15": "15", "16": "16", "17": "17", "18": "18", "19": "19",
"20": "20", "21": "21", "22": "22", "23": "23", "24": "24",
"25": "25", "26": "26", "27": "27", "28": "28", "29": "29",
"30": "30", "31": "31",
}
CMP_OP_DICT = {
">" : lambda x, y: x > y,
"<" : lambda x, y: x < y,
"=" : lambda x, y: x == y,
"==": lambda x, y: x == y,
">=": lambda x, y: x >= y,
"<=": lambda x, y: x <= y,
"!=": lambda x, y: x != y,
}
def color(content, cl = "green"):
if not enable_color:
return content
if cl == "green":
return "\x1B[0;32;40m{}\x1B[0m".format(content)
elif cl == "red":
return "\x1B[0;31;40m{}\x1B[0m".format(content)
else:
return content
def ts2dt(ts, dt_format="%Y-%m-%d %H:%M:%S"):
return datetime.datetime.fromtimestamp(ts).strftime(dt_format)
def dt2ts(dt, dt_format="%Y-%m-%d %H:%M:%S"):
# return an int so that str(dt2ts(...)) yields pure digits; time.mktime
# returns a float, and its trailing ".0" would corrupt the digit-string
# comparisons assembled in get_bi_cmp_func2
return int(time.mktime(time.strptime(dt, dt_format)))
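# Illustrative round trip (local-time dependent; the values below assume a
# UTC+8 environment, matching the '1451736794' sample in the -s help text):
#   dt2ts("2016-01-02 20:13:14") -> 1451736794
#   ts2dt(1451736794)            -> "2016-01-02 20:13:14"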
def init_parser(target_parser):
target_parser.add_argument(
"-s", "--start-datetime", dest = "start",
required = True,
help = "Which datetime to start(includsive). " \
"e.g. " \
"\"-s '2016-01-02 20:13:14'\", " \
"\"-s '2016/01/02 20:13:14'\", " \
"\"-s '2016-01-02T12:13:14'\", " \
"\"-s '2016-01-02T12:13:14.000'\", " \
"\"-s '02/Jan/2016:20:13:14'\", " \
"\"-s '02-Jan-2016 20:13:14'\", " \
"\"-s '02 Jan 2016 20:13:14'\", " \
"\"-s 'Jan 2 20:13:14'\", " \
"\"-s '20160102201315'\", " \
"\"-s '1451736794'\", " \
"\"-s '20:13'\", etc. We will exhaust " \
"our effort to cover regular datetime formats. " \
"The format of -s and -e params do not need to " \
"be consistent with the datetime format in the file.")
target_parser.add_argument(
"-e", "--end-datetime", dest = "end",
default = None,
help = "Stop after reaching this datetime(excludsive). " \
"Same format as \"-s\". If not set, means output " \
"till the end of file.")
target_parser.add_argument(
"-d", "--date", dest = "date",
default = None,
help = "This is an optional argument. With \"-d\", the " \
"following two statements are essentially the same: " \
"\"timecat -s '2016-01-02 20:13:14' -e '2016-01-02 20:14:13' ...\" " \
"and \"timecat -d '2016-01-02' -s '20:13:14' -e '20:14:13' ...\"." )
#target_parser.add_argument(
# #DEPRECATED
# "-r", "--regex-format", dest = "regex_format",
# default = None,
# help = "If timecat failes to detect datetime format in " \
# "your log file, you can specify the regex pattern " \
# "that can find your datetime within each log line. " \
# "e.g. I have format \"2016:01:01-20-13-14\", and " \
# "timecat does not recognize this datetime format, " \
# "then I can specify " \
# "\"-r '\d{4}:\d{2}:\d{2}-\d{2}-\d{2}-\d{2}'\".")
target_parser.add_argument(
"--lpms", dest="lpms",
type=int, default=-1,
help="Reading speed control, "\
"after how many lines read should the program " \
"sleep for 1ms, default to -1, meaning no limit.")
target_parser.add_argument(
"-v", "--verbose", dest = "enable_verbose",
action = "store_true",
default = False,
help = "print additional information")
target_parser.add_argument(
"--color", dest = "enable_color",
action = "store_true", default = False,
help = "Whether to enable colorized output")
target_parser.add_argument(
"file", nargs = "+",
help = "files to be timecat.")
def dtcmp(lhs, rhs, format_info, cmp_op):
"""compare datetime
Params:
lhs # left hand side
rhs # right hand side
format_info = {
"regex" # the regular expression
"parser" # the compiled regular expression
"direct_compare" # indicates whether lhs and
# rhs can be compared directly
}
cmp_op # ">", "<", "==", ">=", "<=" or "!="
Authors: xuruiqi
"""
global MONTH_DICT
global CMP_OP_DICT
cmpf = CMP_OP_DICT[cmp_op]
if cmp_op in ["=", "!="] or format_info["direct_compare"]:
return cmpf(lhs, rhs)
else:
lhs_match = format_info["parser"].search(lhs)
rhs_match = format_info["parser"].search(rhs)
lhs_groupdict = lhs_match.groupdict()
rhs_groupdict = rhs_match.groupdict()
dtcomponent_list = ["YEAR", "MONTH", "DAY", "HOUR",
"MINUTE", "SECOND", "MILISECOND", "MICROSECOND"]
for dtcomponent in dtcomponent_list:
lhs_component = lhs_groupdict.get(dtcomponent, None)
rhs_component = rhs_groupdict.get(dtcomponent, None)
if dtcomponent == "MONTH" and lhs_component:
lhs_component = MONTH_DICT[lhs_component]
rhs_component = MONTH_DICT[rhs_component]
if lhs_component != rhs_component:
return cmpf(lhs_component, rhs_component)
if cmp_op in [">", "<"]:
return False
else:
return True
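# Illustrative example (hedged): with the syslog format above, whose
# "direct_compare" is False, fields are compared one by one, e.g.
#   dtcmp("Jan  2 20:13:14", "Feb  1 08:00:00", syslog_format_info, "<")
# maps "Jan"/"Feb" through MONTH_DICT to "01"/"02" and returns True at the
# first differing component (the month). syslog_format_info here is a
# hypothetical name for the dict produced by detect_datetime_format() for a
# syslog line. For direct_compare formats, the raw strings are compared as-is.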
def get_bi_cmp_func(file_format_info, param_format_info):
"""return compare function(s) according to file_format_info
Authors: xuruiqi
"""
def ge(lhs, rhs):
return dtcmp(lhs, rhs, file_format_info, ">=")
def le(lhs, rhs):
return dtcmp(lhs, rhs, file_format_info, "<=")
if file_format_info["order"] == "ascending":
return ge
else:
return le
def get_bi_cmp_func2(file_format_info, cmp_pattern_format_info, cmp_pattern):
"""a refined function of get_bi_cmp_func since timecat 2.x
Authors: xuruiqi
"""
global DAY_DICT
global MONTH_DICT
# unify the datetime formats from file and user input
file_sample_match = file_format_info["parser"].search(
file_format_info["sample"])
cmp_pattern_match = cmp_pattern_format_info["parser"].search(
cmp_pattern)
def year_adapt_func(match):
#TODO(xuruiqi) have to consider global time
mgd = match.groupdict()
return mgd.get("YEAR", "1970") + \
str(MONTH_DICT[mgd.get("MONTH", "01")]) + \
str(DAY_DICT[mgd.get("DAY", "01")]) + \
mgd.get("HOUR", "00") + \
mgd.get("MINUTE", "00") + \
mgd.get("SECOND", "00") + \
"%03d" % int(mgd.get("MICROSECOND", "000"))
def month_adapt_func(match):
mgd = match.groupdict()
return str(MONTH_DICT[mgd.get("MONTH", "01")]) + \
str(DAY_DICT[mgd.get("DAY", "01")]) + \
mgd.get("HOUR", "00") + \
mgd.get("MINUTE", "00") + \
mgd.get("SECOND", "00") + \
"%03d" % int(mgd.get("MICROSECOND", "000"))
def hour_adapt_func(match):
mgd = match.groupdict()
return mgd.get("HOUR", "00") + \
mgd.get("MINUTE", "00") + \
mgd.get("SECOND", "00") + \
"%03d" % int(mgd.get("MICROSECOND", "000"))
adapt_func_dict = {
"MICROTS": lambda x: x,
"TS" : lambda x: x + "000",
"YEAR" : year_adapt_func,
"MONTH" : month_adapt_func,
"HOUR" : hour_adapt_func,
}
pattern_type_conversion_dict = {
"MICROTS": "MICROTS",
"TS" : "MICROTS",
"YEAR" : "YEAR",
"MONTH" : "MONTH",
"HOUR" : "HOUR",
}
fsm_groupdict = file_sample_match.groupdict()
cpm_groupdict = cmp_pattern_match.groupdict()
final_pattern_type = None
file_pattern_adapt_func = None
for key in ["MICROTS", "TS", "YEAR", "MONTH", "HOUR"]:
if fsm_groupdict.get(key, None):
final_pattern_type = pattern_type_conversion_dict[key]
file_pattern_adapt_func = adapt_func_dict[key]
break
if final_pattern_type is None:
raise Exception("Invalid datetime format in file")
# Convert cmp_pattern to final pattern format
# Firstly, convert to MICROTS type
if cpm_groupdict.get("MICROTS", None):
pass
elif cpm_groupdict.get("TS", None):
cmp_pattern = cmp_pattern + "000"
elif cpm_groupdict.get("YEAR", None):
year_pattern = adapt_func_dict["YEAR"](cmp_pattern_match)
cmp_pattern = str(dt2ts(year_pattern[:-3], "%Y%m%d%H%M%S")) + \
year_pattern[-3:]
elif cpm_groupdict.get("MONTH", None):
# this situation is a little bit complicated
# When the file is in a TS/MICROTS/YEAR format
# we have to guess the year of the cmp pattern
if final_pattern_type == "MONTH" or final_pattern_type == "HOUR":
year_pattern = adapt_func_dict["YEAR"](cmp_pattern_match)
cmp_pattern = str(dt2ts(year_pattern[:-3], "%Y%m%d%H%M%S")) + \
year_pattern[-3:]
elif final_pattern_type == "YEAR" or final_pattern_type == "MICROTS":
month_pattern = adapt_func_dict["MONTH"](cmp_pattern_match)
# extract year
if final_pattern_type == "YEAR":
year = file_pattern_adapt_func(file_sample_match)[0:4]
elif final_pattern_type == "MICROTS":
# the file sample is a millisecond timestamp; take its
# first 10 digits (whole seconds) to recover the year
year = \
ts2dt(int(file_sample_match.group(0)[:10]), "%Y")
cmp_pattern = \
str( \
dt2ts(year + month_pattern[:-3], \
"%Y%m%d%H%M%S")) + \
month_pattern[-3:]
elif cpm_groupdict.get("HOUR", None):
# this situation is a little bit complicated
# When file is in a TS/MICROTS/YEAR format
# we have to guess the year, month and day
# of the cmp pattern
if final_pattern_type == "HOUR":
year_pattern = adapt_func_dict["YEAR"](cmp_pattern_match)
cmp_pattern = str(dt2ts(year_pattern[:-3], "%Y%m%d%H%M%S")) + \
year_pattern[-3:]
elif final_pattern_type == "YEAR" or final_pattern_type == "MICROTS":
hour_pattern = adapt_func_dict["HOUR"](cmp_pattern_match)
# extract year month day
if final_pattern_type == "YEAR":
year_month_day = file_pattern_adapt_func(file_sample_match)[0:8]
elif final_pattern_type == "MICROTS":
year_month_day = \
ts2dt(int(file_sample_match.group(0)[:10]), "%Y%m%d")
cmp_pattern = \
str( \
dt2ts(year_month_day + hour_pattern[:-3], \
"%Y%m%d%H%M%S")) + \
hour_pattern[-3:]
#TODO(xuruiqi) Later we have to consider global time
# Convert from MICROTS to final_pattern_type
if final_pattern_type == "MICROTS":
# do nothing
pass
else:
dtformat = "%Y%m%d%H%M%S"
if final_pattern_type == "HOUR":
final_pattern_type = "%H%M%S"
elif final_pattern_type == "MONTH":
final_pattern_type = "%m%d%H%M%S"
cmp_pattern = ts2dt(int(cmp_pattern[0:10]), dtformat) + \
cmp_pattern[-3:]
# Finish unifying the datetime format from user input and file
# Return corresponding compare function
def ge(line_match):
adapt_pattern = file_pattern_adapt_func(line_match)
return adapt_pattern >= cmp_pattern
def le(line_match):
adapt_pattern = file_pattern_adapt_func(line_match)
return adapt_pattern <= cmp_pattern
if file_format_info["order"] == "ascending":
return ge
else:
return le
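# Illustrative normalization performed above (hedged example): if the file's
# sampled datetime looks like "2016-01-02 20:13:14.666" (a YEAR-type format)
# and the user passed -s '20:14' (an HOUR-type pattern), the start pattern
# borrows the date from the file sample and becomes the digit string
# "20160102201400000", which ge()/le() then compare against
# year_adapt_func(line_match), e.g. "20160102201314666" for the sampled line.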
def at_line_head(f):
"""judge if f's reading pointer is at the head of a line
Authors: xuruiqi
"""
if f.tell() == 0:
return True
else:
f.seek(f.tell() - 1)
return f.read(1) == "\n"
def locate_current_line(f, ed, st_bound,
backward_step_hint = 1024 * 4):
"""return the start position of line of ed. this triggers
a backward search of line, and the search postion must
stay in [st_bound, ed], inclusively.
if no corresponding line is found, then st_bound is returned
Authors: xuruiqi
TODO(xuruiqi) set a limit to the max length of a line
"""
global scan_line_num
f.seek(0, os.SEEK_END)
eof_pos = f.tell()
if ed == eof_pos:
return ed
# old_pos marks the position from which (inclusive) all data has
# already been read and need not be read again in this function
old_pos = min(ed + 1, eof_pos)
backward_step = backward_step_hint
while old_pos != st_bound:
new_pos = old_pos - backward_step
if new_pos < st_bound:
new_pos = st_bound
f.seek(new_pos)
# collect all lines in [new_pos, old_pos)
lines = []
while f.tell() < old_pos:
line = f.readline(old_pos - f.tell())
lines.append(line)
scan_line_num += 1
# if only one line (maybe not a complete line) is read,
# check whether the first character of this "line" is really
# at the head of a line
if len(lines) == 1:
f.seek(0 - len(lines[-1]), os.SEEK_CUR)
if at_line_head(f):
return f.tell()
# multiple lines read: directly set the position to the
# head of the last line
else:
f.seek(0 - len(lines[-1]), os.SEEK_CUR)
return f.tell()
# not done, continue to look for the head of a line
old_pos = new_pos
# no line found, return st_bound directly
f.seek(st_bound)
return st_bound
def locate_next_line(f, st, ed_bound, forward_step = 1024 * 4):
"""in range [st, ed_bound), find the position of the head
of a line, starting from st. If no such position found,
ed_bound is returned
Authors: xuruiqi
TODO(xuruiqi) add a limit to the max length of a line
"""
global scan_line_num
f.seek(st)
# check if st is already at a line head
if not at_line_head(f):
# read until new line character or ed_bound is reached
step = min(forward_step, ed_bound - f.tell())
while f.tell() != ed_bound and f.readline(step):
scan_line_num += 1
if at_line_head(f):
return f.tell()
return f.tell()
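# Illustrative behaviour (hedged), with f open on a hypothetical 12-byte file
# "aaa\nbbbb\ncc\n" whose line heads sit at offsets 0, 4 and 9:
#   locate_current_line(f, ed=6, st_bound=0)  -> 4  (head of the line containing offset 6)
#   locate_next_line(f, st=6, ed_bound=12)    -> 9  (head of the next line at or after offset 6)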
def forward_match(f, st, ed, regprog, ed_inclusive = True,
forward_step_hint = 1024 * 4):
"""read until regprog matches a line or excceed ed
if a match is found, then set f's reading pointer to
the corresponding line head, else to ed
return the match obj, the last matched line, and the
position of the head of the matched line
Authors: xuruiqi
Return: match # the matched object
line # the last read line
(may not be a complete line)
f.tell() # the head of the line
"""
global MAX_LINE_LENGTH
global scan_line_num
if ed_inclusive:
f.seek(ed)
f.readline()
ed = f.tell()
scan_line_num += 1
f.seek(st)
match = None
line = None
while f.tell() < ed:
scan_line_num += 1
# try to read a complete line
line = ""
while len(line) < MAX_LINE_LENGTH and f.tell() < ed and \
(len(line) == 0 or line[-1] != "\n"):
line += f.readline(
min(ed - f.tell(), forward_step_hint))
if len(line) > MAX_LINE_LENGTH:
sys.stderr.write(
color(
"line too long, excceeds {} bytes\n".format(
MAX_LINE_LENGTH),
cl = "red"))
return None, None, None
match = regprog.search(line)
if match:
f.seek(0 - len(line), os.SEEK_CUR)
return match, line, f.tell()
f.seek(ed)
return match, line, f.tell()
def backward_match(f, ed, st, regprog, backward_step_hint = 1024 * 4):
"""backward read until regprog matches a line or excceed st
if the matched line is found, then locate f's reading pointer
to the corresponding head of the line, else locate to st
return the matched obj, the last read line and the head
position of the line
NOTE: ed is not read
Authors: xuruiqi
Return: match # the matched object
line # last line read(maybe not a complete line)
f.tell() # the head position of the line
TODO(xuruiqi) add a limit to max length of the line
"""
global scan_line_num
f.seek(ed)
if f.tell() < st:
return None, None, None
match = None
line = None
old_pos = f.tell()
# cache backward-read content in case we fail to read a whole
# line during a loop round
last_buffer = ""
backward_step = backward_step_hint
while (not match) and (old_pos > st):
new_pos = old_pos - backward_step
if new_pos < st:
new_pos = st
f.seek(new_pos)
lines = []
while f.tell() < old_pos:
scan_line_num += 1
line = f.readline(old_pos - f.tell())
lines.append(line)
f.seek(new_pos)
valid_start_index = 0
if len(lines) == 1:
# when len(lines) == 1, there may be the following
# possibilities:
# 1. no newline character is read
# 1.1. the head of the read data is the head of
# a line
# 1.2. the head of the read data is not the
# head of a line
# 2. newline character is read
# 2.1. the head of the read data is the head of
# a line
# 2.2. the head of the read data is not the
# head of a line
# note that 1.1 is handled the same way as 2.1: concatenate
# the read content with last_buffer and treat the result as
# a complete line to be searched below
#
# likewise 1.2 is handled the same way as 2.2: concatenate
# the read content with last_buffer, then keep the result in
# last_buffer for the next round
if at_line_head(f):
lines[0] = lines[0] + last_buffer
last_buffer = ""
else:
last_buffer = lines[0] + last_buffer
lines = []
else:
# when len(lines) != 1, there may be the following
# possibilities:
# 1. lines[0] is not a complete line
# 2. lines[0] is a complete line
# we can judge by checking if the first character
# of lines[0] is at line head
lines[-1] = lines[-1] + last_buffer
last_buffer = ""
if not at_line_head(f) and new_pos != st:
# lines[0] is not a complete line,
# nor does it start at position st
last_buffer = lines[0]
valid_start_index = 1
if new_pos == st and len(last_buffer) > 0:
# new_pos == st means the loop will end
# after this round, so we have to handle
# data in last_buffer
lines.append(last_buffer)
total_lines_length = 0
for line in lines:
total_lines_length += len(line)
# handle data from this round
cur_lines_length = 0
for index in reversed(range(valid_start_index, len(lines))):
line = lines[index]
cur_lines_length += len(line)
match = regprog.search(line)
if match:
# locate f's reading pointer
f.seek(
total_lines_length \
- cur_lines_length \
+ new_pos)
return match, line, f.tell()
# update old_pos
old_pos = new_pos
f.seek(st)
return None, line, f.tell()
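# Illustrative contrast between forward_match() and backward_match() (hedged):
# on the hypothetical file "aaa\nbbbb\ncc\n" with prog = re.compile("bbbb"),
#   forward_match(f, 0, 12, prog)   returns (<match>, "bbbb\n", 4)
#   backward_match(f, 12, 0, prog)  returns (<match>, "bbbb\n", 4)
# both leaving f's read pointer at offset 4, the head of the matched line.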
def binary_seek_pos(f, st, ed, cmp_pattern, param_format_info,
file_format_info):
"""use binary search to find the first line that is bigger/smaller
than the cmp_pattern when file is in ascending/descending order
Authors: xuruiqi
"""
global scan_line_num
global binary_loop_num
# record the valid read pointer range
st_bound = st
ed_bound = ed
# locate st to the start of the next line
# unless st is already at the start of the current
# line
#st = locate_next_line(f, st, ed_bound)
# locate ed to the start of the current line
# unless ed is at eof
ed = locate_current_line(f, ed, st_bound)
# get compare function according to file_format_info
# ae => after or equal to
# when file is in ascending order, ae == ">="
# when file is in descending order, ae == "<="
# NOTE: you can read ae this way: it indicates that
# the left hand side parameter stands after, or has
# the same value as, the right hand side parameter
ae = get_bi_cmp_func2(file_format_info, param_format_info, cmp_pattern)
#ae = get_bi_cmp_func(file_format_info, param_format_info)
# start doing binary search
regprog = file_format_info["parser"]
while st < ed:
binary_loop_num += 1
mid = st + (ed - st) // 2
# read until regprog matches the line
f.seek(mid)
match, line, res_pos = forward_match(f, mid, ed, regprog)
if match:
# check whether the matched pattern is after or
# equal to the cmp_pattern
if ae(match):
if res_pos == ed:
# in case this causes an infinite loop, search
# one line backwards and compare
# NOTE: if we do not handle the res_pos == ed
# situation, we may encounter an infinite loop.
# Say only 2 lines are left, the 1st line has
# 10 bytes and the 2nd line has 100 bytes;
# then "mid" will always land within the
# 2nd line, and if the pattern in the 2nd line
# happens to be after or equal to the
# cmp_pattern, an infinite loop occurs, because
# "ed" will not change in the next round
match, line, back_res_pos = backward_match(
f, mid, st, regprog)
if not match or back_res_pos == res_pos:
# this means only one line left, and it
# covers positions st and ed, just return
# the res_pos
return res_pos
elif back_res_pos == st:
# this means only two lines left, and
# they cover positions st and ed. just
# compare and decide which to return
if ae(match):
return st
else:
return res_pos
else:
if ae(match):
ed = back_res_pos
else:
st = back_res_pos
else:
ed = res_pos
else:
if res_pos == st:
# this means st and ed must be covered
# by the same line, just return st/res_pos
return st
st = res_pos
else:
# forward search did not find any valid line,
# try backward search
match, line, res_pos = backward_match(f, mid, st, regprog)
if not match:
# the whole file does not contain any valid line
return None
# found one valid line, compare with cmp_pattern
if ae(match):
ed = res_pos
else:
# this line and the lines follow, until ed,
# all locate before the target cmp_pattern,
# thus return ed directly
return ed
return None if st > ed else ed
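# Illustrative example (hedged): for an ascending file whose lines are
#   "2016-01-02 20:13:14 a\n"   (offset 0)
#   "2016-01-02 20:13:15 b\n"   (offset 22)
#   "2016-01-02 20:13:16 c\n"   (offset 44)
# binary_seek_pos(f, 0, 66, "2016-01-02 20:13:15", ...) should return 22,
# the head of the first line whose datetime is >= the cmp_pattern (the two
# format-info args come from detect_datetime_format/detect_file_format).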
def detect_datetime_format(pattern, param_format = None):
"""detect datetime format of a pattern
Authors: xuruiqi
Returns:
{
"regex": , # string type of regex format
"direct_compare": , # whether it's able to
# directly compare two time patterns
"parser": , # compiled regex object
"sample": , # the pattern itself
"is_global_time": , # indicate whether it is UTC time
}
"""
global DATETIME_FORMAT_LIST
for datetime_format in DATETIME_FORMAT_LIST:
match = re.search(datetime_format["regex"], pattern)
if match:
datetime_format_info = copy.deepcopy(datetime_format)
datetime_format_info["parser"] = re.compile(
datetime_format["regex"])
if datetime_format.get("direct_compare", None) is None:
datetime_format_info["direct_compare"] = True
datetime_format_info["sample"] = pattern
#datetime_format_info["is_global_time"] = True \
# if pattern.find("T") >= 0 \
# else False
if pattern.find("T") >= 0:
datetime_format_info["is_global_time"] = True
return datetime_format_info
return None
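# Illustrative result (hedged): for a line such as
#   "2016-01-02 20:13:14.666 [INFO] hello world"
# detect_datetime_format() returns the first entry of DATETIME_FORMAT_LIST
# (desc "e.g. 2016-01-02 20:13:14.666") with "direct_compare" True, a compiled
# "parser", "sample" set to the whole input string, and "is_global_time"
# False (the line contains no 'T').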
def detect_file_format(f, start, end, param_format_info):
"""detect whether file is arranged in ascending order or
descending order, and detect the datetime format of file
Authors: xuruiqi
Returns:
{
"regex": , # string type of regex format
"direct_compare": , # whether it's able to
# directly compare two time patterns
"parser": , # compiled regex object
"order": , # the value is 'ascending' or 'descending',
indicating the order of the file
"sample": , # the pattern itself
}
"""
global scan_line_num
original_seek_pos = f.tell()
file_format_info = {
"regex" : None,
"direct_compare": None,
"parser": None,
"order" : None,
"sample": None,
}
# sample the file to determine datetime format in file
try:
MAX_READLINE_NUM = 100
COUNT_THRESHOLD = 3
f.seek(0)
current_format_info = None
current_count = 0
for i in xrange(0, MAX_READLINE_NUM):
scan_line_num += 1
line = f.readline()
if not line:
break
tmp_format_info = detect_datetime_format(line)
if tmp_format_info is None:
continue
if current_format_info is None or \
tmp_format_info["regex"] == current_format_info["regex"]:
current_count += 1
if current_format_info is None:
current_format_info = tmp_format_info
else:
current_count -= 1
if current_count <= 0:
current_count = 1
current_format_info = tmp_format_info
if current_count >= COUNT_THRESHOLD:
break
# can not return None, because some file may have very few lines
#if current_count < COUNT_THRESHOLD:
# return None
except Exception as ex:
raise ex
finally:
f.seek(original_seek_pos)
file_format_info["direct_compare"] = current_format_info["direct_compare"]
file_format_info["regex"] = current_format_info["regex"]
file_format_info["parser"] = current_format_info["parser"]
file_format_info["sample"] = current_format_info["sample"]
# if both start and end are provided, we assume the file order
# is the same as start -> end
if end and dtcmp(start, end, param_format_info, "<"):
file_format_info["order"] = "ascending"
return file_format_info
elif end and dtcmp(start, end, param_format_info, ">"):
file_format_info["order"] = "descending"
return file_format_info
elif end and start == end:
return None
# sample the file to determine file order
try:
MAX_READLINE_NUM = 1000
first_datetime = None
second_datetime = None
# read from head
f.seek(0)
for i in xrange(0, MAX_READLINE_NUM):
scan_line_num += 1
line = f.readline()
if not line:
return None
match = file_format_info["parser"].search(line)
if match:
first_datetime = match.group(0)
break
if not first_datetime:
return None
# read from tail
first_pos = f.tell()
f.seek(0, os.SEEK_END)
last_second_pos = f.tell()
for i in xrange(0, MAX_READLINE_NUM):
if last_second_pos <= first_pos:
return None
# locate the head of the previous line, searching backwards
last_second_pos = locate_current_line(
f,
last_second_pos - 1,
first_pos - 1)
f.seek(last_second_pos)
scan_line_num += 1
line = f.readline()
# no need to check whether we reached EOF here; it cannot happen
match = file_format_info["parser"].search(line)
if match:
second_datetime = match.group(0)
break
if not second_datetime:
return None
# compare first_datetime and second_datetime
if dtcmp(first_datetime, second_datetime, file_format_info, "<"):
file_format_info["order"] = "ascending"
return file_format_info
elif dtcmp(first_datetime, second_datetime, file_format_info, ">"):
file_format_info["order"] = "descending"
return file_format_info
else:
return None
except Exception as ex:
raise ex
finally:
f.seek(original_seek_pos)
def handle_read(cmd_namespace):
"""
Authors: xuruiqi
"""
global SLEEP_DURATION
filepath_list = getattr(cmd_namespace, "file")
lpms = getattr(cmd_namespace, "lpms")
start = getattr(cmd_namespace, "start")
end = getattr(cmd_namespace, "end")
param_format_info = detect_datetime_format(start)
if not param_format_info:
sys.stderr.write(
color(
"Error: can not detect datetime format of the -s/--start-datetime param.\n\n",
cl = "red"))
return 1
for filepath in filepath_list:
try:
if not os.path.isfile(filepath):
sys.stderr.write(
color(
"Error: path [{}] does not exist.\n\n".format(filepath),
cl = "red"))
continue
global binary_loop_num
global scan_line_num
binary_loop_num = 0
scan_line_num = 0
with open(filepath, "r") as f:
# detect order of lines, then select a compare function
# this is necessary because datetime in files may be in
# ascending or descending order
file_format_info = detect_file_format(
f, start, end, param_format_info)
if not file_format_info:
sys.stderr.write(
color(
"Error: can not detect datetime format/order in " \
"file [{}].\n\n".format(filepath),
cl = "red"))
continue
# get current file size
start_pos = 0
f.seek(0, os.SEEK_END)
end_pos = f.tell()
# get start read position of the file
start_read_pos = binary_seek_pos(
f, start_pos, end_pos,
start,
param_format_info,
file_format_info)
if start_read_pos is None:
sys.stderr.write(
color(
"Error: no matching start line for reading.\n\n",
cl = "red"))
continue
# get end read position of the file
if not end:
end_read_pos = end_pos
else:
end_read_pos = binary_seek_pos(
f, start_read_pos, end_pos,
end,
param_format_info,
file_format_info)
if end_read_pos is None:
sys.stderr.write(
color(
"Error: no matching end line for reading." \
"\n\n",
cl = "red"))
continue
# print verbose message
global enable_verbose
if enable_verbose:
sys.stdout.write(
color(
"[{}] after {} binary search loop, " \
"and scaning {} lines, start and end " \
"positions located\n".format(
filepath,
binary_loop_num,
scan_line_num)))
sys.stdout.write(
color(
"start read pos:[{}], end read pos:[{}].\n".format(
start_read_pos,
end_read_pos)))
sys.stdout.write(color("lpms:[{}]\n".format(lpms)))
sys.stdout.write(color("---------------\n"))
# start reading
f.seek(start_read_pos)
line_count = 0
while f.tell() < end_read_pos:
line = f.readline()
if f.tell() > end_read_pos:
break
sys.stdout.write(line)
# reading speed control
line_count += 1
if lpms > 0 and \
line_count >= lpms:
line_count = 0
time.sleep(SLEEP_DURATION)
if enable_verbose:
sys.stdout.write(color("---------------\n"))
sys.stdout.write(color("[{}] finish read. " \
"current f.tell() value:[{}], " \
"end read pos:[{}]. exit.\n\n".format(
filepath,
f.tell(),
end_read_pos)))
except Exception as ex:
sys.stderr.write(color("Exception in file [%s]: %r\n" % (filepath, ex), "red"))
return 0
def main():
"""
Authors: xuruiqi
"""
global enable_color
global enable_verbose
try:
# parse command line options
parser = argparse.ArgumentParser(
description = "Usage: " \
"timecat -s '2016-01-02 20:13:14' -e '2016-01-02 20:14:13'" \
" LOGFILE1.log LOGFILE2.log ... " \
"-s and -e params can be in any datetime format, " \
"it is not necessary to be consistent with " \
"datetime format in file.")
init_parser(parser)
cmd_namespace = parser.parse_args()
enable_color = getattr(cmd_namespace, "enable_color")
enable_verbose = getattr(cmd_namespace, "enable_verbose")
if getattr(cmd_namespace, "date"):
setattr(cmd_namespace, "start",
"{} {}".format(
getattr(cmd_namespace, "date"),
getattr(cmd_namespace, "start")))
if getattr(cmd_namespace, "end"):
setattr(cmd_namespace, "end",
"{} {}".format(
getattr(cmd_namespace, "date"),
getattr(cmd_namespace, "end")))
return handle_read(cmd_namespace)
except Exception as ex:
errmsg = "Unknown exception=[{}], traceback=[{}]".format(
repr(ex),
repr(traceback.format_exception(*sys.exc_info())))
sys.stderr.write(color(errmsg + "\n", cl = "red"))
raise ex
if __name__ == "__main__":
try:
sys.exit(main())
except Exception as ex:
sys.exit(-255)