In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import re
import dateutil
import os
import datetime

# Directory that contains this notebook; every "system.log*" file below it is analysed.
notebook_path = os.path.abspath("tombstone-analysis.ipynb")


# ERROR line logged when a read is aborted for scanning too many tombstones.
# ("partion" is spelled exactly as it appears in the log message being matched.)
error_regex = re.compile(r"ERROR \[(?P<thread>.*)\] (?P<date>.{10} .{12}) *(?P<source_file>[^:]*):(?P<source_line>[0-9]*) - Scanned over (?P<tombstones>[0-9]*) tombstones during query '(?P<query>.*)' \(last scanned row partion key was \((?P<pk>.*)\)\); query aborted")
# WARN line logged when a read crosses the tombstone warning threshold.
warn_regex = re.compile(r"WARN  \[(?P<thread>.*)\] (?P<date>.{10} .{12}) *(?P<source_file>[^:]*):(?P<source_line>[0-9]*) - Read (?P<live>[0-9]*) live rows and (?P<tombstones>[0-9]*) tombstone cells for query (?P<query>.*) \(see tombstone_warn_threshold\)")

# Log entries older than this are counted in `c` but otherwise ignored.
start_time = datetime.datetime(2021, 5, 30)


def round_to_minute(moment):
    """Return `moment` rounded to the nearest whole minute (may roll over to the next day)."""
    since_midnight = datetime.timedelta(
        hours=moment.hour,
        minutes=moment.minute,
        seconds=moment.second,
        microseconds=moment.microsecond,
    )
    whole_minutes = round(since_midnight.total_seconds() / 60)
    midnight = datetime.datetime.combine(moment.date(), datetime.time(0))
    return midnight + datetime.timedelta(minutes=whole_minutes)


def node_name_from_path(log_path, depth=4):
    """Return the path component `depth` levels from the end of `log_path`.

    With the default depth the log is assumed to live at
    ``<node>/<dir>/<dir>/system.log`` -- TODO confirm against the real layout.
    Falls back to the log's parent directory when the path is too shallow,
    instead of raising IndexError.
    """
    # Normalise the platform separator so the lookup also works on Windows.
    parts = log_path.replace(os.sep, "/").split("/")
    if len(parts) >= depth:
        return parts[-depth]
    return os.path.dirname(log_path)


def merge_counts(target, source):
    """Add every value of `source` onto the matching key of `target`, in place."""
    for key, value in source.items():
        target[key] = target.get(key, 0) + value


def scan_log_lines(lines, query_totals, since=None):
    """Aggregate tombstone WARN/ERROR entries from an iterable of log lines.

    Parameters:
        lines: iterable of log lines (for example an open text file).
        query_totals: dict updated in place with the number of aborted
            (overflow) occurrences per query string.
        since: entries with a timestamp before this datetime are skipped;
            defaults to the module-level `start_time`.

    Returns:
        (overflow, tombstones_by_date, warn, matched) where the three dicts
        are keyed by minute-rounded datetime and `matched` is the number of
        regex matches, including entries skipped for being too old.
    """
    if since is None:
        since = start_time
    overflow = {}
    tombstones_by_date = {}
    warn = {}
    matched = 0
    for line in lines:
        warn_match = warn_regex.search(line)
        if warn_match:
            matched += 1
            stamp = datetime.datetime.strptime(warn_match.group('date'), "%Y-%m-%d %H:%M:%S,%f")
            if stamp < since:
                continue
            minute = round_to_minute(stamp)
            scanned = float(warn_match.group('tombstones'))
            tombstones_by_date[minute] = tombstones_by_date.get(minute, 0.0) + scanned
            warn[minute] = warn.get(minute, 0) + 1
        overflow_match = error_regex.search(line)
        if overflow_match:
            matched += 1
            stamp = datetime.datetime.strptime(overflow_match.group('date'), "%Y-%m-%d %H:%M:%S,%f")
            if stamp < since:
                continue
            minute = round_to_minute(stamp)
            scanned = float(overflow_match.group('tombstones'))
            tombstones_by_date[minute] = tombstones_by_date.get(minute, 0.0) + scanned
            overflow[minute] = overflow.get(minute, 0) + 1
            query = overflow_match.group('query')
            query_totals[query] = query_totals.get(query, 0) + 1
    return overflow, tombstones_by_date, warn, matched


def analyze_logs(log_paths, since=None):
    """Parse every log in `log_paths` and aggregate the results per node.

    Several files can belong to the same node (e.g. rotated ``system.log.1``
    next to ``system.log``); their per-minute counts are summed rather than
    letting the last file parsed replace the earlier ones.

    Returns:
        (overflow_queries, overflow_by_node, tombstones_by_node,
        warn_by_node, matched_total). The per-node dicts map node name to a
        {minute: value} dict ordered chronologically.
    """
    queries = {}
    overflows = {}
    tombstones = {}
    warns = {}
    matched_total = 0
    for path in log_paths:
        # errors="replace": a stray undecodable byte must not abort the whole run.
        with open(path, errors="replace") as handle:
            overflow, tombstones_by_date, warn, matched = scan_log_lines(handle, queries, since)
        matched_total += matched
        node = node_name_from_path(path)
        for per_node, per_file in ((overflows, overflow), (tombstones, tombstones_by_date), (warns, warn)):
            if per_file:
                merge_counts(per_node.setdefault(node, {}), per_file)

    def chronological(per_node):
        # Files are not visited in time order, so sort each merged series for plotting.
        return {name: dict(sorted(series.items())) for name, series in per_node.items()}

    return queries, chronological(overflows), chronological(tombstones), chronological(warns), matched_total


# Collect every file whose name contains "system.log" below the notebook's directory.
system_logs = []
for root, dirs, files in os.walk(os.path.dirname(notebook_path), topdown=False):
    for name in files:
        if "system.log" in name:
            system_logs.append(os.path.join(root, name))

# c is the total number of matched WARN/ERROR lines across all logs.
overflow_queries, overflow_by_node, tombstones_by_node, warn_by_node, c = analyze_logs(system_logs)

overflow_queries_pd = pd.DataFrame(list(overflow_queries.items()), columns=['queries','overflows'])
print(c)

In [None]:
# Plot, for every node, the number of tombstone-overflow errors per minute.
fig, ax = plt.subplots(figsize=(20, 5))
for node, overflow in overflow_by_node.items():
    if not overflow:
        continue
    overflow_pd = pd.DataFrame(list(overflow.items()), columns=['time', 'overflow'])
    # sort_index keeps the line moving left to right even if buckets were recorded out of order.
    overflow_pd.set_index("time").sort_index().overflow.plot(kind='line', label=node, grid=True, ax=ax)
# pandas labels the x-axis with the index name ("time") on each plot call,
# so the descriptive label must be applied after plotting to survive.
ax.set_xlabel('tombstone overflows by time')
headers, labels = ax.get_legend_handles_labels()
if headers:
    ax.legend(headers, labels, bbox_to_anchor=(1, 1), loc="upper left")
    plt.show()
else:
    # Nothing was plotted: discard the figure instead of rendering an empty chart.
    plt.close(fig)

In [None]:
# Plot, for every node, the total number of tombstones scanned per minute.
fig, ax = plt.subplots(figsize=(20, 5))
# The previous formatter ('%,g' % x) is not a valid printf-style format and raised
# ValueError whenever pandas rendered a float; str.format supports the "," grouping flag.
pd.set_option('display.float_format', '{:,g}'.format)
for node, ts in tombstones_by_node.items():
    if not ts:
        continue
    tombstones_pd = pd.DataFrame(list(ts.items()), columns=['time', 'total_tombstone'])
    # sort_index keeps the line moving left to right even if buckets were recorded out of order.
    tombstones_pd.set_index("time").sort_index().total_tombstone.plot(kind='line', label=node, grid=True, ax=ax)
# pandas labels the x-axis with the index name ("time") on each plot call,
# so the descriptive label must be applied after plotting to survive.
ax.set_xlabel('total tombstones scanned by minute')
headers, labels = ax.get_legend_handles_labels()
if headers:
    ax.legend(headers, labels, bbox_to_anchor=(1, 1), loc="upper left")
    plt.show()
else:
    # Nothing was plotted: discard the figure instead of rendering an empty chart.
    plt.close(fig)

In [None]:
# Plot, for every node, the number of queries that logged a tombstone warning per minute.
fig, ax = plt.subplots(figsize=(20, 5))
# The previous formatter ('%,g' % x) is not a valid printf-style format and raised
# ValueError whenever pandas rendered a float; str.format supports the "," grouping flag.
pd.set_option('display.float_format', '{:,g}'.format)
for node, ts in warn_by_node.items():
    if not ts:
        continue
    warnings_pd = pd.DataFrame(list(ts.items()), columns=['time', 'queries'])
    # sort_index keeps the line moving left to right even if buckets were recorded out of order.
    warnings_pd.set_index("time").sort_index().queries.plot(kind='line', label=node, grid=True, ax=ax)
# pandas labels the x-axis with the index name ("time") on each plot call,
# so the descriptive label must be applied after plotting to survive.
ax.set_xlabel('total queries with tombstone warnings by minute')
headers, labels = ax.get_legend_handles_labels()
if headers:
    ax.legend(headers, labels, bbox_to_anchor=(1, 1), loc="upper left")
    plt.show()
else:
    # Nothing was plotted: discard the figure instead of rendering an empty chart.
    plt.close(fig)