In [None]:
import os
import re
import datetime
import functools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import matplotlib.dates as mdates
import matplotlib.patches as patches
import matplotlib.lines as lines

In [None]:
# Enable tqdm's pandas integration for this kernel.
tqdm.pandas()

In [None]:
# One log record per line. Capture groups:
#   1 - timestamp "YYYY-MM-DD HH:MM:SS,mmm"
#   2 - a bracketed word, brackets included (e.g. "[INFO]")
#   3 - a single word immediately followed by ": "
#   4 - the remainder of the line
log_re = re.compile(r"(\d{4}\-\d{2}\-\d{2} \d{2}\:\d{2}\:\d{2}\,\d{3}) (\[\w+\]) (\w+): (.*)")

In [None]:
def read_line(line,source,fn):
    """Turn one raw log line into a flat record.

    Parameters:
        line: the raw text line, as read from the log file.
        source: label stored unchanged under the "source" key.
        fn: path stored unchanged under the "file" key.

    Returns:
        A dict with keys "date", "level", "content", "source" and "file".
        When the line matches ``log_re``, "date" is the parsed timestamp,
        "level" is the bracketed token (brackets included) and "content" is
        the text after the "<word>: " prefix. When it does not match, "date"
        and "level" are None and "content" is the whole raw line.
    """
    match = log_re.match(line)
    if match is None:
        # Not a record header; keep the raw text so nothing is lost.
        date = None
        level = None
        content = line
    else:
        # The timestamp uses "," before the milliseconds; fromisoformat needs ".".
        date = datetime.datetime.fromisoformat(match.group(1).replace(",","."))
        level = match.group(2)
        # Group 3 is the word in front of the colon; the message itself is group 4.
        content = match.group(4)
    return {"date": date, "level": level, "content":content,"source":source,"file":fn}

def load_logs(crawl_id=""):
    """Yield one parsed record per line of every log file below ``../../logs/<crawl_id>``.

    Parameters:
        crawl_id: sub-folder of ``../../logs/`` to walk; the default "" walks
            the whole log tree.

    Yields:
        The dict produced by ``read_line`` for each line. Its "source" is the
        containing directory's path relative to ``../../logs/`` and its "file"
        is the path of the log file.
    """
    log_root = "../../logs/"
    folder = f"{log_root}{crawl_id}"
    for root,_,files in os.walk(folder):
        print(root)
        # Label records with the directory path relative to the log root.
        source = root[len(log_root):]
        for name in tqdm(files):
            fn = f"{root}/{name}"
            if not os.path.isfile(fn): continue
            with open(fn) as f:
                # Stream the file line by line instead of loading it all with readlines().
                for line in f:
                    yield read_line(line,source,fn)
                
# Drain the load_logs() generator (default crawl_id="", i.e. the whole log tree) into one DataFrame.
logs = pd.DataFrame(load_logs())

In [None]:
# Stacked histogram of record timestamps, one colour per "source" value.
sns.histplot(
    data=logs,
    x="date",
    hue="source",
    multiple="stack",
)
plt.xticks(rotation=45)

In [None]:
# Parallel lists, newest first: cutoffs[i] is the date paired with crawl_ids[i].
cutoffs = [
    datetime.datetime.strptime(day, "%Y-%m-%d")
    for day in ["2021-04-15", "2021-02-01", "2020-12-20", "2020-10-15", "2020-09-01", "2020-03-30"]
]
crawl_ids = [
    "crawl_13",
    "crawl_12",
    "crawl_11",
    "crawl_10",
    "crawl_9",
    "EYG",
]

In [None]:
# Log file names look like ".../<prefix>_<YYYYMMDD-HHMM>.txt":
# group 1 is the prefix, group 2 the timestamp. Raw string so "\d" is not an invalid escape.
fn_re = re.compile(r".*/([a-zA-Z_]+)_(\d{8}-\d{4})\.txt")
def get_crawl_id(crawl_id, fn):
    """Decide which crawl a log file belongs to.

    Parameters:
        crawl_id: label already known for the file's directory; returned as-is
            when it is not the empty string.
        fn: path of the log file, used only when ``crawl_id`` is "".

    Returns:
        ``crawl_id`` when it is non-empty. Otherwise the entry of ``crawl_ids``
        paired with the first entry of ``cutoffs`` that the timestamp in the
        file name is strictly later than. None (after printing a notice) when
        the file name does not match ``fn_re`` or its prefix is not
        "yelp_review_crawl".

    Raises:
        ValueError: the file-name timestamp is not later than any cutoff.
    """
    if crawl_id != "":
        return crawl_id

    match = fn_re.match(fn)
    if match is None or match.group(1) != "yelp_review_crawl": #only want review crawls, not business crawls
        print(f"No match for filename: {fn}")
        return None
    date = datetime.datetime.strptime(match.group(2),"%Y%m%d-%H%M")
    # Separate loop name so the crawl_id parameter is not shadowed.
    for cutoff, candidate in zip(cutoffs, crawl_ids):
        if date > cutoff:
            return candidate
    raise ValueError(
        f"{fn}: log date {date} is not later than any crawl cutoff "
        f"(earliest cutoff: {min(cutoffs, default=None)})"
    )
    

def read_line(line,source,fn):
    """Turn one raw log line into a flat record.

    Parameters:
        line: the raw text line, as read from the log file.
        source: label stored unchanged under the "source" key.
        fn: path stored unchanged under the "file" key.

    Returns:
        A dict with keys "date", "level", "content", "source" and "file".
        When the line matches ``log_re``, "date" is the parsed timestamp,
        "level" is the bracketed token (brackets included) and "content" is
        the text after the "<word>: " prefix. When it does not match, "date"
        and "level" are None and "content" is the whole raw line.
    """
    match = log_re.match(line)
    if match is None:
        # Not a record header; keep the raw text so nothing is lost.
        date = None
        level = None
        content = line
    else:
        # The timestamp uses "," before the milliseconds; fromisoformat needs ".".
        date = datetime.datetime.fromisoformat(match.group(1).replace(",","."))
        level = match.group(2)
        # Group 3 is the word in front of the colon; the message itself is group 4.
        content = match.group(4)
    return {"date": date, "level": level, "content":content,"source":source,"file":fn}

def load_logs(crawl_id=""):
    """Yield one parsed record per line of every log file below ``../../logs/<crawl_id>``.

    Parameters:
        crawl_id: sub-folder of ``../../logs/`` to walk; the default "" walks
            the whole log tree.

    Yields:
        The dict produced by ``read_line`` for each line. Its "source" is the
        value ``get_crawl_id`` returns for the file (the directory path
        relative to ``../../logs/`` when that is non-empty, otherwise a label
        derived from the file name) and its "file" is the path of the log file.

    NOTE(review): files for which ``get_crawl_id`` returns None are still read
    and their records carry source=None. Confirm whether they should be skipped.
    """
    log_root = "../../logs/"
    folder = f"{log_root}{crawl_id}"
    for root,_,files in os.walk(folder):
        dir_crawl_id = root[len(log_root):]
        print(root,dir_crawl_id)
        for name in tqdm(files):
            fn = f"{root}/{name}"
            # Check first, so entries that are skipped are never classified
            # (get_crawl_id may print or raise).
            if not os.path.isfile(fn): continue
            file_crawl_id=get_crawl_id(dir_crawl_id,fn)
            with open(fn) as f:
                # Stream the file line by line instead of loading it all with readlines().
                for line in f:
                    yield read_line(line,file_crawl_id,fn)
                
# Rebuild `logs` from the whole log tree (default crawl_id="") using the load_logs defined in this cell.
logs = pd.DataFrame(load_logs())

In [None]:
# Order the records by their "date" column.
logs = logs.sort_values("date")

In [None]:
# Random subset of at most one million records, in date order.
# The size is capped at len(logs) because DataFrame.sample raises when asked for
# more rows than exist; the fixed seed makes re-runs draw the same subset.
log_sample = logs.sample(n=min(1_000_000, len(logs)), random_state=0).sort_values("date")

In [None]:
# Stacked histogram of record timestamps in `log_sample`, one pastel colour per "source".
ax = sns.histplot(
    data=log_sample,
    x="date",
    hue="source",
    multiple="stack",
    palette="pastel",
)
plt.xticks(rotation=45)
# Anchor the legend's upper-left corner just past the right edge of the axes.
sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1.04,1))

In [None]:
# Stacked histogram of record timestamps in the full `logs` frame, one pastel colour per "source".
ax = sns.histplot(
    data=logs,
    x="date",
    hue="source",
    multiple="stack",
    palette="pastel",
)
plt.xticks(rotation=45)
# Anchor the legend's upper-left corner just past the right edge of the axes.
sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1.04,1))

In [None]:
# Earliest ("start") and latest ("end") record date per source, ordered by start.
# Named aggregation replaces the per-group Python lambda in groupby.apply.
times = (
    logs.groupby("source")["date"]
    .agg(start="min", end="max")
    .sort_values("start")
)

In [None]:
# Display the per-source start/end table.
times

In [None]:
# Maps the "source" labels found in the logs to the names shown in the figure.
name_map = dict(
    [
        ("EYG", "EYG"),
        # "crawl_x*" sources -> UDIS-*
        ("crawl_x0", "UDIS-1"),
        ("crawl_x1", "UDIS-2"),
        ("crawl_x2", "UDIS-3"),
        ("crawl_x3", "UDIS-4"),
        # numbered crawls -> CHI-*
        ("crawl_9", "CHI-0"),
        ("crawl_10", "CHI-1"),
        ("crawl_11", "CHI-2"),
        ("crawl_12", "CHI-3"),
        ("crawl_13", "CHI-4"),
        ("crawl_14", "CHI-5"),
        ("crawl_15", "CHI-6"),
        ("crawl_16", "CHI-7"),
        ("crawl_17", "CHI-8"),
    ]
)

In [None]:
# Leave crawl_9 and crawl_18 out of the timeline.
# errors="ignore" keeps the cell re-runnable: without it a second run, or a log
# tree that lacks one of these crawls, raises KeyError.
times = times.drop(["crawl_9","crawl_18"], errors="ignore")
times

In [None]:
# Swap the raw source labels in the index for their display names;
# labels that are not keys of name_map are left as they are.
times = times.rename(index=name_map)

In [None]:
# Display the table again, now indexed by display name.
times

In [None]:
#Modified from https://stackoverflow.com/a/31163913/582136
# Two-band timeline of the rows in `times`: one rectangle per row spanning
# start..end. Rows whose name contains "UDIS" or equals "EYG" go in the upper
# band (pastel colours); every other row goes in the lower band (tab10 colours).

top_palette = sns.color_palette("pastel")
bottom_palette = sns.color_palette("tab10")

fig = plt.figure()
ax = fig.add_subplot(111)

# Overall time span covered by all rows
startTime = times.start.min()
endTime = times.end.max()

# convert to matplotlib date representation
start = mdates.date2num(startTime)
end = mdates.date2num(endTime)
buffer = 30  # padding added on both sides of the x-range, in date2num units

# Every bar has the same height; only its lower edge depends on the band.
bar_height = 0.8

# Plot rectangles, keeping separate legend entries per band
bottom_ct = 0
top_ct = 0
c_handles = []
c_labels = []
o_handles = []
o_labels = []
for name, row in times.iterrows():
    row_start = mdates.date2num(row.start)
    row_end = mdates.date2num(row.end)
    if 'UDIS' in name or name == "EYG":
        y_start = 1.1
        # Wrap around rather than raise IndexError once the palette is exhausted.
        color = top_palette[top_ct % len(top_palette)]
        top_ct += 1
        handles = o_handles
        labels = o_labels
    else:
        y_start = 0.1
        # Wrap around rather than raise IndexError once the palette is exhausted.
        color = bottom_palette[bottom_ct % len(bottom_palette)]
        bottom_ct += 1
        handles = c_handles
        labels = c_labels
    rect = patches.Rectangle((row_start, y_start), row_end - row_start, bar_height, color=color)
    patch = ax.add_patch(rect)
    handles.append(patch)
    labels.append(name)


# assign date locator / formatter to the x-axis to get proper labels
locator = mdates.AutoDateLocator(minticks=12)
formatter = mdates.AutoDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

# set the limits
plt.xlim([start-buffer, end+buffer])
plt.ylim([0, 2])
plt.xticks(rotation=45)

# Legend lists the upper-band entries first, then the lower-band ones.
handles = o_handles  + c_handles
labels = o_labels  + c_labels
plt.legend(handles,labels,bbox_to_anchor=(0.5,1), loc="lower center",ncol=3)

# The y position only separates the two bands, so hide the axis.
ax.axes.get_yaxis().set_visible(False)

fig.set_figheight(1)

fig.savefig("../../graphs/crawl_timeline.pdf",bbox_inches="tight")

In [None]:
# Scratch lookup: the entry at index 1 of seaborn's "tab10" palette.
sns.color_palette("tab10")[1]