In [5]:
import github_activity as ga
from datetime import date
from dateutil.relativedelta import relativedelta
from myst_nb import glue
import seaborn as sns
import pandas as pd
import numpy as np
import altair as alt
from markdown import markdown
from IPython.display import Markdown
from ipywidgets.widgets import HTML, Tab
from ipywidgets import widgets
from datetime import timedelta
from matplotlib import pyplot as plt
import os.path as op

from warnings import simplefilter
# Silence every warning category for the rest of the session. This also hides
# pandas deprecation and SettingWithCopy warnings raised by later cells.
simplefilter('ignore')

In [14]:
# Notebook parameters: edit these to change what the report covers.

# How many days back from today the activity window reaches.
n_days = 10

# Number of repositories kept when ranking repos by comment count.
top_n_repos = 15

# GitHub organisation whose activity is fetched.
github_org = "jupyterhub"

## Variables

In [9]:
# Resolve "today" once and derive the start from it, so both ends of the
# window come from the same calendar day even if the cell runs across midnight.
stop = date.today()
start = stop - relativedelta(days=n_days)

# Strings for use in queries
start_date = f"{start:%Y-%m-%d}"
stop_date = f"{stop:%Y-%m-%d}"

## Fetch the raw data and load data we need

In [12]:
# Fetch the activity for the organisation starting at `start_date`, and load
# the list of bot account names that should be excluded from the analysis.
raw_data = ga.get_activity(github_org, start_date)
bot_names = pd.read_csv('../../data/bot_names.csv')['names'].tolist()
data = raw_data.copy()

# Prepare our data
# An item is an issue when its URL contains "issues/", otherwise it is a PR.
data["kind"] = data["url"].map(lambda a: "issue" if "issues/" in a else "pr")
# `mergedBy` is either a mapping with a "login" key or missing (None / float NaN);
# keep only the login name.
data["mergedBy"] = data["mergedBy"].map(lambda a: a["login"] if not isinstance(a, (float, type(None))) else None)

# Pull out the comments: one record per comment, tagged with the org, repo and
# id of the item it belongs to. Plain dicts are collected and turned into a
# DataFrame once, instead of building one pd.Series per row.
comment_records = []
for _, irow in data.iterrows():
    for icomment in irow['comments']['edges']:
        icomment = icomment["node"].copy()
        icomment["author"] = icomment["author"]["login"] if icomment["author"] else None
        icomment["org"] = irow["org"]
        icomment["repo"] = irow["repo"]
        icomment["id"] = irow["id"]
        comment_records.append(icomment)
comments = pd.DataFrame(comment_records)
if comments.empty:
    # A window without any comment would give a column-less frame and make the
    # column lookups below fail; keep the columns this cell relies on.
    comments = pd.DataFrame(columns=["author", "createdAt", "url", "org", "repo", "id"])

# Clean up: drop bot authors and duplicated URLs. Each result is a new frame
# rather than an in-place edit of a frame derived from `data`, so the cell can
# be re-run safely and does not trigger SettingWithCopyWarning.
prs = (
    data.query("kind == 'pr'")
    .query("author not in @bot_names")
    .drop_duplicates(subset=["url"])
)
issues = (
    data.query("kind == 'issue'")
    .query("author not in @bot_names")
    .drop_duplicates(subset=["url"])
)
comments = comments.query("author not in @bot_names").drop_duplicates(subset=["url"])

# What are the top N repos, we will only plot these in the full data plots
top_commented_repos = comments.groupby("repo").count().sort_values("createdAt", ascending=False)['createdAt']
use_repos = top_commented_repos.head(top_n_repos).index.tolist()

Running search query:
user:jupyterhub


Found 107 items, which will take 3 pages


Downloading::   0%|          | 0/107 [00:00<?, ?issues/s]

Found 99 items, which will take 2 pages


Downloading::   0%|          | 0/99 [00:00<?, ?issues/s]

## Save data to disk

In [13]:
from pathlib import Path

# Directory that accumulates the activity tables across runs of this notebook.
path_data = Path("../data/")
path_data.mkdir(exist_ok=True)
for kind, idata in [("comments", comments), ("prs", prs), ("issues", issues)]:
    # Use a separate name for the file so `path_data` keeps pointing at the
    # directory after the loop.
    path_csv = path_data / f"{kind}.csv"
    if path_csv.exists():
        # Merge with what was saved by earlier runs. `DataFrame.append` was
        # removed in pandas 2.0; `pd.concat` is the supported equivalent.
        idata = pd.concat([pd.read_csv(path_csv), idata])
    # NOTE(review): the default keep="first" retains the previously saved row
    # when a URL appears in both old and new data — confirm that stale rows
    # should win over freshly fetched ones.
    idata = idata.drop_duplicates(subset=["url"])
    idata.to_csv(path_csv, index=False)