# Update data

This notebook downlads recent GitHub activity for a number of organizations.

It will extract all issues, PRs, and comments that were updated within a
window of interest. It will then save them to disk as CSV files.

In [3]:
from update_mod import GitHubGraphQlQuery, extract_comments

In [4]:
import requests
import pandas as pd
import numpy as np
import os
from datetime import timedelta
from ipywidgets import widgets
from IPython.display import display
import os.path as op

In [5]:
fmt = "{:%Y-%m-%d}"

# Can optionally use number of days to choose dates
end_date = pd.datetime.today()

github_orgs = ["jupyterhub", "jupyter", "jupyterlab", "jupyter-widgets", "ipython", "binder-examples", "nteract", "voila-dashboards"]
# github_orgs = ['jupyter']

  end_date = pd.datetime.today()


In [6]:
# Calculate number of days to include in plots
end_date = pd.to_datetime(end_date)
end_date_str = fmt.format(end_date)

# GitHub activity

Jupyter also has lots of activity across GitHub repositories. The following sections contain
overviews of recent activity across the following GitHub organizations:

## Update all data

In [7]:
path_data = op.join('..', '..', 'data')

In [8]:
for org in github_orgs:
    print(f'===\n{org}\n===\n\n')
    # Load in previous data if we have it
    path_data_org = op.join(path_data, org)
    path_prs = op.join(path_data_org, 'prs.csv')
    path_issues = op.join(path_data_org, 'issues.csv')
    path_comments = op.join(path_data_org, 'comments.csv')
    prs_old = pd.read_csv(path_prs, index_col=0)
    issues_old = pd.read_csv(path_issues, index_col=0)
    comments_old = pd.read_csv(path_comments, index_col=0)

    # The latest updated time, we'll update `start_date_str` so we don't re-download unnecessarily
    latest_date = pd.to_datetime(comments_old['updatedAt'].max())
    start_date_str = fmt.format(latest_date)
    
    # Issues
    query_issues = f"is:issue user:{org} updated:{start_date_str}..{end_date_str}"
    ghq_issues = GitHubGraphQlQuery(query_issues)
    ghq_issues.request()
    if ghq_issues.data is None:
        continue

    issues = ghq_issues.data
    issues_comments = issues.pop("comments")
    issues_comments = extract_comments(issues_comments)
    
    # Pull Requests
    query_prs = f"is:pr user:{org} created:{start_date_str}..{end_date_str}"
    ghq_prs = GitHubGraphQlQuery(query_prs)
    ghq_prs.request()
    if ghq_prs.data is None:
        continue

    prs = ghq_prs.data
    prs_comments = prs.pop('comments')
    prs_comments = extract_comments(prs_comments)
    
    # Add a PR-specific field for closed PRs
    prs['mergedBy'] = prs['mergedBy'].map(lambda a: a['login'] if a is not None else None)
    
    # Extract the comments
    comments = pd.concat([prs_comments, issues_comments])

    # Only keep the comments within our window of interest
    comments = comments.query('updatedAt > @start_date_str and updatedAt < @end_date_str')

    # Update the data for this org
    comments_new = pd.concat([comments_old, comments]).drop_duplicates(subset=['id'], keep='last').sort_values('createdAt', ascending=False)
    issues_new = pd.concat([issues_old, issues]).drop_duplicates(subset=['id'], keep='last').sort_values('createdAt', ascending=False)
    prs_new = pd.concat([prs_old, prs]).drop_duplicates(subset=['id'], keep='last').sort_values('createdAt', ascending=False)
    
    # Drop all duplicates
    prs_new = prs_new.drop_duplicates(subset=['id'])
    issues_new = issues_new.drop_duplicates(subset=['id'])
    comments_new = comments_new.drop_duplicates(subset=['id'])
    
    # Save the data
    prs_new.to_csv(op.join(path_data_org, 'prs.csv'))
    issues_new.to_csv(op.join(path_data_org, 'issues.csv'))
    comments_new.to_csv(op.join(path_data_org, 'comments.csv'))

===
jupyterhub
===


Found 331 items, which will take 7 pages


IntProgress(value=0, description='Downloading:', max=7)

Found 152 items, which will take 4 pages


IntProgress(value=0, description='Downloading:', max=4)

===
jupyter
===


Found 511 items, which will take 11 pages


IntProgress(value=0, description='Downloading:', max=11)

Found 129 items, which will take 3 pages


IntProgress(value=0, description='Downloading:', max=3)

===
jupyterlab
===


Found 500 items, which will take 10 pages


IntProgress(value=0, description='Downloading:', max=10)

Found 241 items, which will take 5 pages


IntProgress(value=0, description='Downloading:', max=5)

===
jupyter-widgets
===


Found 139 items, which will take 3 pages


IntProgress(value=0, description='Downloading:', max=3)

Found 43 items, which will take 1 pages
===
ipython
===


Found 102 items, which will take 3 pages


IntProgress(value=0, description='Downloading:', max=3)

Found 51 items, which will take 2 pages


IntProgress(value=0, description='Downloading:', max=2)

===
binder-examples
===


Found 2 items, which will take 1 pages
Found 7 items, which will take 1 pages
===
nteract
===


Found 303 items, which will take 7 pages


IntProgress(value=0, description='Downloading:', max=7)

Found 152 items, which will take 4 pages


IntProgress(value=0, description='Downloading:', max=4)

===
voila-dashboards
===


Found 42 items, which will take 1 pages
Found 13 items, which will take 1 pages
