# Update data

This notebook downlads recent GitHub activity for a number of organizations.

It will extract all issues, PRs, and comments that were updated within a
window of interest. It will then save them to disk as CSV files.

In [1]:
import requests
import pandas as pd
import numpy as np
import os
from datetime import timedelta

In [2]:
fmt = "{:%Y-%m-%d}"

# Can optionally use number of days to choose dates
n_days = 365
end_date = fmt.format(pd.datetime.today())
start_date = fmt.format(pd.datetime.today() - timedelta(days=n_days))
update_data = False

github_orgs = ["jupyterhub", "jupyter", "jupyterlab", "jupyter-widgets", "ipython", "binder-examples", "nteract"]

In [3]:
# Parameters
end_date = "2019-07-23"
start_date = "2019-06-23"
renderer = "kaggle"
update_data = True


In [4]:
if update_data is True:
    # Load in previous data if we have it
    path_prs = './data/prs.csv'
    path_issues = './data/issues.csv'
    path_comments = './data/comments.csv'
    prs_old = pd.read_csv(path_prs, index_col=0)
    issues_old = pd.read_csv(path_issues, index_col=0)
    comments_old = pd.read_csv(path_comments, index_col=0)

    # The latest updated time, we'll update `start_date` so we don't re-download unnecessarily
    latest_date = pd.to_datetime(comments_old['updatedAt'].max())
    start_date = fmt.format(latest_date)

In [5]:
# Calculate number of days to include in plots
n_days = (pd.to_datetime(end_date) - pd.to_datetime(start_date)).days

In [6]:
comments_query = """\
        comments(last: 50) {
          edges {
            node {
              authorAssociation
              createdAt
              updatedAt
              url
              author {
                login
              }
            }
          }
        }
"""

base_elements = """\
        state
        id
        title
        url
        createdAt
        updatedAt
        closedAt
        number
        authorAssociation
        author {
          login
        }
"""

gql_template = """\
{{
  search({query}) {{
    issueCount
    pageInfo {{
        endCursor
        hasNextPage
    }}
    nodes {{
      ... on PullRequest {{
        {base_elements}
        mergedBy {{
          login
        }}
        {comments}
      }}
      ... on Issue {{
        {base_elements}
        {comments}
      }}
    }}
  }}
}}
"""

In [7]:
# Define our query object that we'll re-use for github search
class GitHubGraphQlQuery():
    def __init__(self, query):
        self.query = query
        self.headers = {"Authorization": "Bearer %s" % os.environ['GITHUB_ACCESS_TOKEN']}
        self.gql_template = gql_template

    def request(self, n_pages=100, n_per_page=50):
        self.raw_data = []
        for ii in range(n_pages):
            search_query = ["first: %s" % n_per_page, 'query: "%s"' % self.query, 'type: ISSUE']
            if ii != 0:
                search_query.append('after: "%s"' % pageInfo['endCursor'])

            this_query = self.gql_template.format(
                query=', '.join(search_query),
                comments=comments_query,
                base_elements=base_elements
            )
            request = requests.post('https://api.github.com/graphql', json={'query': this_query}, headers=self.headers)
            if request.status_code != 200:
                raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, this_query))
            if "errors" in request.json().keys():
                raise Exception("Query failed to run with error {}. {}".format(request.json()['errors'], this_query))
            self.request = request

            # Parse the response
            json = request.json()['data']['search']
            if ii == 0:
                print("Found {} items, which will take {} pages".format(json['issueCount'], int(np.ceil(json['issueCount'] / n_per_page))))
            self.raw_data.append(json)
            pageInfo = json['pageInfo']
            self.last_query = this_query
            if pageInfo['hasNextPage'] is False:
                break
        
        if self.raw_data[0]['issueCount'] == 0:
            print("Found no entries for query {}".format(self.query))
            self.data = None
            return
        
        # Add some extra fields
        self.data = pd.DataFrame([jj for ii in self.raw_data for jj in ii['nodes']])
        self.data['author'] = self.data['author'].map(lambda a: a['login'] if a is not None else a)
        self.data['org'] = self.data['url'].map(lambda a: a.split('/')[3])
        self.data['repo'] = self.data['url'].map(lambda a: a.split('/')[4])

In [8]:
def extract_comments(comments):
    list_of_comments = [ii['edges'] for ii in comments]
    comments = [jj['node'] for ii in list_of_comments for jj in ii]
    comments = pd.DataFrame(comments)
    comments['author'] = comments['author'].map(lambda a: a['login'] if a is not None else a)
    
    # Parse some data about the comments
    url_parts = [ii.split('/') for ii in comments['url'].values]
    url_parts = np.array([(ii[3], ii[4], ii[6]) for ii in url_parts])
    orgs, repos, url_parts = url_parts.T

    issue_id = [ii.split('#')[0] for ii in url_parts]
    comment_id = [ii.split('-')[-1] for ii in url_parts]

    # Assign new variables
    comments['org'] = orgs
    comments['repo'] = repos
    comments['issue_id'] = issue_id
    comments['id'] = comment_id
    return comments

# GitHub activity

Jupyter also has lots of activity across GitHub repositories. The following sections contain
overviews of recent activity across the following GitHub organizations:

## Update issues

In [9]:
responses = []
for org in github_orgs:
    query_issues = f"is:issue user:{org} updated:{start_date}..{end_date}"
    ghq = GitHubGraphQlQuery(query_issues)
    ghq.request()
    if ghq.data is None:
        continue
    responses.append(ghq)

issues = pd.concat([ii.data for ii in responses])
issues_comments = issues.pop("comments")
issues_comments = extract_comments(issues_comments)

Found 30 items, which will take 1 pages


Found 32 items, which will take 1 pages


Found 37 items, which will take 1 pages


Found 4 items, which will take 1 pages


Found 7 items, which will take 1 pages


Found 0 items, which will take 0 pages
Found no entries for query is:issue user:binder-examples updated:2019-07-21..2019-07-23


Found 22 items, which will take 1 pages


## Update PRs

In [10]:
responses = []
for org in github_orgs:
    query_prs = f"is:pr user:{org} created:{start_date}..{end_date}"
    ghq = GitHubGraphQlQuery(query_prs)
    ghq.request()
    if ghq.data is None:
        continue
    responses.append(ghq)
    
prs = pd.concat([ii.data for ii in responses])
prs_comments = prs.pop('comments')
prs_comments = extract_comments(prs_comments)

Found 2 items, which will take 1 pages


Found 4 items, which will take 1 pages


Found 9 items, which will take 1 pages


Found 0 items, which will take 0 pages
Found no entries for query is:pr user:jupyter-widgets created:2019-07-21..2019-07-23


Found 3 items, which will take 1 pages


Found 0 items, which will take 0 pages
Found no entries for query is:pr user:binder-examples created:2019-07-21..2019-07-23


Found 2 items, which will take 1 pages


In [11]:
# Add a PR-specific field for closed PRs
prs['mergedBy'] = prs['mergedBy'].map(lambda a: a['login'] if a is not None else None)

## Combine comments

In [12]:
comments = pd.concat([prs_comments, issues_comments])

# Only keep the comments within our window of interest
comments = comments.query('updatedAt > @start_date and updatedAt < @end_date')

## Update the data

In [13]:
if update_data is True:
    comments_new = pd.concat([comments_old, comments]).drop_duplicates(subset=['id'], keep='last').sort_values('createdAt', ascending=False)
    issues_new = pd.concat([issues_old, issues]).drop_duplicates(subset=['id'], keep='last').sort_values('createdAt', ascending=False)
    prs_new = pd.concat([prs_old, prs]).drop_duplicates(subset=['id'], keep='last').sort_values('createdAt', ascending=False)
else:
    comments_new = comments
    issues_new = issues
    prs_new = prs

## Save the data

In [14]:
prs_new.to_csv('./data/prs.csv')
issues_new.to_csv('./data/issues.csv')
comments_new.to_csv('./data/comments.csv')

## View the data

In [15]:
issues_new.head(10)

Unnamed: 0,author,authorAssociation,closedAt,createdAt,id,number,state,title,updatedAt,url,org,repo
0,ericdill,NONE,,2019-07-23T16:25:28Z,MDU6SXNzdWU0NzE3OTUwMzg=,416,OPEN,Adding support for activating conda environmen...,2019-07-23T16:28:45Z,https://github.com/ipython/ipykernel/issues/416,ipython,ipykernel
0,psychemedia,NONE,,2019-07-23T16:18:39Z,MDU6SXNzdWU0NzE3OTE4NTk=,1166,OPEN,Struggling with install - formgrader 404,2019-07-23T16:22:07Z,https://github.com/jupyter/nbgrader/issues/1166,jupyter,nbgrader
0,rkdarst,NONE,,2019-07-23T15:22:52Z,MDU6SXNzdWU0NzE3NjI3MjI=,339,OPEN,Wierdness in ordering of pre_spawn_hook and lo...,2019-07-23T15:22:52Z,https://github.com/jupyterhub/kubespawner/issu...,jupyterhub,kubespawner
0,dmarth,NONE,,2019-07-23T14:24:33Z,MDU6SXNzdWU0NzE3Mjg2NzU=,6874,OPEN,Filebrowser extension: configurable initial path,2019-07-23T14:24:33Z,https://github.com/jupyterlab/jupyterlab/issue...,jupyterlab,jupyterlab
1,vijaysaimutyala,NONE,,2019-07-23T12:53:05Z,MDU6SXNzdWU0NzE2Nzk0OTk=,4778,OPEN,Get the output of the executed cell in Jupyter...,2019-07-23T12:56:30Z,https://github.com/jupyter/notebook/issues/4778,jupyter,notebook
1,JulianTBZ,NONE,,2019-07-23T09:00:40Z,MDU6SXNzdWU0NzE1NzExNzA=,278,OPEN,Tenant-ID,2019-07-23T09:00:40Z,https://github.com/jupyterhub/oauthenticator/i...,jupyterhub,oauthenticator
1,daejong123,NONE,,2019-07-23T05:44:57Z,MDU6SXNzdWU0NzE0OTY5NTQ=,6872,OPEN,How simultaneously executing code in terminal?,2019-07-23T05:44:57Z,https://github.com/jupyterlab/jupyterlab/issue...,jupyterlab,jupyterlab
2,jaipreet-s,NONE,,2019-07-23T03:34:09Z,MDU6SXNzdWU0NzE0NzAwNDg=,8,OPEN,JupyterLab Pull Requests Extension,2019-07-23T03:34:09Z,https://github.com/jupyterlab/team-compass/iss...,jupyterlab,team-compass
0,todo,NONE,,2019-07-23T02:46:51Z,MDU6SXNzdWU0NzE0NTk4MTI=,4499,OPEN,All the `<li>` below that have role button sho...,2019-07-23T02:46:52Z,https://github.com/nteract/nteract/issues/4499,nteract,nteract
2,fomightez,CONTRIBUTOR,,2019-07-23T02:31:30Z,MDU6SXNzdWU0NzE0NTY2MTk=,751,OPEN,runtime.txt for installing R and requirements....,2019-07-23T14:38:50Z,https://github.com/jupyter/repo2docker/issues/751,jupyter,repo2docker


In [16]:
prs_new.head(10)

Unnamed: 0,author,authorAssociation,closedAt,createdAt,id,mergedBy,number,state,title,updatedAt,url,org,repo
0,raethlein,FIRST_TIME_CONTRIBUTOR,,2019-07-23T16:23:16Z,MDExOlB1bGxSZXF1ZXN0MzAwMzc1MTAy,,279,OPEN,Add clarifying comment into README code block,2019-07-23T16:23:16Z,https://github.com/jupyterhub/oauthenticator/p...,jupyterhub,oauthenticator
0,dmarth,FIRST_TIME_CONTRIBUTOR,,2019-07-23T14:30:16Z,MDExOlB1bGxSZXF1ZXN0MzAwMzI1MDgy,,6875,OPEN,accept query parameter to optionally change fi...,2019-07-23T14:30:30Z,https://github.com/jupyterlab/jupyterlab/pull/...,jupyterlab,jupyterlab
1,van-8,MEMBER,,2019-07-23T11:12:31Z,MDExOlB1bGxSZXF1ZXN0MzAwMjQ1Mjg4,,6873,OPEN,Update max CSV size for Firefox 68,2019-07-23T15:34:29Z,https://github.com/jupyterlab/jupyterlab/pull/...,jupyterlab,jupyterlab
2,nibheis,FIRST_TIME_CONTRIBUTOR,,2019-07-23T08:48:43Z,MDExOlB1bGxSZXF1ZXN0MzAwMTg5NzU2,,385,OPEN,Fix remote checkout,2019-07-23T08:48:43Z,https://github.com/jupyterlab/jupyterlab-git/p...,jupyterlab,jupyterlab-git
0,betatim,MEMBER,,2019-07-23T05:35:05Z,MDExOlB1bGxSZXF1ZXN0MzAwMTM0NDYz,,752,OPEN,[WIP] Call parent preassemble scripts methods,2019-07-23T16:24:34Z,https://github.com/jupyter/repo2docker/pull/752,jupyter,repo2docker
0,LittleLightLittleFire,NONE,,2019-07-23T04:40:41Z,MDExOlB1bGxSZXF1ZXN0MzAwMTI1Nzg1,,11833,OPEN,Sets error_before_exec to be the value of the ...,2019-07-23T05:26:16Z,https://github.com/ipython/ipython/pull/11833,ipython,ipython
1,esevan,FIRST_TIME_CONTRIBUTOR,,2019-07-23T01:33:26Z,MDExOlB1bGxSZXF1ZXN0MzAwMDk1ODM3,,4777,OPEN,Attempt to re-establish websocket connection t...,2019-07-23T01:33:26Z,https://github.com/jupyter/notebook/pull/4777,jupyter,notebook
1,choldgraf,MEMBER,2019-07-23T01:30:09Z,2019-07-23T01:29:00Z,MDExOlB1bGxSZXF1ZXN0MzAwMDk1MTk4,choldgraf,191,MERGED,adding team meeting,2019-07-23T01:30:10Z,https://github.com/jupyterhub/team-compass/pul...,jupyterhub,team-compass
3,saulshanabrook,MEMBER,,2019-07-22T21:17:29Z,MDExOlB1bGxSZXF1ZXN0MzAwMDQyNzM3,,6871,OPEN,Add datastore support,2019-07-23T16:01:02Z,https://github.com/jupyterlab/jupyterlab/pull/...,jupyterlab,jupyterlab
4,ian-r-rose,MEMBER,,2019-07-22T20:32:08Z,MDExOlB1bGxSZXF1ZXN0MzAwMDI3NzE2,,6870,OPEN,Fix browser tests,2019-07-23T13:45:26Z,https://github.com/jupyterlab/jupyterlab/pull/...,jupyterlab,jupyterlab


In [17]:
comments_new.head(10)

Unnamed: 0,author,authorAssociation,createdAt,updatedAt,url,org,repo,issue_id,id
393,michaele4321,NONE,2019-07-22T23:57:47Z,2019-07-22T23:57:47Z,https://github.com/jupyterlab/jupyterlab/issue...,jupyterlab,jupyterlab,6827,513998553
525,jmmshn,NONE,2019-07-22T23:39:57Z,2019-07-22T23:39:57Z,https://github.com/jupyter-widgets/pythreejs/i...,jupyter-widgets,pythreejs,282,513994907
193,esevan,CONTRIBUTOR,2019-07-22T23:15:35Z,2019-07-22T23:15:35Z,https://github.com/jupyter/nb2kg/issues/39#iss...,jupyter,nb2kg,39,513989823
28,rgbkrk,MEMBER,2019-07-22T23:03:42Z,2019-07-22T23:03:42Z,https://github.com/nteract/meeting-minutes/pul...,nteract,meeting-minutes,30,513987396
38,vilhelmen,CONTRIBUTOR,2019-07-22T22:57:07Z,2019-07-22T22:57:29Z,https://github.com/jupyterhub/jupyterhub/issue...,jupyterhub,jupyterhub,2598,513986028
281,psychemedia,NONE,2019-07-22T22:02:19Z,2019-07-22T22:02:19Z,https://github.com/jupyter/nbgrader/issues/974...,jupyter,nbgrader,974,513972099
4,jupyterlab-dev-mode,NONE,2019-07-22T21:17:30Z,2019-07-22T21:17:30Z,https://github.com/jupyterlab/jupyterlab/pull/...,jupyterlab,jupyterlab,6871,513958440
280,lwasser,NONE,2019-07-22T21:14:26Z,2019-07-22T21:14:26Z,https://github.com/jupyter/nbgrader/issues/974...,jupyter,nbgrader,974,513957429
350,telamonian,MEMBER,2019-07-22T21:12:09Z,2019-07-22T21:12:09Z,https://github.com/jupyterlab/jupyterlab/issue...,jupyterlab,jupyterlab,6869,513956718
6,ian-r-rose,MEMBER,2019-07-22T21:08:36Z,2019-07-22T21:14:40Z,https://github.com/jupyterlab/jupyterlab/pull/...,jupyterlab,jupyterlab,6870,513955501
