# Update data

This notebook downloads recent GitHub activity for a number of organizations.

It will extract all issues, PRs, and comments that were updated within a
window of interest. It will then save them to disk as CSV files.

In [1]:
import requests
import pandas as pd
import numpy as np
import os
from datetime import timedelta

In [10]:
# Date format understood by the GitHub search syntax (YYYY-MM-DD)
fmt = "{:%Y-%m-%d}"

# Can optionally use number of days to choose dates
n_days = 20

# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0, so use
# `pd.Timestamp` instead. The clock is read only once so that both ends of
# the window are derived from the same "today", even if the cell happens to
# run across midnight.
today = pd.Timestamp.today()
end_date = fmt.format(today)
start_date = fmt.format(today - timedelta(days=n_days))

# Organizations whose issues / PRs / comments will be downloaded
github_orgs = ["jupyterhub", "jupyter", "jupyterlab", "jupyter-widgets", "ipython", "binder-examples", "nteract"]

In [11]:
# Length of the window in whole days (used for plots), derived from the
# formatted start / end dates rather than from the configured value
n_days = (pd.Timestamp(end_date) - pd.Timestamp(start_date)).days

In [12]:
# GraphQL fragment that requests the comments attached to an issue or PR.
# `last: 50` asks only for the 50 most recent comments of each item, so older
# comments on longer threads are not requested. For every comment it asks for
# the author association, creation / update timestamps, URL and author login.
# The fragment is substituted for the `{comments}` placeholder of `gql_template`.
comments_query = """\
        comments(last: 50) {
          edges {
            node {
              authorAssociation
              createdAt
              updatedAt
              url
              author {
                login
              }
            }
          }
        }
"""

# GraphQL fields requested for both issues and pull requests. The text is
# substituted for the `{base_elements}` placeholders of `gql_template`.
base_elements = """\
        state
        id
        title
        url
        createdAt
        updatedAt
        closedAt
        number
        authorAssociation
        author {
          login
        }
"""

# Template of the full GraphQL search query. It is filled in with `str.format`,
# which is why literal GraphQL braces are doubled (`{{` / `}}`). Placeholders:
#   {query}         - arguments of the `search(...)` call (page size, search string, cursor)
#   {base_elements} - fields shared by issues and pull requests
#   {comments}      - the comments sub-query
# Pull requests additionally request the login of `mergedBy`.
gql_template = """\
{{
  search({query}) {{
    issueCount
    pageInfo {{
        endCursor
        hasNextPage
    }}
    nodes {{
      ... on PullRequest {{
        {base_elements}
        mergedBy {{
          login
        }}
        {comments}
      }}
      ... on Issue {{
        {base_elements}
        {comments}
      }}
    }}
  }}
}}
"""

In [13]:
# Define our query object that we'll re-use for github search
class GitHubGraphQlQuery():
    """Run a GitHub search through the GraphQL API and tabulate the results.

    Parameters
    ----------
    query : str
        A GitHub search string, e.g. ``"is:issue user:jupyter updated:A..B"``.

    Attributes
    ----------
    query : str
        The search string given at construction.
    headers : dict
        HTTP headers carrying the bearer token read from the
        ``GITHUB_ACCESS_TOKEN`` environment variable.
    gql_template : str
        The GraphQL template that each page request is built from.
    raw_data : list of dict
        One ``search`` payload per downloaded page (set by `request`).
    response : requests.Response
        The last HTTP response received (set by `request`).
    last_query : str
        The GraphQL text of the last page requested (set by `request`).
    data : pandas.DataFrame or None
        One row per issue / PR, with ``author`` flattened to a login and
        ``org`` / ``repo`` columns parsed from the URL; ``None`` when the
        search returned nothing (set by `request`).
    """

    def __init__(self, query):
        self.query = query
        # Raises KeyError if the token is not defined in the environment
        self.headers = {"Authorization": "Bearer %s" % os.environ['GITHUB_ACCESS_TOKEN']}
        self.gql_template = gql_template

    def request(self, n_pages=100, n_per_page=50):
        """Download up to `n_pages` pages of results and build ``self.data``.

        Parameters
        ----------
        n_pages : int
            Maximum number of pages to request.
        n_per_page : int
            Number of search results requested per page.

        Raises
        ------
        Exception
            If the HTTP status is not 200 or the response contains an
            ``errors`` entry.
        """
        self.raw_data = []
        page_info = None  # pageInfo of the previous page; None before the first one
        for ii in range(n_pages):
            search_query = ["first: %s" % n_per_page, 'query: "%s"' % self.query, 'type: ISSUE']
            if page_info is not None:
                # Continue from where the previous page stopped
                search_query.append('after: "%s"' % page_info['endCursor'])

            this_query = self.gql_template.format(
                query=', '.join(search_query),
                comments=comments_query,
                base_elements=base_elements
            )
            response = requests.post('https://api.github.com/graphql', json={'query': this_query}, headers=self.headers)
            if response.status_code != 200:
                raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, this_query))

            # Decode the body once and reuse it for the error check and the parsing
            payload = response.json()
            if "errors" in payload:
                raise Exception("Query failed to run with error {}. {}".format(payload['errors'], this_query))

            # Kept under `response` rather than `request`: assigning to
            # `self.request` would replace this method on the instance and
            # make a second call to `.request()` fail.
            self.response = response
            self.last_query = this_query

            # Parse the response
            search = payload['data']['search']
            if ii == 0:
                print("Found {} items, which will take {} pages".format(search['issueCount'], int(np.ceil(search['issueCount'] / n_per_page))))
            self.raw_data.append(search)
            page_info = search['pageInfo']
            if not page_info['hasNextPage']:
                break

        # `raw_data` is empty when n_pages is 0; treat that like an empty search
        if not self.raw_data or self.raw_data[0]['issueCount'] == 0:
            print("Found no entries for query {}".format(self.query))
            self.data = None
            return

        # Flatten all pages into one table and add some extra fields
        self.data = pd.DataFrame([node for page in self.raw_data for node in page['nodes']])
        self.data['author'] = self.data['author'].map(lambda a: a['login'] if a is not None else a)
        # URLs look like https://github.com/<org>/<repo>/..., so parts 3 and 4
        # of the '/'-split are the organization and the repository
        self.data['org'] = self.data['url'].map(lambda a: a.split('/')[3])
        self.data['repo'] = self.data['url'].map(lambda a: a.split('/')[4])

In [14]:
def extract_comments(comments):
    """Flatten the nested ``comments`` column into one row per comment.

    Parameters
    ----------
    comments : iterable of dict
        One entry per issue / PR, each shaped like
        ``{"edges": [{"node": {...comment fields...}}, ...]}``.

    Returns
    -------
    pandas.DataFrame
        One row per comment holding the fields of each node, with ``author``
        reduced to the login (or left as None when there is no author), plus
        ``org``, ``repo``, ``issue_id`` and ``id`` parsed from the comment
        URL. An empty frame with those columns is returned when there are no
        comments at all.
    """
    nodes = [edge['node'] for thread in comments for edge in thread['edges']]
    if not nodes:
        # Nothing to parse: return an empty table with the usual columns
        # instead of failing on the missing 'author' / 'url' columns below.
        return pd.DataFrame(columns=[
            'authorAssociation', 'createdAt', 'updatedAt', 'url', 'author',
            'org', 'repo', 'issue_id', 'id',
        ])

    table = pd.DataFrame(nodes)
    table['author'] = table['author'].map(lambda a: a['login'] if a is not None else a)

    # Parse some data about the comments. A comment URL split on '/' has the
    # organization at position 3, the repository at position 4 and
    # "<number>#issuecomment-<id>" at position 6.
    url_parts = [url.split('/') for url in table['url']]
    tails = [parts[6] for parts in url_parts]

    # Assign new variables
    table['org'] = [parts[3] for parts in url_parts]
    table['repo'] = [parts[4] for parts in url_parts]
    table['issue_id'] = [tail.split('#')[0] for tail in tails]
    table['id'] = [tail.split('-')[-1] for tail in tails]
    return table

# GitHub activity

Jupyter also has lots of activity across GitHub repositories. The following sections download
recent activity for each of the GitHub organizations listed in `github_orgs` above.

## Update issues

In [15]:
# Run one search per organization; organizations with no matching issues are left out
responses = []
for org in github_orgs:
    query_issues = f"is:issue user:{org} updated:{start_date}..{end_date}"
    ghq = GitHubGraphQlQuery(query_issues)
    ghq.request()
    if ghq.data is not None:
        responses.append(ghq)

# Stack the per-organization tables, then move the nested comments into their own table
issues = pd.concat([response.data for response in responses])
issues_comments = extract_comments(issues.pop("comments"))

Found 167 items, which will take 4 pages
Found 245 items, which will take 5 pages
Found 198 items, which will take 4 pages
Found 64 items, which will take 2 pages
Found 51 items, which will take 2 pages
Found 2 items, which will take 1 pages
Found 190 items, which will take 4 pages


## Update PRs

In [16]:
# Run one search per organization; organizations with no matching PRs are left out.
# NOTE(review): this filters on `created:` whereas the issues search filters on
# `updated:` - confirm that the difference is intentional.
responses = []
for org in github_orgs:
    query_prs = f"is:pr user:{org} created:{start_date}..{end_date}"
    ghq = GitHubGraphQlQuery(query_prs)
    ghq.request()
    if ghq.data is not None:
        responses.append(ghq)

# Stack the per-organization tables, then move the nested comments into their own table
prs = pd.concat([response.data for response in responses])
prs_comments = extract_comments(prs.pop('comments'))

Found 78 items, which will take 2 pages
Found 99 items, which will take 2 pages
Found 94 items, which will take 2 pages
Found 36 items, which will take 1 pages
Found 6 items, which will take 1 pages
Found 1 items, which will take 1 pages
Found 60 items, which will take 2 pages


In [17]:
# Add a PR-specific field for closed PRs: reduce `mergedBy` to the login.
# Only dict values are unpacked, so entries that are missing, or that already
# hold a login because this cell was run before, are left untouched. This
# keeps the cell safe to re-run.
prs['mergedBy'] = prs['mergedBy'].map(lambda a: a['login'] if isinstance(a, dict) else a)

## Combine comments

In [None]:
comments = pd.concat([prs_comments, issues_comments])

# Only keep the comments within our window of interest.
# `updatedAt` holds full ISO timestamps while `start_date` / `end_date` are
# plain dates; comparing them as strings would drop every comment updated on
# `end_date` itself. Compare real timestamps instead and include the whole
# final day, matching the inclusive `start..end` range used in the searches.
updated_at = pd.to_datetime(comments['updatedAt'], utc=True)
window_start = pd.Timestamp(start_date, tz='UTC')
window_end = pd.Timestamp(end_date, tz='UTC') + pd.Timedelta(days=1)
in_window = (updated_at >= window_start) & (updated_at < window_end)
# Positional mask: the concatenated index contains repeated labels
comments = comments.loc[in_window.to_numpy()]

## Save the data

In [None]:
# Create the output folder first: `to_csv` does not create missing directories
os.makedirs('./data', exist_ok=True)

prs.to_csv('./data/prs.csv')
issues.to_csv('./data/issues.csv')
comments.to_csv('./data/comments.csv')

## View the data

In [None]:
# Preview the first 10 rows of the issues table
issues.head(10)

In [None]:
# Preview the first 10 rows of the pull-requests table
prs.head(10)

In [None]:
# Show the comments updated after the start of the window.
# NOTE(review): `comments` was already restricted to the window when it was
# built, so this filter looks redundant, and it displays the whole table
# rather than a preview - confirm whether this cell is still needed.
comments.query('updatedAt > @start_date')

In [None]:
# Preview the first 10 rows of the combined comments table
comments.head(10)