This notebook is designed to get information about a GitHub user from the GraphQL API.

Assumptions / requirements:
- `gh_key`: assumes you have a file in this directory called `gh_key` with your API token. You can change this and get your token from elsewhere in the cell below.
- `start_date` and `end_date`: these are specified in the cell below and cannot be more than 1 year apart. You can run an additional query if you need more than 1 year of data.
- `user_login`: a variable that specifies the user that you want to gather data about in the cell below.

In [248]:
# Copyright 2022 VMware, Inc.
# SPDX-License-Identifier: BSD-2-Clause

import requests
import json
import pandas as pd
from pandas import json_normalize
from datetime import datetime

# Load the API token from the first line of the local `gh_key` file.
with open('gh_key', 'r') as key_file:
    api_token = key_file.readline().rstrip() # remove newline & trailing whitespace

# GraphQL endpoint and the authorization header sent with each request.
url = 'https://api.github.com/graphql'
headers = dict(Authorization='token {}'.format(api_token))

# Query window and target user; these become the GraphQL variables.
start_date = datetime(year=2021, month=1, day=1).isoformat() #isoformat required for json serialization
end_date = datetime(year=2021, month=12, day=31).isoformat()
user_login = 'jberkus'
variables = dict(user_login=user_login, start_date=start_date, end_date=end_date)

In [249]:
def run_query(url, headers, variables, make_query, timeout=30):
    '''Run one GraphQL query and return the decoded JSON response.

    Parameters
    ----------
    url : str
        GraphQL endpoint the query is POSTed to.
    headers : dict
        HTTP headers for the request (e.g. the Authorization header).
    variables : dict
        GraphQL variables sent alongside the query text.
    make_query : callable
        Zero-argument callable that returns the query text.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        The decoded JSON body of the response. A response that carries both
        "data" and "errors" (partial result) is returned unchanged.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    RuntimeError
        If the response reports "errors" and contains no "data" at all.
    '''
    query = make_query()
    r = requests.post(
        url=url,
        json={'query': query, 'variables': variables},
        headers=headers,
        timeout=timeout,  # without a timeout a stalled connection blocks the cell forever
    )
    # Fail here with the HTTP status instead of later with a KeyError on ['data'].
    r.raise_for_status()
    json_data = r.json()

    # GraphQL reports query failures in an "errors" list. Only treat it as
    # fatal when nothing usable came back; partial results are left to callers.
    if isinstance(json_data, dict) and json_data.get('data') is None and json_data.get('errors'):
        raise RuntimeError('GraphQL query failed: {}'.format(json_data['errors']))

    return json_data

# Basic Info

Get basic information about a user and the number of various types of contributions they have made during the specified time.

In [250]:
def basic_info_query():
    '''Return the GraphQL query text for a user's profile and contribution totals.

    The query declares the variables $user_login, $start_date and $end_date;
    the contribution totals are restricted to that date range.
    '''
    query_text = """query basic_info_user_query($user_login: String!, $start_date: DateTime!, $end_date: DateTime!){
             user(login: $user_login) {
                name
                bio
                location
                company
                twitterUsername
                websiteUrl
                contributionsCollection(from: $start_date, to: $end_date){
                    totalCommitContributions
                    totalIssueContributions
                    totalPullRequestContributions
                    totalPullRequestReviewContributions
                    totalRepositoriesWithContributedCommits
                    totalRepositoriesWithContributedIssues
                    totalRepositoriesWithContributedPullRequestReviews
                    totalRepositoriesWithContributedPullRequests
                }
            }
            }"""
    return query_text

# Run the basic-info query, then display the `user` object of the response
# (the last expression of the cell is what the notebook renders).
basic_info_json_data = run_query(url, headers, variables, basic_info_query)
basic_info_json_data['data']['user']

{'name': 'Josh Berkus',
 'bio': 'Twitter: @fuzzychef\r\nKubernetes Slack: jberkus\r\nIRC.freenode.net: jberkus',
 'location': 'Portland, OR',
 'company': 'Red Hat',
 'twitterUsername': None,
 'websiteUrl': 'http://jberkus.github.io',
 'contributionsCollection': {'totalCommitContributions': 280,
  'totalIssueContributions': 65,
  'totalPullRequestContributions': 96,
  'totalPullRequestReviewContributions': 28,
  'totalRepositoriesWithContributedCommits': 31,
  'totalRepositoriesWithContributedIssues': 23,
  'totalRepositoriesWithContributedPullRequestReviews': 14,
  'totalRepositoriesWithContributedPullRequests': 24}}

# Pull Requests

Get a list of a user's pull requests during the specified time with details about the PRs.

In [251]:
def pr_info_query(after_cursor = None):
    '''Return the GraphQL query text for one page of a user's pull request contributions.

    Each page requests up to 100 contributions plus `pageInfo` for pagination.

    after_cursor -- the `endCursor` of the previous page; a falsy value
                    (the default) requests the first page by sending `null`.
    '''
    # Render the cursor as a quoted string, or as a bare null for the first page.
    if after_cursor:
        cursor_literal = '"{}"'.format(after_cursor)
    else:
        cursor_literal = "null"

    template = """query pr_info_query($user_login: String!, $start_date: DateTime!, $end_date: DateTime!){
             user(login: $user_login) {
                contributionsCollection(from: $start_date, to: $end_date){
                    pullRequestContributions(first: 100 after: AFTER){
                        pageInfo {
                           hasNextPage
                           endCursor
                        }
                        nodes{
                            pullRequest{
                                repository{
                                    nameWithOwner
                                }
                                createdAt
                                updatedAt
                                changedFiles
                                additions
                                deletions
                                state
                            }
                        }
                    }
                }
            }
            }"""
    # AFTER is the placeholder inside the template for the cursor argument.
    return template.replace("AFTER", cursor_literal)

# Collect every page of pull request contributions by following `endCursor`
# until the API reports that there is no next page.
has_next_page = True
after_cursor = None

pr_info_json_data = []  # accumulated PR contribution nodes across all pages

while has_next_page:
    temp_json_data = run_query(url, headers, variables, lambda: pr_info_query(after_cursor))
    pr_page = temp_json_data["data"]["user"]["contributionsCollection"]["pullRequestContributions"]

    # Skip null/empty nodes on every page, the first one included.
    pr_info_json_data.extend(node for node in pr_page["nodes"] if node)

    has_next_page = pr_page["pageInfo"]["hasNextPage"]
    after_cursor = pr_page["pageInfo"]["endCursor"]

# The emptiness check runs on the already-filtered list, so a result made up
# solely of null nodes takes the "No Pull Requests" branch instead of failing
# on a missing "pullRequest" column.
if pr_info_json_data:
    pr_df = pd.DataFrame(pr_info_json_data)
    # Flatten the nested pullRequest dicts into columns next to the index.
    pr_df = pr_df.join(json_normalize(pr_df["pullRequest"].tolist()))
    pr_df = pr_df.drop("pullRequest", axis=1)
else:
    pr_df = "No Pull Requests"

pr_df

Unnamed: 0,createdAt,updatedAt,changedFiles,additions,deletions,state,repository.nameWithOwner
0,2021-12-17T21:02:36Z,2022-01-05T13:52:35Z,1,3,2,MERGED,kubernetes/community
1,2021-12-17T18:25:21Z,2021-12-17T18:29:53Z,3,59,0,OPEN,cncf/cloud-native-community-cookbook
2,2021-12-14T01:58:11Z,2021-12-14T03:00:04Z,1,5,4,MERGED,kubernetes/contributor-site
3,2021-12-14T01:54:02Z,2021-12-14T16:30:48Z,1,0,10,MERGED,kubernetes/contributor-site
4,2021-12-10T01:12:54Z,2021-12-12T22:15:58Z,1,4168,0,MERGED,kubernetes/community
...,...,...,...,...,...,...,...
91,2021-03-19T20:27:23Z,2021-04-26T13:57:28Z,2,11,2,CLOSED,containers/podman
92,2021-03-17T23:38:44Z,2021-03-18T09:41:28Z,1,0,6,MERGED,kubernetes/sig-release
93,2021-02-24T00:20:46Z,2021-02-25T14:29:30Z,1,6,3,MERGED,kubernetes/community
94,2021-01-15T23:09:12Z,2021-03-18T17:28:11Z,2,174,0,MERGED,cncf/project-template


# Repositories Contributed to

The list of repositories this user has contributed to. 

Note: The repositoriesContributedTo object does not take a start / end time and the API documentation states that this is "A list of repositories that the user recently contributed to.", but doesn't actually define "recently". Based on how they define this elsewhere and the data I'm seeing, I suspect that this is the past 1 year of data.

In [252]:
def repo_info_query(after_cursor = None):
    '''Return the GraphQL query text for one page of repositories the user contributed to.

    Each page requests up to 100 repositories (user-owned ones included),
    the total count, and `pageInfo` for pagination.

    after_cursor -- the `endCursor` of the previous page; a falsy value
                    (the default) requests the first page by sending `null`.
    '''
    # Render the cursor as a quoted string, or as a bare null for the first page.
    if after_cursor:
        cursor_literal = '"{}"'.format(after_cursor)
    else:
        cursor_literal = "null"

    template = """query repo_info_query($user_login: String!){
             user(login: $user_login) {
                repositoriesContributedTo(first: 100 includeUserRepositories: true after: AFTER){
                    pageInfo {
                        hasNextPage
                        endCursor
                    }
                    totalCount
                    nodes{
                        nameWithOwner
                    }
                }
            }    
            }"""
    # AFTER is the placeholder inside the template for the cursor argument.
    return template.replace("AFTER", cursor_literal)

# Collect every page of contributed-to repositories by following `endCursor`
# until the API reports that there is no next page.
has_next_page = True
after_cursor = None

repo_info_json_data = []  # accumulated repository nodes across all pages

while has_next_page:
    temp_json_data = run_query(url, headers, variables, lambda: repo_info_query(after_cursor))
    repo_page = temp_json_data["data"]["user"]["repositoriesContributedTo"]

    # Skip null/empty nodes on every page, so the first page is treated the
    # same way as the later ones.
    repo_info_json_data.extend(repo for repo in repo_page["nodes"] if repo)

    has_next_page = repo_page["pageInfo"]["hasNextPage"]
    after_cursor = repo_page["pageInfo"]["endCursor"]

# Show the number of repositories collected, followed by the list itself.
print(len(repo_info_json_data))
repo_info_json_data

55


[{'nameWithOwner': 'projectatomic/atomic-site'},
 {'nameWithOwner': 'apache/superset'},
 {'nameWithOwner': 'jberkus/jberkus.github.io'},
 {'nameWithOwner': 'yo7/count-word'},
 {'nameWithOwner': 'cncf/toc'},
 {'nameWithOwner': 'kubernetes/website'},
 {'nameWithOwner': 'kubernetes/test-infra'},
 {'nameWithOwner': 'kubernetes/enhancements'},
 {'nameWithOwner': 'kubernetes/community'},
 {'nameWithOwner': 'cncf/foundation'},
 {'nameWithOwner': 'kubernetes/k8s.io'},
 {'nameWithOwner': 'kubevirt/community'},
 {'nameWithOwner': 'kubevirt/kubevirt'},
 {'nameWithOwner': 'chaoss/augur'},
 {'nameWithOwner': 'cncf/mentoring'},
 {'nameWithOwner': 'kubevirt/kubevirt.github.io'},
 {'nameWithOwner': 'kubernetes/sig-release'},
 {'nameWithOwner': 'cncf/devstats'},
 {'nameWithOwner': 'istio/community'},
 {'nameWithOwner': 'kubernetes/steering'},
 {'nameWithOwner': 'containers/podman'},
 {'nameWithOwner': 'kubernetes-sigs/lwkd'},
 {'nameWithOwner': 'jberkus/annotated.conf'},
 {'nameWithOwner': 'kubernetes/

# Repositories with Total Count of Commits

This is another way of getting at the repository data, but this time with the number of commits. 

Note: You can only get this data for a maximum of 100 repos.

Note: The number of repos is different from above; looking at the data, I suspect that commits to forks are excluded.

In [253]:
def repo_commits_info_query(after_cursor = None):
    '''Return the GraphQL query text for per-repository commit counts.

    The query asks for at most 100 repositories (`maxRepositories: 100`) and,
    for each, the repository name and the total number of commit contributions
    within the $start_date / $end_date range.

    after_cursor -- accepted only so existing callers keep working; it is
                    ignored because this query has no cursor argument and
                    therefore no pagination.
    '''
    # NOTE: the operation is named `pr_info_query` in the query text; the text
    # is left exactly as it was so the request sent to the API is unchanged.
    return """query pr_info_query($user_login: String!, $start_date: DateTime!, $end_date: DateTime!){
             user(login: $user_login) {
                contributionsCollection(from: $start_date, to: $end_date){
                    commitContributionsByRepository(maxRepositories: 100){
                        repository{
                            nameWithOwner
                        }
                        contributions{
                            totalCount
                        }
                    }
                    }
                }
                }"""

# Single request: this query is not paginated, so the query builder is passed
# directly and the cell does not rely on a cursor variable left over from an
# earlier cell.
repo_commits_info_json_data = run_query(url, headers, variables, repo_commits_info_query)

commit_contributions = repo_commits_info_json_data['data']['user']['contributionsCollection']['commitContributionsByRepository']

# Test the list of per-repository entries (not the whole response dict), so an
# empty result takes the "No Commits" branch.
if commit_contributions:
    repo_commits_df = pd.DataFrame({
        'repo': [entry['repository']['nameWithOwner'] for entry in commit_contributions],
        'commits': [entry['contributions']['totalCount'] for entry in commit_contributions],
    })
else:
    repo_commits_df = "No Commits"

repo_commits_df

Unnamed: 0,repo,commits
0,kubernetes-sigs/lwkd,71
1,elekto-io/elekto,33
2,containers/ContainerPlumbing,28
3,elekto-io/docs,24
4,kubernetes/community,23
5,cncf/tag-contributor-strategy,22
6,knative/community,13
7,kubernetes/k8s.io,11
8,elekto-io/elekto.meta.test,11
9,carolynvs/contributor-strategy-talks,7


# Older Experiment getting the data above as one big query

Note: Data gathered from this query is most likely incomplete. Since this was a quick experiment to better understand the User object, I simply gathered the first 100 of several objects with no pagination, and I hardcoded in the user_login and start / end dates.

In [234]:
def make_query():
    '''Return the GraphQL query text for the combined "one big query" experiment.

    A single query that requests the user's name, the contribution totals,
    the first 100 pull request contributions, and commit contributions per
    repository (first 100 per repository), all without pagination.
    '''
    query_text = """query user_query($user_login: String!, $start_date: DateTime!, $end_date: DateTime!){
             user(login: $user_login) {
                name
                contributionsCollection(from: $start_date, to: $end_date){
                    totalCommitContributions
                    totalIssueContributions
                    totalPullRequestContributions
                    totalPullRequestReviewContributions
                    totalRepositoriesWithContributedCommits
                    totalRepositoriesWithContributedIssues
                    totalRepositoriesWithContributedPullRequestReviews
                    totalRepositoriesWithContributedPullRequests
                    pullRequestContributions(first: 100){
                        nodes{
                            pullRequest{
                                repository{
                                    nameWithOwner
                                }
                                createdAt
                                updatedAt
                                changedFiles
                                additions
                                deletions
                                state
                            }
                        }
                    }
                    commitContributionsByRepository{
                        repository{
                            nameWithOwner
                        }
                        contributions(first:100){
                            totalCount
                            nodes{
                                occurredAt
                                commitCount
                            }
                        }
                    }
                }
             }
             }"""
    return query_text

In [254]:
def get_user_data(api_token, user_login="jberkus", start_date=datetime(2021, 1, 1), end_date=datetime(2021, 12, 31)):
    '''Run the combined query from make_query() for one user and return the JSON response.

    Parameters
    ----------
    api_token : str
        GitHub API token placed in the Authorization header.
    user_login : str, optional
        Login of the user to query (defaults to the originally hard-coded "jberkus").
    start_date, end_date : datetime, optional
        Bounds of the contribution window (default: 2021-01-01 to 2021-12-31).
        The notebook's requirements state they cannot be more than 1 year apart.

    Returns
    -------
    dict
        Whatever run_query() returns for the request (the decoded JSON response).
    '''
    url = 'https://api.github.com/graphql'
    headers = {'Authorization': 'token %s' % api_token}

    variables = {
        "user_login": user_login,
        "start_date": start_date.isoformat(),  # isoformat required for json serialization
        "end_date": end_date.isoformat(),
    }

    # Reuse the notebook's shared request helper instead of repeating the POST here.
    return run_query(url, headers, variables, make_query)
    
# Fetch the combined-query response using the API token read from `gh_key`.
json_data = get_user_data(api_token)

In [255]:
# Display the raw response of the combined query.
json_data

{'data': {'user': {'name': 'Josh Berkus',
   'contributionsCollection': {'totalCommitContributions': 280,
    'totalIssueContributions': 65,
    'totalPullRequestContributions': 96,
    'totalPullRequestReviewContributions': 28,
    'totalRepositoriesWithContributedCommits': 31,
    'totalRepositoriesWithContributedIssues': 23,
    'totalRepositoriesWithContributedPullRequestReviews': 14,
    'totalRepositoriesWithContributedPullRequests': 24,
    'pullRequestContributions': {'nodes': [{'pullRequest': {'repository': {'nameWithOwner': 'kubernetes/community'},
        'createdAt': '2021-12-17T21:02:36Z',
        'updatedAt': '2022-01-05T13:52:35Z',
        'changedFiles': 1,
        'additions': 3,
        'deletions': 2,
        'state': 'MERGED'}},
      {'pullRequest': {'repository': {'nameWithOwner': 'cncf/cloud-native-community-cookbook'},
        'createdAt': '2021-12-17T18:25:21Z',
        'updatedAt': '2021-12-17T18:29:53Z',
        'changedFiles': 3,
        'additions': 59,
   