In [84]:
import pandas as pd
from subprocess import run, PIPE
from watchtower import comments_, issues_, commits_, reviews_
from glob import glob
import os.path as op
import shutil as sh
import os
# GitHub API token, read from the environment; this raises KeyError when
# GITHUB_API_TOKEN is not set.
auth = os.environ['GITHUB_API_TOKEN']

## Collect and save team membership info

In [135]:
from ruamel.yaml import YAML
from requests import get

url_jupyterhub = "https://raw.githubusercontent.com/jupyterhub/team-compass/master/docs/team/contributors-jupyterhub.yaml"
url_binder = "https://raw.githubusercontent.com/jupyterhub/team-compass/master/docs/team/contributors-binder.yaml"

yaml = YAML()


def _load_team(url, timeout=30):
    """Download one contributors YAML file and return its parsed content.

    Parameters
    ----------
    url : str
        Address of the YAML file to fetch.
    timeout : float
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of feeding an error page to the YAML parser.
    response.raise_for_status()
    return yaml.load(response.text)


jupyterhub_team = _load_team(url_jupyterhub)
binder_team = _load_team(url_binder)

In [136]:
# One record per contributor, with any leading/trailing '@' stripped from the
# handle. JupyterHub members get a fixed team label; Binder members carry
# their own `team` value.
hub_rows = [
    {'name': member['name'], 'username': member['handle'].strip('@'), 'team': 'jupyterhub'}
    for member in jupyterhub_team
]
binder_rows = [
    {'name': member['name'], 'username': member['handle'].strip('@'), 'team': member['team']}
    for member in binder_team
]
data = hub_rows + binder_rows

In [137]:
# Tabulate the membership records and persist them without the row index.
teams_csv = '../data/teams.csv'
teams = pd.DataFrame(data)
teams.to_csv(teams_csv, index=False)

## Helper functions

In [85]:
def delete_data(org, repo, data_dir=data_folder):
    """Remove the stored data directory for ``org/repo`` if it exists.

    The directory is ``<data_dir>/<org>/<repo>``, with ``~`` expanded in
    ``data_dir``. A confirmation is printed when something was deleted;
    nothing happens when the directory is absent.
    """
    target = op.join(op.expanduser(data_dir), org, repo)
    if not op.exists(target):
        return
    sh.rmtree(target)
    print('Deleted {}/{}'.format(org, repo))
        
def list_data(data_dir=data_folder):
    """Summarise the stored comment data as yearly counts per repository.

    Every ``<data_dir>/<org>/<repo>`` folder is treated as one repository.
    Its comments are loaded with ``comments_.load_comments(org, repo)`` and
    counted per calendar year.

    Parameters
    ----------
    data_dir : str
        Root folder to scan; ``~`` is expanded.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(org, repo)`` with one column per year (the columns
        axis is named ``created_at``). Years outside a repository's own
        date range are NaN. A frame with no rows is returned when no
        repository has any comments.
    """
    root = op.expanduser(data_dir)
    yearly = []
    for folder in glob(op.join(root, '*', '*')):
        # The glob pattern is <root>/<org>/<repo>, so the last two path
        # components identify the repository.
        org_dir, repo = op.split(folder.rstrip(os.sep))
        org = op.basename(org_dir)

        # NOTE(review): load_comments is not given data_dir here, so it
        # presumably reads from its own default location -- confirm they match.
        comments = comments_.load_comments(org, repo)
        if comments is None or len(comments) == 0:
            # Nothing stored for this repository; skip it instead of failing.
            continue

        # Normalise whichever timestamp column is present to a single name so
        # the reshaping below works for both 'created_at' and 'date'.
        date_col = 'created_at' if 'created_at' in comments.columns else 'date'
        stamps = pd.DataFrame({'created_at': pd.to_datetime(comments[date_col])})
        # size() counts rows per bin, so the result does not depend on whether
        # the resampler keeps the `on` column among its value columns.
        counts = stamps.resample('Y', on='created_at').size().rename('count').reset_index()
        counts['org'] = org
        counts['repo'] = repo
        yearly.append(counts)

    if not yearly:
        # pd.concat refuses an empty list; return an empty table instead.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['org', 'repo'])
        return pd.DataFrame(index=empty_index)

    all_data = pd.concat(yearly, ignore_index=True)
    all_data['created_at'] = all_data['created_at'].dt.year
    return all_data.set_index(['org', 'repo', 'created_at'])['count'].unstack('created_at')

## Update data

In [86]:
from datetime import timedelta

# Cut-off for "recent" activity: 30 days before now. The `pd.datetime` alias
# no longer exists in current pandas, so use pd.Timestamp, which is a
# datetime subclass and compares the same way.
date = pd.Timestamp.today() - timedelta(days=30)

In [11]:
from subprocess import CalledProcessError

# If we wish to update the data
update_data = True
if update_data:
    try:
        # `org_repos` rather than `repos`: the munge section binds `repos` to
        # the table returned by list_data(), and re-running this cell must
        # not overwrite it with a plain list.
        for org, org_repos in projects.items():
            print(org)
            for repo in org_repos:
                print('    ' + repo)
                # Pass the arguments as a list so names are never re-split
                # on whitespace.
                run(["python", "../scripts/update_data.py", org, repo],
                    check=True, stderr=PIPE, stdout=PIPE)
    except CalledProcessError as ee:
        # Only a failed subprocess carries captured stderr. Any other error
        # (e.g. `projects` not defined) now surfaces as itself instead of
        # being replaced by an AttributeError on `.stderr`.
        print(ee.stderr.decode())

jupyter
    repo2docker
jupyterhub
    zero-to-jupyterhub-k8s
    the-littlest-jupyterhub
    jupyterhub
    binderhub
    binder
    team-compass
    mybinder.org-deploy
    configurable-http-proxy
    nativeauthenticator
    traefik-proxy
    jupyter-server-proxy


## Munge

In [13]:
# Yearly comment counts per (org, repo); the index of this table drives the
# per-repository loop further down.
repos = list_data()
repos

Unnamed: 0_level_0,created_at,2015,2016,2017,2018,2019
org,repo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
jupyter,nbconvert,10.0,1665.0,1105.0,799.0,87.0
jupyter,notebook,16.0,5495.0,2741.0,,
jupyter,repo2docker,,,585.0,1590.0,172.0
jupyterhub,binder,,,289.0,279.0,12.0
jupyterhub,binderhub,,,1310.0,1438.0,91.0
jupyterhub,configurable-http-proxy,,125.0,109.0,66.0,6.0
jupyterhub,jupyter-server-proxy,,,9.0,150.0,101.0
jupyterhub,jupyterhub,,2095.0,2223.0,2383.0,182.0
jupyterhub,mybinder.org-deploy,,,408.0,987.0,30.0
jupyterhub,nativeauthenticator,,,,17.0,42.0


In [73]:
# Scratch inspection of `i_opened_issues`. In this file the name is assigned
# only inside the per-repository loop cell, so that cell must have run first.
i_opened_issues

Unnamed: 0,org,repo,date,user
12,jupyterhub,mybinder.org-deploy,2019-01-24 08:37:16,minrk
14,jupyterhub,mybinder.org-deploy,2019-01-22 13:28:43,betatim
5,jupyterhub,mybinder.org-deploy,2019-02-04 16:19:03,choldgraf


In [83]:
# Scratch preview of the outer join of daily comment and commit counts.
# `icomments` and `icommits` are assigned only inside the per-repository
# loop cell, so this shows whichever repository that loop processed last.
pd.merge(icomments, icommits, on=['user', 'date'], how='outer')

Unnamed: 0,user,date,comment,commit
0,minrk,2015-03-17 16:53:25,1.0,
1,minrk,2015-03-17 18:45:43,1.0,
2,Carreau,2015-11-12 21:43:36,1.0,
3,minrk,2016-01-21 16:24:36,1.0,
4,jdfreder,2016-02-25 02:02:42,1.0,
5,jdfreder,2016-02-25 04:39:56,1.0,
6,ellisonbg,2016-02-25 04:48:47,1.0,
7,sunlizhe,2016-02-25 07:31:54,1.0,
8,captainsafia,2016-02-25 08:06:40,1.0,
9,captainsafia,2016-02-25 08:41:24,1.0,


In [80]:
def _daily_counts(events, label):
    """Count events per user and calendar day.

    Parameters
    ----------
    events : pandas.DataFrame
        Must contain a ``user`` column and a ``date`` column.
    label : str
        Name of the count column in the result (e.g. ``'comment'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``date`` and ``label``.
    """
    events = events[['user', 'date']].assign(**{label: 1})
    if events.empty:
        # An empty frame has no timestamps to resample; return a typed,
        # empty table so callers can treat it like any other result.
        return pd.DataFrame({
            'user': pd.Series(dtype=object),
            'date': pd.Series(dtype='datetime64[ns]'),
            label: pd.Series(dtype='int64'),
        })
    events['date'] = pd.to_datetime(events['date'])
    # Select the count column explicitly so the grouping column is never
    # aggregated alongside it.
    return (events.groupby('user')
                  .resample('D', on='date')[label]
                  .sum()
                  .reset_index())


# Per-repository activity tables, combined into `data` after the loop.
# NOTE(review): the original cell initialised `data` but never filled it;
# collecting one merged table per repository is the apparent intent -- confirm.
repo_frames = []

for org, repo in repos.index:
    print('{}/{}'.format(org, repo))
    repo_activity = []

    # Comments
    icomments = comments_.load_comments(org, repo)
    if icomments is not None:
        icomments = icomments.assign(
            user=icomments['user'].map(lambda a: a['login']),
            date=pd.to_datetime(icomments['created_at']),
        )
        icomments = _daily_counts(icomments, 'comment')
        repo_activity.append(icomments)

    # Commits
    icommits = commits_.load_commits(org, repo)
    if icommits is not None:
        # Commits without an author record cannot be attributed to a user.
        icommits = icommits.dropna(subset=['author'])
        icommits = icommits.assign(user=icommits['author'].map(lambda a: a['login']))
        icommits = _daily_counts(icommits, 'commit')
        repo_activity.append(icommits)

    # Issues
    issues = issues_.load_issues(org, repo)
    if issues is not None:
        issues = issues.assign(user=issues['user'].map(lambda a: a['login']))

        # Extract PRs vs Issues. notna() treats both None and NaN as
        # "not a pull request"; an elementwise `!= None` would count NaN as a PR.
        is_pr = issues['pull_request'].notna()
        prs = issues[is_pr]
        issues = issues[~is_pr]

        # Calculate the opened and closed issues since the cut-off date
        i_opened_issues = issues.query('created_at > @date').rename(columns={'created_at': 'date'})
        i_closed_issues = issues.query('closed_at > @date').rename(columns={'closed_at': 'date'})

        # Calculate the opened and closed PRs since the cut-off date
        i_opened_prs = prs.query('created_at > @date').rename(columns={'created_at': 'date'})
        i_closed_prs = prs.query('closed_at > @date').rename(columns={'closed_at': 'date'})

        # Reduce each table to daily per-user counts under its own column.
        # The issue tables previously wrote their counters onto
        # `i_opened_prs`, so issues were never counted.
        i_opened_prs = _daily_counts(i_opened_prs, 'opened_pr')
        i_closed_prs = _daily_counts(i_closed_prs, 'closed_pr')
        i_opened_issues = _daily_counts(i_opened_issues, 'opened_issue')
        i_closed_issues = _daily_counts(i_closed_issues, 'closed_issue')
        repo_activity.extend([i_opened_prs, i_closed_prs, i_opened_issues, i_closed_issues])

    # Outer-join everything for this repository on (user, date); empty
    # tables contribute nothing and are skipped.
    repo_activity = [frame for frame in repo_activity if not frame.empty]
    if repo_activity:
        merged = repo_activity[0]
        for frame in repo_activity[1:]:
            merged = pd.merge(merged, frame, on=['user', 'date'], how='outer')
        merged.insert(0, 'org', org)
        merged.insert(1, 'repo', repo)
        repo_frames.append(merged)

# Build the combined table once rather than growing it inside the loop.
data = pd.concat(repo_frames, ignore_index=True) if repo_frames else pd.DataFrame([])

jupyter/nbconvert
jupyter/notebook


KeyboardInterrupt: 

## Convert into daily and combine

In [42]:
# Convert into daily comments/commits to reduce size
daily_comments = comments.groupby(['org', 'repo', 'user']).resample('D').sum().dropna()
daily_comments = daily_comments.reset_index().query('count != 0')
daily_comments = daily_comments.rename(columns={'count': 'comments'})

In [43]:
# Daily commits
daily_commits = commits.groupby(['org', 'repo', 'user']).resample('D').sum().dropna()
daily_commits = daily_commits.reset_index().query('count != 0')
daily_commits = daily_commits.rename(columns={'count': 'commits'})

In [44]:
# Merge them and save
daily = pd.merge(daily_commits, daily_comments, on=['org', 'repo', 'user', 'date'], how='outer')
daily.to_csv('../data/daily.csv')