In [10]:
import pandas as pd
from subprocess import run, PIPE
from watchtower import comments_, issues_, commits_, reviews_
from glob import glob
import os.path as op
import shutil as sh
import os
import numpy as np

import sys
sys.path.append('../scripts/')
from mod import list_data

# GitHub API token, read from the environment so it is never stored in the notebook.
# Raises KeyError if GITHUB_API_TOKEN is not set.
# NOTE(review): `auth` is not referenced again in the visible cells — presumably the
# token is consumed through the environment by watchtower / the update script; confirm.
auth = os.environ['GITHUB_API_TOKEN']

## Projects we track

In [31]:
# Repositories whose activity is tracked, keyed by the GitHub organization that owns them.
projects = {
    "jupyter": [
        "nbconvert",
        "jupyter-book",
        "notebook",
        "repo2docker",
    ],
    "jupyterhub": [
        "binder",
        "binderhub",
        "configurable-http-proxy",
        "jupyter-server-proxy",
        "jupyterhub",
        "mybinder.org-deploy",
        "nativeauthenticator",
        "team-compass",
        "the-littlest-jupyterhub",
        "traefik-proxy",
        "zero-to-jupyterhub-k8s",
    ],
}

## Collect and save team membership info

In [135]:
from ruamel.yaml import YAML
from requests import get

url_jupyterhub = "https://raw.githubusercontent.com/jupyterhub/team-compass/master/docs/team/contributors-jupyterhub.yaml"
url_binder = "https://raw.githubusercontent.com/jupyterhub/team-compass/master/docs/team/contributors-binder.yaml"

yaml = YAML()


def _load_team(url):
    """Download a contributors YAML file and return the parsed content.

    Parameters
    ----------
    url : str
        Address of the raw YAML file.

    Returns
    -------
    Whatever ``yaml.load`` produces for the downloaded text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an error
        page (e.g. "404: Not Found") would be parsed as YAML and only fail later,
        far from the cause.
    """
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = get(url, timeout=30)
    response.raise_for_status()
    return yaml.load(response.text)


jupyterhub_team = _load_team(url_jupyterhub)
binder_team = _load_team(url_binder)

In [136]:
# One record per team member: display name, GitHub username (handle with the
# '@' characters stripped from both ends) and the team they belong to.
# Everyone in the JupyterHub file is assigned to the 'jupyterhub' team ...
data = [
    {'name': member['name'], 'username': member['handle'].strip('@'), 'team': 'jupyterhub'}
    for member in jupyterhub_team
]
# ... whereas entries in the Binder file carry their own 'team' field.
data.extend(
    {'name': member['name'], 'username': member['handle'].strip('@'), 'team': member['team']}
    for member in binder_team
)

In [137]:
# Tabulate the membership records (columns: name, username, team) and save them
# without the row index so the CSV contains only those fields.
teams = pd.DataFrame(data)
teams.to_csv('../data/teams.csv', index=False)

## Helper functions

In [3]:
# Default location of the locally stored data.
# NOTE(review): this is an absolute path inside one user's home directory; on any
# other machine callers have to pass `data_dir` explicitly.
data_folder = "/home/choldgraf/watchtower_data/"


def delete_data(org, repo, data_dir=data_folder):
    """Remove the stored data directory of ``org/repo`` if it exists.

    Parameters
    ----------
    org : str
        Organization name (first directory level below ``data_dir``).
    repo : str
        Repository name (second directory level below ``data_dir``).
    data_dir : str
        Root of the data store; ``~`` is expanded.

    Returns
    -------
    None. Prints a confirmation line when something was deleted and does
    nothing when the directory is absent.
    """
    target = op.join(op.expanduser(data_dir), org, repo)
    if not op.exists(target):
        return
    sh.rmtree(target)
    print('Deleted {}/{}'.format(org, repo))
        
def list_data(data_dir=data_folder):
    """Summarize the stored comments as one row per repository and one column per year.

    Every ``<data_dir>/<org>/<repo>`` entry is treated as a repository; its
    comments are loaded with ``comments_.load_comments(org, repo)`` and counted
    per calendar year.

    NOTE(review): the imports cell also does ``from mod import list_data``; this
    definition shadows that import.

    Parameters
    ----------
    data_dir : str
        Root of the data store; ``~`` is expanded. Only used to discover which
        org/repo pairs exist — the comments themselves are loaded by watchtower.

    Returns
    -------
    DataFrame indexed by ``(org, repo)`` whose columns are years and whose values
    are comment counts (NaN where a repository has no row for that year). An empty
    frame with the same index names is returned when nothing is stored.
    """
    root = op.expanduser(data_dir)
    yearly = []
    for folder in glob(op.join(root, '*', '*')):
        org, repo = op.relpath(folder, root).split(os.sep)[:2]
        comments = comments_.load_comments(org, repo)
        # Other cells in this notebook guard against a None return from the
        # loaders, so do the same here instead of crashing on `.columns`.
        if comments is None or comments.empty:
            continue
        date_col = 'created_at' if 'created_at' in comments.columns else 'date'
        # Count rows per year from the timestamps alone. The index is always named
        # 'created_at' and the column 'count', whichever column the dates came
        # from, so the unstack below also works for the 'date' fallback.
        stamps = pd.DatetimeIndex(pd.to_datetime(comments[date_col]), name='created_at')
        # 'Y' (year end) is kept as in the original; newer pandas spells it 'YE'.
        counts = pd.Series(1, index=stamps, name='count').resample('Y').sum().to_frame()
        counts['org'] = org
        counts['repo'] = repo
        yearly.append(counts)

    if not yearly:
        # pd.concat raises on an empty list; hand back an empty, correctly indexed frame.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['org', 'repo'])
        return pd.DataFrame(index=empty_index)

    all_data = (pd.concat(yearly)
                  .set_index(['org', 'repo'], append=True)['count']
                  .unstack('created_at'))
    all_data.columns = all_data.columns.year
    return all_data

## Update data

In [4]:
from datetime import timedelta
# Cut-off for "recent" activity: 30 days before now (local time, no timezone).
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0; `pd.Timestamp`
# is a `datetime` subclass, so formatting and comparisons keep working.
date = pd.Timestamp.today() - timedelta(days=30)

In [12]:
# Summarize what is already stored locally and take its (org, repo) index entries.
# NOTE(review): this rebinds `projects` — a dict of org -> repo names in the
# "Projects we track" cell — to an array of index values, and `data` — a list of
# records in the team cells — to a DataFrame. Confirm which form of `projects`
# the update cell is meant to receive.
data = list_data()
projects = data.index.values

In [26]:
# If we wish to update the data
update_data = True
if update_data:
    # `projects` is a dict of org -> repo names in one cell and an array of
    # (org, repo) pairs in another; accept either form.
    if hasattr(projects, 'items'):
        repo_pairs = [(org, repo) for org, repos in projects.items() for repo in repos]
    else:
        repo_pairs = [tuple(pair) for pair in projects]

    # As before, the first failure stops the whole update and is only reported.
    try:
        for org, repo in repo_pairs:
            print(org + '/' + repo)
            # Build the argument list directly rather than formatting a string and
            # splitting it, so names containing whitespace cannot break the call.
            command = ["python", "../scripts/update_data.py", org, repo,
                       "--date", "{:%Y-%m-%d}".format(date)]
            run(command, check=True, stderr=PIPE, stdout=PIPE)
    except Exception as ee:
        print(ee)
        # Only CalledProcessError carries the captured stderr; other exceptions
        # (e.g. a missing interpreter) must not fail again inside the handler.
        captured = getattr(ee, 'stderr', None)
        if captured:
            print(captured.decode())

jupyter/nbconvert
jupyter/notebook
jupyter/repo2docker
jupyterhub/binder
jupyterhub/binderhub
jupyterhub/configurable-http-proxy
jupyterhub/jupyter-server-proxy
jupyterhub/jupyterhub
jupyterhub/mybinder.org-deploy
jupyterhub/nativeauthenticator
jupyterhub/team-compass
jupyterhub/the-littlest-jupyterhub
jupyterhub/traefik-proxy
jupyterhub/zero-to-jupyterhub-k8s
nteract/hydrogen
nteract/nteract
nteract/papermill


## Munge: build per-user daily activity counts

In [32]:
# Re-list the stored data (after any update) and display it: one row per
# (org, repo), one column per year of comment counts.
repos = list_data()
repos

Unnamed: 0_level_0,created_at,2018,2019
org,repo,Unnamed: 2_level_1,Unnamed: 3_level_1
jupyter,nbconvert,,41.0
jupyter,notebook,,411.0
jupyter,repo2docker,,176.0
jupyterhub,binder,,39.0
jupyterhub,binderhub,,68.0
jupyterhub,configurable-http-proxy,,1.0
jupyterhub,jupyter-server-proxy,,17.0
jupyterhub,jupyterhub,,122.0
jupyterhub,mybinder.org-deploy,,47.0
jupyterhub,nativeauthenticator,,18.0


In [61]:
# Order of the count columns in the combined table (after `user` and `date`).
# This keeps the column layout of the table this cell has produced so far.
_ACTIVITY_COLUMNS = ['comment', 'commit', 'opened_pr', 'opened_issue', 'closed_issue', 'closed_pr']


def _daily_counts(events, column):
    """Count events per user per day.

    Parameters
    ----------
    events : DataFrame or None
        One row per event with a ``user`` column and a datetime ``date`` column.
    column : str
        Name given to the count column of the result.

    Returns
    -------
    DataFrame with columns ``user``, ``date`` and ``column``. For each user every
    day between their first and last event is present; days without events get 0.
    An empty frame with the same columns is returned when there are no events.
    """
    if events is None or events.empty:
        # Typed empty columns so that merging several empty tables still works.
        return pd.DataFrame({
            'user': pd.Series(dtype='object'),
            'date': pd.Series(dtype='datetime64[ns]'),
            column: pd.Series(dtype='float64'),
        })
    flagged = events[['user', 'date']].assign(**{column: 1})
    return (flagged.set_index('date')
                   .groupby('user')[column]
                   .resample('D')
                   .sum()
                   .reset_index())


def _recent_events(frame, date_col, since):
    """Rows of ``frame`` whose ``date_col`` is later than ``since``, as ``user``/``date`` events."""
    recent = frame.loc[frame[date_col] > since, ['user', date_col]]
    return recent.rename(columns={date_col: 'date'})


def _repo_daily_activity(org, repo, since):
    """Build the per-user, per-day activity table of one repository.

    Comments and commits are counted as loaded; issues and pull requests are
    counted only when opened / closed after ``since``. A loader returning None
    simply contributes zero counts.

    Returns
    -------
    DataFrame with columns ``user``, ``date``, the six activity counts, ``org``
    and ``repo``; missing counts are filled with 0.
    """
    # Comments
    comments = comments_.load_comments(org, repo)
    if comments is not None:
        comments = pd.DataFrame({
            'user': comments['user'].map(lambda a: a['login']),
            'date': pd.to_datetime(comments['created_at']),
        })

    # Commits (those without author information are dropped)
    commits = commits_.load_commits(org, repo)
    if commits is not None:
        commits = commits.dropna(subset=['author'])
        commits = pd.DataFrame({
            'user': commits['author'].map(lambda a: a['login']),
            'date': pd.to_datetime(commits['date']),
        })

    # Issues and pull requests
    opened_prs = closed_prs = opened_issues = closed_issues = None
    issues = issues_.load_issues(org, repo)
    if issues is not None:
        issues = issues.assign(user=issues['user'].map(lambda a: a['login']))

        # Extract PRs vs Issues: rows with a `pull_request` entry are PRs.
        is_pr = issues['pull_request'].notna()
        prs, issues = issues[is_pr], issues[~is_pr]

        # Calculate the opened and closed issues
        opened_issues = _recent_events(issues, 'created_at', since)
        closed_issues = _recent_events(issues, 'closed_at', since)

        # Calculate the opened and closed PRs
        opened_prs = _recent_events(prs, 'created_at', since)
        closed_prs = _recent_events(prs, 'closed_at', since)

    # Each kind of event gets its own count column. Previously the issue columns
    # were attached to the opened-PR table, so issue activity was never counted.
    tables = [
        _daily_counts(comments, 'comment'),
        _daily_counts(commits, 'commit'),
        _daily_counts(opened_prs, 'opened_pr'),
        _daily_counts(closed_prs, 'closed_pr'),
        _daily_counts(opened_issues, 'opened_issue'),
        _daily_counts(closed_issues, 'closed_issue'),
    ]

    # Add to our daily tally
    daily = tables[0]
    for table in tables[1:]:
        daily = daily.merge(table, on=['user', 'date'], how='outer')
    daily = daily[['user', 'date'] + _ACTIVITY_COLUMNS].assign(org=org, repo=repo)
    return daily.replace(np.nan, 0)


# Collect one table per repository and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
daily_tables = []
for org, repo in repos.index:
    print('{}/{}'.format(org, repo))
    daily_tables.append(_repo_daily_activity(org, repo, date))

data = pd.concat(daily_tables) if daily_tables else pd.DataFrame([])

jupyter/nbconvert
jupyter/notebook
jupyter/repo2docker
jupyterhub/binder
jupyterhub/binderhub
jupyterhub/configurable-http-proxy
jupyterhub/jupyter-server-proxy
jupyterhub/jupyterhub
jupyterhub/mybinder.org-deploy
jupyterhub/nativeauthenticator
jupyterhub/team-compass
jupyterhub/the-littlest-jupyterhub
jupyterhub/traefik-proxy
jupyterhub/zero-to-jupyterhub-k8s
nteract/hydrogen
nteract/nteract
nteract/papermill


In [63]:
# Save the combined daily activity table. The row index is written as the first,
# unnamed column of the CSV.
data.to_csv('../data/daily.csv')

Unnamed: 0,user,date,comment,commit,opened_pr,opened_issue,closed_issue,closed_pr,org,repo
0,Juanlu001,2019-02-20 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
1,Juanlu001,2019-02-21 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
2,Juanlu001,2019-02-22 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
3,Juanlu001,2019-02-23 00:00:00,1.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
4,MSeal,2019-02-23 00:00:00,1.0,1.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
5,MSeal,2019-02-24 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
6,MSeal,2019-02-25 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
7,MSeal,2019-02-26 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
8,MSeal,2019-02-27 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert
9,MSeal,2019-02-28 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,jupyter,nbconvert


## Convert into daily and combine

In [35]:
# Convert into daily comments/commits to reduce size
daily_comments = comments.groupby(['org', 'repo', 'user']).resample('D').sum().dropna()
daily_comments = daily_comments.reset_index().query('count != 0')
daily_comments = daily_comments.rename(columns={'count': 'comments'})

NameError: name 'comments' is not defined

In [36]:
# Daily commits
daily_commits = commits.groupby(['org', 'repo', 'user']).resample('D').sum().dropna()
daily_commits = daily_commits.reset_index().query('count != 0')
daily_commits = daily_commits.rename(columns={'count': 'commits'})

NameError: name 'commits' is not defined

In [37]:
# Merge them and save
daily = pd.merge(daily_commits, daily_comments, on=['org', 'repo', 'user', 'date'], how='outer')
daily.to_csv('../data/daily.csv')

NameError: name 'daily_commits' is not defined