# Download commit info from a list of GitHub organizations and users

In [1]:
import os
from collections import deque

import github3
import pandas as pd

In [None]:
# go here to get a github OAuth token --> https://github.com/settings/tokens

In [2]:
# Authenticate against the GitHub API. The token is read from the
# GITHUB_TOKEN environment variable so it is never stored in the notebook;
# a missing variable raises KeyError here rather than failing later.
gh = github3.login(token=os.environ['GITHUB_TOKEN'])

In [3]:
def fetch_commits_for_user(user, blacklisted_repos=None, client=None):
    """Collect commit metadata for every repository owned by a GitHub account.

    Parameters
    ----------
    user : str
        Account login, passed straight to ``repositories_by``.
    blacklisted_repos : iterable of str, optional
        Substrings of repository names to skip. The comparison is
        case-insensitive and is made against the lower-cased full name
        (``owner/repo``). ``None`` selects the default ``['homebrew']``;
        pass an empty list to process every repository.
    client : optional
        Object exposing ``repositories_by(user)``. When omitted, the
        notebook-level ``gh`` session is used.

    Returns
    -------
    dict
        Maps 'repo', 'authors', 'messages', 'dates', 'committers' and
        'shas' to equally long deques holding one entry per commit.
    """
    # Only fall back to the default when the argument was omitted, so that an
    # explicit empty list really disables the blacklist.
    if blacklisted_repos is None:
        blacklisted_repos = ['homebrew']
    # Repo names are lower-cased below, so the blacklist has to be as well.
    blacklist = [entry.lower() for entry in blacklisted_repos]
    if client is None:
        client = gh

    all_repos = list(client.repositories_by(user))
    total = len(all_repos)

    repos = deque()
    authors = deque()
    messages = deque()
    dates = deque()
    committers = deque()
    shas = deque()

    for position, repo in enumerate(all_repos, start=1):
        # make all repo names lowercase
        repo_name = repo.full_name.lower()
        if any(entry in repo_name for entry in blacklist):
            # don't process this blacklisted repo
            print('skipping %s. It is blacklisted. %s of %s' % (repo, position, total))
            continue
        print('processing %s. %s of %s' % (repo, position, total))

        try:
            commits = list(repo.commits())
        except github3.exceptions.ClientError as err:
            # Report the skipped repo instead of dropping it silently.
            print('skipping %s. Could not list its commits: %s' % (repo, err))
            continue

        for commit in commits:
            git_commit = commit.commit
            committer = git_commit.committer['name']
            try:
                author = commit.author.login
            except AttributeError:
                # there is no reported author of this commit.
                # use the name of the committer instead.
                author = committer
            repos.append(repo_name)
            committers.append(committer)
            authors.append(author)
            messages.append(git_commit.message)
            dates.append(git_commit.committer['date'])
            shas.append(commit.sha)

    return {
        'repo': repos,
        'authors': authors,
        'messages': messages,
        'dates': dates,
        'committers': committers,
        'shas': shas,
    }

In [4]:
# Fetch commit metadata for every repository under the 'soft-matter' account.
softmatter_commits = fetch_commits_for_user('soft-matter')

processing soft-matter/mr. 1 of 8
processing soft-matter/pims. 2 of 8
processing soft-matter/pims_nd2. 3 of 8
processing soft-matter/slicerator. 4 of 8
processing soft-matter/sm_core. 5 of 8
processing soft-matter/trackpy. 6 of 8
processing soft-matter/trackpy-examples. 7 of 8
processing soft-matter/yaml-serialize. 8 of 8


In [5]:
# Fetch commit metadata for every repository under the 'matplotlib' account.
matplotlib_commits = fetch_commits_for_user('matplotlib')

processing matplotlib/basemap. 1 of 15
processing matplotlib/cmocean. 2 of 15
processing matplotlib/cycler. 3 of 15
processing matplotlib/devdocs. 4 of 15
processing matplotlib/freetypy. 5 of 15
processing matplotlib/matplotlib. 6 of 15
processing matplotlib/matplotlib-jenkins. 7 of 15
processing matplotlib/matplotlib.github.com. 8 of 15
processing matplotlib/mplsizer. 9 of 15
processing matplotlib/mpl_mac_testing. 10 of 15
processing matplotlib/natgrid. 11 of 15
processing matplotlib/sampledoc. 12 of 15
processing matplotlib/sample_data. 13 of 15
processing matplotlib/trendvis. 14 of 15
processing matplotlib/viscm. 15 of 15


In [6]:
# Fetch commit metadata for every repository under the 'NSLS-II' account.
nsls2_commits = fetch_commits_for_user('NSLS-II')

processing NSLS-II/album. 1 of 26
processing NSLS-II/bluesky. 2 of 26
processing NSLS-II/brokerStreamServer. 3 of 26
processing NSLS-II/Bug-Reports. 4 of 26
processing NSLS-II/carchivetools. 5 of 26
processing NSLS-II/channelarchiver. 6 of 26
processing NSLS-II/conda-prescriptions. 7 of 26
processing NSLS-II/configs. 8 of 26
processing NSLS-II/databroker. 9 of 26
processing NSLS-II/datamuxer. 10 of 26
processing NSLS-II/dataportal. 11 of 26
processing NSLS-II/docs. 12 of 26
processing NSLS-II/EXLog. 13 of 26
processing NSLS-II/filestore. 14 of 26
processing NSLS-II/lz4-plugin. 15 of 26
processing NSLS-II/metadatastore. 16 of 26
processing NSLS-II/NSLS-II.github.io. 17 of 26
processing NSLS-II/ophyd. 18 of 26
processing NSLS-II/ophyd-examples. 19 of 26
processing NSLS-II/pyepics. 20 of 26
processing NSLS-II/pyOlog. 21 of 26
processing NSLS-II/replay. 22 of 26
processing NSLS-II/sampleManager. 23 of 26
processing NSLS-II/suitcase. 24 of 26
processing NSLS-II/vertical-integration. 25 of 2

In [7]:
# Fetch commit metadata for every repository under the 'scikit-xray' account.
skxray_commits = fetch_commits_for_user('scikit-xray')

processing scikit-xray/scikit-xray. 1 of 5
processing scikit-xray/scikit-xray-bench. 2 of 5
processing scikit-xray/scikit-xray-examples. 3 of 5
processing scikit-xray/scikit-xray-feedstock. 4 of 5
processing scikit-xray/scikit-xray-feedstock-dev. 5 of 5


In [8]:
# Fetch commit metadata for every repository under the 'Nikea' account.
Nikea_commits = fetch_commits_for_user('Nikea')

processing Nikea/benchmark_scripts. 1 of 8
processing Nikea/history. 2 of 8
processing Nikea/nikea.github.io. 3 of 8
processing Nikea/nsls2_gui. 4 of 8
processing Nikea/pyXPCS. 5 of 8
processing Nikea/VisTrails. 6 of 8
processing Nikea/VTTools. 7 of 8
processing Nikea/xray-vision. 8 of 8


In [9]:
# Fetch commit metadata for each of the NSLS-II-* accounts listed below.
# Each result keeps its own name because commit_order (further down)
# refers to them individually.
csx_commits = fetch_commits_for_user('NSLS-II-CSX')
chx_commits = fetch_commits_for_user('NSLS-II-CHX')
hxn_commits = fetch_commits_for_user('NSLS-II-HXN')
srx_commits = fetch_commits_for_user('NSLS-II-SRX')
xpd_commits = fetch_commits_for_user('NSLS-II-XPD')
ixs_commits = fetch_commits_for_user('NSLS-II-IXS')

processing NSLS-II-CSX/acopian. 1 of 27
processing NSLS-II-CSX/baffleslits. 2 of 27
processing NSLS-II-CSX/Bug-Reports. 3 of 27
processing NSLS-II-CSX/CAAutoConfig. 4 of 27
processing NSLS-II-CSX/CableDatabase. 5 of 27
processing NSLS-II-CSX/calibration. 6 of 27
processing NSLS-II-CSX/conda-recipes. 7 of 27
processing NSLS-II-CSX/csxtools. 8 of 27
processing NSLS-II-CSX/FastCCDConfig. 9 of 27
processing NSLS-II-CSX/ipython_ophyd. 10 of 27
processing NSLS-II-CSX/lakeshore336. 11 of 27
processing NSLS-II-CSX/libcin. 12 of 27
processing NSLS-II-CSX/logbook. 13 of 27
processing NSLS-II-CSX/lupit. 14 of 27
processing NSLS-II-CSX/NSLS-II-CSX.github.io. 15 of 27
processing NSLS-II-CSX/nsls2NotifyMe. 16 of 27
processing NSLS-II-CSX/omegaM4061. 17 of 27
processing NSLS-II-CSX/ophyd. 18 of 27
processing NSLS-II-CSX/pyepics. 19 of 27
processing NSLS-II-CSX/pyVLSPGM. 20 of 27
processing NSLS-II-CSX/qcinview. 21 of 27
processing NSLS-II-CSX/SRW. 22 of 27
processing NSLS-II-CSX/stanfordDG645. 23 of 

In [10]:
# Fetch commit metadata for every repository under the 'VisTrails' account.
vistrails_commits = fetch_commits_for_user('VisTrails')

processing VisTrails/DAT. 1 of 2
processing VisTrails/VisTrails. 2 of 2


In [13]:
# Fetch commit metadata for every repository under the 'ericdill' account.
ericdill_commits = fetch_commits_for_user('ericdill')

processing ericdill/album. 1 of 83
processing ericdill/anaconda-build. 2 of 83
processing ericdill/archiver. 3 of 83
processing ericdill/asv. 4 of 83
processing ericdill/auto-enaml. 5 of 83
processing ericdill/benchmark_scripts. 6 of 83
processing ericdill/bluesky. 7 of 83
processing ericdill/bokeh. 8 of 83
processing ericdill/boltons. 9 of 83
processing ericdill/brokerStreamServer. 10 of 83
processing ericdill/chxtools. 11 of 83
processing ericdill/committery. 12 of 83
processing ericdill/CompStats. 13 of 83
processing ericdill/conda-builder. 14 of 83
processing ericdill/conda-prescriptions. 15 of 83
processing ericdill/conda-recipes. 16 of 83
processing ericdill/conda-smithy. 17 of 83
processing ericdill/controlsui. 18 of 83
processing ericdill/csxtools. 19 of 83
processing ericdill/databroker. 20 of 83
processing ericdill/datamuxer. 21 of 83
processing ericdill/demoCSX. 22 of 83
processing ericdill/diffpy.srfit. 23 of 83
processing ericdill/diffpy.srxplanar. 24 of 83
processing eric

In [14]:
# Fetch commit metadata for every repository under the 'danielballan' account.
danielballan_commits = fetch_commits_for_user('danielballan')

processing danielballan/asv. 1 of 94
processing danielballan/auto-enaml. 2 of 94
processing danielballan/banyan. 3 of 94
processing danielballan/blog. 4 of 94
processing danielballan/bluesky. 5 of 94
processing danielballan/bokeh. 6 of 94
processing danielballan/channelarchiver. 7 of 94
processing danielballan/conda-build-missing. 8 of 94
processing danielballan/conda-docs. 9 of 94
processing danielballan/conda-kernels. 10 of 94
processing danielballan/conda-prescriptions. 11 of 94
processing danielballan/conda-recipes. 12 of 94
processing danielballan/conda-smithy. 13 of 94
processing danielballan/counterpoint. 14 of 94
processing danielballan/cycler. 15 of 94
processing danielballan/datamuxer. 16 of 94
processing danielballan/dataportal. 17 of 94
processing danielballan/diffpy.srxplanar. 18 of 94
processing danielballan/docker-demo-images. 19 of 94
processing danielballan/docs. 20 of 94
processing danielballan/Event-Organiser. 21 of 94
processing danielballan/fetch. 22 of 94
processi

In [15]:
# Fetch commit metadata for four more accounts; each result is referenced
# by name in commit_order further down.
dchabot_commits = fetch_commits_for_user('dchabot')
arkilic_commits = fetch_commits_for_user('arkilic')
cowanml_commits = fetch_commits_for_user('cowanml')
areaDetector_commits = fetch_commits_for_user('areaDetector')

processing dchabot/adsim. 1 of 15
processing dchabot/areadetector-1-9-1. 2 of 15
processing dchabot/bluesky. 3 of 15
processing dchabot/caautoconfig. 4 of 15
processing dchabot/cls-orbitcontrol. 5 of 15
processing dchabot/diffcalc. 6 of 15
processing dchabot/dotfiles. 7 of 15
processing dchabot/hkl. 8 of 15
processing dchabot/motor-synapps. 9 of 15
processing dchabot/motorsim. 10 of 15
processing dchabot/ophyd. 11 of 15
processing dchabot/pyepics. 12 of 15
processing dchabot/python-pcaspy. 13 of 15
processing dchabot/quadem. 14 of 15
processing dchabot/synapps-mca. 15 of 15
processing arkilic/filestore. 1 of 5
processing arkilic/mdsbenchmark. 2 of 5
processing arkilic/metadatastore. 3 of 5
processing arkilic/pvaPy. 4 of 5
processing arkilic/v4table_example. 5 of 5
processing cowanml/cookiecutter-pylibrary. 1 of 6
processing cowanml/lsdc. 2 of 6
processing cowanml/metadatastore. 3 of 6
processing cowanml/samplemanager. 4 of 6
processing cowanml/samplemangler. 5 of 6
processing cowanml/s

In [28]:
# Fetch commit metadata for every repository under the 'synchbot' account.
synchbot_commits = fetch_commits_for_user('synchbot')

processing synchbot/metadataclientv4. 1 of 3
processing synchbot/metadataservice. 2 of 3
processing synchbot/metadatastorev4. 3 of 3


In [41]:
# Fetch commit metadata for every repository under the 'klauer' account.
klauer_commits = fetch_commits_for_user('klauer')

processing klauer/ADMerlin. 1 of 31
processing klauer/anc300. 2 of 31
processing klauer/atr142. 3 of 31
processing klauer/bluesky. 4 of 31
processing klauer/build_opi. 5 of 31
processing klauer/conda-prescriptions. 6 of 31
processing klauer/ddrive. 7 of 31
processing klauer/dotfiles. 8 of 31
processing klauer/ECLI. 9 of 31
processing klauer/EZ4axis. 10 of 31
processing klauer/f460. 11 of 31
processing klauer/hkl. 12 of 31
processing klauer/ipplan. 13 of 31
processing klauer/mmc100. 14 of 31
processing klauer/ophyd. 15 of 31
processing klauer/pmcv. 16 of 31
processing klauer/PMD101. 17 of 31
processing klauer/pmd90. 18 of 31
processing klauer/ppmac. 19 of 31
processing klauer/PVRename. 20 of 31
processing klauer/pyepics. 21 of 31
processing klauer/pympx. 22 of 31
processing klauer/pyosxdict. 23 of 31
processing klauer/pyzygo. 24 of 31
processing klauer/qolibri. 25 of 31
processing klauer/simple_scaler. 26 of 31
processing klauer/sios. 27 of 31
processing klauer/smarpod. 28 of 31
process

In [42]:
# Order in which the per-account results are concatenated into one table.
# The order matters: duplicates are later removed with
# drop_duplicates('shas'), which keeps the first occurrence by default, so a
# commit present in several accounts (e.g. forks) is attributed to whichever
# account appears earliest in this list.
commit_order = [
    skxray_commits,
    softmatter_commits,
    vistrails_commits,
    matplotlib_commits,
    areaDetector_commits,
    nsls2_commits,
    Nikea_commits,
    chx_commits,
    csx_commits,
    hxn_commits,
    srx_commits,
    ixs_commits,
    xpd_commits,
    synchbot_commits,
    ericdill_commits,
    danielballan_commits,
    dchabot_commits,
    arkilic_commits,
    cowanml_commits,
    klauer_commits,
]

In [30]:
from collections import defaultdict

In [43]:
# Concatenate the per-account columns, in commit_order, into one set of
# columns and turn the result into a single DataFrame (one row per commit).
merged_columns = defaultdict(deque)
for account_commits in commit_order:
    for key in account_commits:
        merged_columns[key].extend(account_commits[key])
df = pd.DataFrame(merged_columns)

In [44]:
# Number of commit rows collected across all accounts, before de-duplication.
len(df)

291595

In [45]:
# Keep one row per commit sha (the first occurrence, following commit_order).
# .copy() gives an independent frame so the later column assignment does not
# write into a view of df. The original row index is kept, not reset.
cleaned_df = df.drop_duplicates('shas').copy()

In [46]:
# Number of distinct commits (unique shas) left after de-duplication.
len(cleaned_df)

182540

In [47]:
# # remove the user name where the repo came from
# repo_names = [repo.split('/')[-1] for repo in cleaned_df.repo]
# cleaned_df.update({'repo': pd.Series(repo_names)})

In [48]:
# Maps a lower-cased display name to the login used for that person.
# Keys must be lower case: author names are lower-cased before being looked
# up in this dict by the normalisation cell below.
map_people = {
    'daniel allan': 'danielballan',
    'arman arkilic': 'arkilic',
    'daron chabot': 'dchabot',
    'thomas caswell': 'tacaswell',
}

In [49]:
from collections import deque


def _canonical_author(author, committer, aliases):
    """Return the normalised author label for one commit.

    An author recorded as the literal string 'unknown' is replaced by the
    committer. The value is then coerced to ``str`` and lower-cased *before*
    the alias lookup, so a non-string value cannot raise on ``.lower()``.
    If the lower-cased name is a key of ``aliases`` the mapped value is used.
    The result is always a lower-case string.
    """
    if author == 'unknown':
        author = committer
    author = str(author).lower()
    return str(aliases.get(author, author)).lower()


# Normalise every author (lower-case, known display names -> logins) and
# overwrite the 'authors' column with the result, row for row.
authors = deque(
    _canonical_author(author, committer, map_people)
    for author, committer in zip(cleaned_df.authors, cleaned_df.committers)
)
cleaned_df['authors'] = authors

In [50]:
# Write the de-duplicated, author-normalised table to disk. The DataFrame
# index is written as the first CSV column (pandas default); it is the
# original, non-contiguous index left over from drop_duplicates.
cleaned_df.to_csv('cleaned-commit-info.csv')