## Repo launches at mybinder.org

Data source: the mybinder.org launch analytics archive, https://archive.analytics.mybinder.org/

In [1]:
import pandas as pd
from sqlite_utils import Database

In [2]:
# Open the SQLite database file through sqlite-utils; every later cell reads from `db`.
db = Database("mybinder_archive.db")

In [3]:
# List the names of the tables available in the database.
db.table_names()

['mybinderlaunch', 'repo']

In [4]:
# Name of the table holding the launch records; interpolated into the SQL queries below.
launch_table = 'mybinderlaunch'

In [5]:
# Show the launch table's schema as a {column name: Python type} mapping.
# Alternative view of the same schema: db[launch_table].columns
db[launch_table].columns_dict

{'timestamp': str,
 'version': int,
 'origin': str,
 'provider': str,
 'spec': str,
 'org': str,
 'ref': str,
 'resolved_ref': str,
 'r2d_version': str,
 'repo_url': str,
 'repo_id': str}

In [6]:
# NOTE: this notebook won't have the same output on every run (nbs_same_output):
# ORDER BY RANDOM() picks a different sample of 100 rows each time the cell executes.
query = f'SELECT * FROM {launch_table} ORDER BY RANDOM() LIMIT 100'
pd.read_sql_query(query, db.conn)

Unnamed: 0,timestamp,version,origin,provider,spec,org,ref,resolved_ref,r2d_version,repo_url,repo_id
0,2020-04-03T22:33:00,3,gesis.mybinder.org,GitHub,ipython/ipython-in-depth/master,ipython,master,,jupyter/repo2docker:0.11.0-31.g1776b79,https://github.com/ipython/ipython-in-depth,1
1,2019-02-19T14:44:00,1,mybinder.org,GitHub,jupyterlab/jupyterlab-demo/master,jupyterlab,master,,jupyter/repo2docker:9766c954,https://github.com/jupyterlab/jupyterlab-demo,4
2,2019-09-11T13:44:00,3,gke.mybinder.org,GitHub,jupyterlab/jupyterlab-demo/try.jupyter.org,jupyterlab,try.jupyter.org,,jupyter/repo2docker:0.10.0-49.g57919b9,https://github.com/jupyterlab/jupyterlab-demo,4
3,2019-08-14T21:18:00,3,gke.mybinder.org,GitHub,jupyterlab/jupyterlab-demo/try.jupyter.org,jupyterlab,try.jupyter.org,,jupyter/repo2docker:9c559d2a,https://github.com/jupyterlab/jupyterlab-demo,4
4,2019-03-16T17:21:00,1,mybinder.org,GitHub,decodoku/A_Game_to_Benchmark_Quantum_Computers...,decodoku,master,,jupyter/repo2docker:9766c954,https://github.com/decodoku/a_game_to_benchmar...,826
...,...,...,...,...,...,...,...,...,...,...,...
95,2019-11-11T01:32:00,3,notebooks.gesis.org,GitHub,ipython/ipython-in-depth/master,ipython,master,,jupyter/repo2docker:0.10.0-112.g242107b,https://github.com/ipython/ipython-in-depth,1
96,2020-03-22T23:42:00,3,gke.mybinder.org,GitHub,jupyterlab/jupyterlab-demo/try.jupyter.org,jupyterlab,try.jupyter.org,,jupyter/repo2docker:0.11.0-24.gfce6488,https://github.com/jupyterlab/jupyterlab-demo,4
97,2019-09-12T05:48:00,3,gke.mybinder.org,GitHub,ipython/ipython-in-depth/master,ipython,master,,jupyter/repo2docker:0.10.0-57.ge33d5f8,https://github.com/ipython/ipython-in-depth,1
98,2020-07-23T19:54:00,4,gke.mybinder.org,GitHub,binder-examples/r/master,binder-examples,master,0893546b871b28f0f6b63aac23b3a36b73bb1bf1,jupyter/repo2docker:0.11.0-98.g8bbced7,https://github.com/binder-examples/r,17


### Total number of launches

In [7]:
# Number of rows in the launch table, reported here as the total number of launches.
db[launch_table].count

10237864

First timestamp

In [8]:
# Earliest launch timestamp in the table.
# An aggregate min() needs only one pass over the timestamp column, whereas
# pulling the first row of `SELECT * ... ORDER BY timestamp` may force SQLite to
# sort the whole table before it can return anything.
# The timestamps are ISO-8601 strings, so string order matches chronological order.
# min() skips NULL timestamps and yields None for an empty table instead of
# raising StopIteration.
first_launch_timestamp = db.conn.execute(
    f'SELECT min(timestamp) FROM {launch_table}'
).fetchone()[0]

first_launch_timestamp

'2018-11-03T00:00:00'

Last timestamp

In [9]:
# Latest launch timestamp in the table.
# An aggregate max() needs only one pass over the timestamp column, whereas
# pulling the first row of `SELECT * ... ORDER BY timestamp desc` may force
# SQLite to sort the whole table before it can return anything.
# The timestamps are ISO-8601 strings, so string order matches chronological order.
# max() yields None for an empty table instead of raising StopIteration.
last_launch_timestamp = db.conn.execute(
    f'SELECT max(timestamp) FROM {launch_table}'
).fetchone()[0]

last_launch_timestamp

'2020-08-04T23:59:00'

### Launches per origin

In [10]:
# Count the rows for each distinct value of the `origin` column.
# count(origin) ignores rows whose origin is NULL.
# The result column is literally named "count(origin)"; the next cell relies on that name.
query = f'SELECT origin, count(origin) FROM {launch_table} GROUP BY "origin";'
launches_per_origin = pd.read_sql_query(query, db.conn)
launches_per_origin

Unnamed: 0,origin,count(origin)
0,binder.mybinder.ovh,625
1,binder.mybinder.turing.ac.uk,137
2,gesis.mybinder.org,886503
3,gke.mybinder.org,5104005
4,gke.mybinder.org:443,1
5,mybinder.org,2741924
6,notebooks.gesis.org,671545
7,ovh.mybinder.org,573736
8,turing.mybinder.org,259388


In [11]:
# Fold the raw origin hostnames into the four buckets gke, gesis, turing and ovh.
# A hostname is credited to every bucket whose name occurs in it as a substring;
# the bare "mybinder.org" contains none of the bucket names and is credited to
# gke explicitly.
launches_per_origin_dict = dict.fromkeys(("gke", "gesis", "turing", "ovh"), 0)
for hostname, n_launches in zip(
    launches_per_origin["origin"], launches_per_origin["count(origin)"]
):
    if hostname == "mybinder.org":
        launches_per_origin_dict["gke"] += n_launches
    for bucket in launches_per_origin_dict:
        if bucket in hostname:
            launches_per_origin_dict[bucket] += n_launches

# Sanity check: the bucket totals must add up to the number of rows in the table.
assert sum(launches_per_origin_dict.values()) == db[launch_table].count

# One row per bucket, largest bucket first.
pd.DataFrame.from_dict(
    launches_per_origin_dict, orient="index", columns=["launches"]
).sort_values(by="launches", ascending=False)

Unnamed: 0,launches
gke,7845930
gesis,1558048
ovh,574361
turing,259525


### Launches per provider

In [12]:
# Count the rows for each provider and list the most-used provider first.
# The result column is literally named "count(provider)".
query = f'SELECT provider, count(provider) FROM {launch_table} GROUP BY "provider";'
launches_per_provider = (
    pd.read_sql_query(query, db.conn)
    .sort_values(by="count(provider)", ascending=False)
)

launches_per_provider

Unnamed: 0,provider,count(provider)
4,GitHub,9969623
2,Gist,152372
3,Git,71762
5,GitLab,42220
7,Zenodo,1043
6,Hydroshare,396
1,Figshare,313
0,Dataverse,135


In [13]:
# percentage of first 2 providers
# Percentage of all launches that belong to the first two providers.
# "First two" means the top two rows of launches_per_provider, which is sorted
# by launch count in descending order.
provider_counts = launches_per_provider["count(provider)"]
100 * sum(provider_counts.head(2)) / sum(provider_counts)

98.8682307168761

### Repos per provider

In [14]:
# Count the distinct repo URLs seen for each provider, largest count first.
# The result column is literally named "count(DISTINCT repo_url)".
query = f'SELECT provider, count(DISTINCT repo_url) FROM {launch_table} GROUP BY "provider";'
repos_per_provider = (
    pd.read_sql_query(query, db.conn)
    .sort_values(by="count(DISTINCT repo_url)", ascending=False)
)

repos_per_provider

Unnamed: 0,provider,count(DISTINCT repo_url)
4,GitHub,51757
3,Git,10696
2,Gist,1614
5,GitLab,944
7,Zenodo,92
6,Hydroshare,39
1,Figshare,31
0,Dataverse,17


In [15]:
# # this returns 197 repos for Git provider because they have github url but Git as provider, 
# # which is okay and should work 
# query = f'SELECT provider, count(DISTINCT repo_id) FROM {launch_table} GROUP BY "provider";'
# repos_per_provider = pd.read_sql_query(query, db.conn).\
#                         sort_values(by=["count(DISTINCT repo_id)"], ascending=False)

# repos_per_provider

### Organisations/Users in repos

In [16]:
# number of different orgs
# Number of different orgs in the launch table.
# The table name is taken from launch_table, like the other queries in this
# notebook, rather than being hard-coded, so changing launch_table affects
# this cell too.
query = f"SELECT count(DISTINCT org) FROM {launch_table};"
db.conn.execute(query).fetchone()[0]

35403