In [1]:
import pandas as pd
from sqlite_utils import Database

In [2]:
db = Database("mybinder_archive.db")

In [3]:
# get table names
db.table_names()

['mybinderlaunch', 'repo']

In [4]:
repo_table = 'repo'

In [5]:
# db[repo_table].columns
db[repo_table].columns_dict

{'id': int,
 'repo_url': str,
 'provider': str,
 'launch_count': int,
 'first_launch': str,
 'last_launch': str,
 'last_spec': str,
 'refs': str,
 'resolved_refs': str,
 'remote_id': str,
 'fork': str,
 'dockerfile': int,
 'resolved_ref_now': str,
 'image_name': str,
 'renamed': int}

In [6]:
query = f'SELECT * FROM {repo_table} ORDER BY first_launch;'
repos = pd.read_sql_query(query, db.conn)
repos = repos[
    ["id", "remote_id", "repo_url", "provider", 
     "last_spec", "launch_count", "resolved_ref_now", 
     "image_name", "fork", "dockerfile","renamed"]]

In [7]:
# def make_clickable(val):
#     return '<a target="_blank" href="{}">{}</a>'.format(val, val)

In [8]:
# repos.style.format({'repo_url': make_clickable})

In [9]:
repos.head()

Unnamed: 0,id,remote_id,repo_url,provider,last_spec,launch_count,resolved_ref_now,image_name,fork,dockerfile,renamed
0,1,3641953.0,https://github.com/ipython/ipython-in-depth,GitHub,ipython/ipython-in-depth/master,3581285,7e5ce96cc9251083979efdfc393425f1229a4a68,bp20-ipython-2dipython-2din-2ddepth-65852f:7e5...,0,0.0,0.0
1,2,99244384.0,https://github.com/qiskit/qiskit-tutorial,GitHub,QISKit/qiskit-tutorial/master,22333,e87c14c4303fc9cd9ab06727c9c49845864deb58,bp20-qiskit-2dqiskit-2dtutorial-d06a1e:e87c14c...,0,0.0,1.0
2,3,,https://github.com/eniocsj/dados,GitHub,eniocsj/dados/master,42,404,,404,,
3,4,69698350.0,https://github.com/jupyterlab/jupyterlab-demo,GitHub,jupyterlab/jupyterlab-demo/try.jupyter.org,1680558,71b616307dcb9cba18262d8e5fa46ad828ca51e4,bp20-jupyterlab-2djupyterlab-2ddemo-b2f8d0:71b...,0,0.0,0.0
4,5,21578345.0,https://github.com/bokeh/bokeh-notebooks,GitHub,bokeh/bokeh-notebooks/master,146801,7c697252c89805390d14f04eb0468162d2a65f24,bp20-bokeh-2dbokeh-2dnotebooks-6dc098:7c697252...,0,0.0,0.0


### Understanding repos table

#### Resolved refs

`create_repo_table.py` fetches resolved ref of the last spec at the time that the script is running.
`resolved_ref_now=404` means that the specs is not valid anymore.

In [10]:
repos[["repo_url", "provider", "last_spec", "resolved_ref_now", "fork"]]

Unnamed: 0,repo_url,provider,last_spec,resolved_ref_now,fork
0,https://github.com/ipython/ipython-in-depth,GitHub,ipython/ipython-in-depth/master,7e5ce96cc9251083979efdfc393425f1229a4a68,0
1,https://github.com/qiskit/qiskit-tutorial,GitHub,QISKit/qiskit-tutorial/master,e87c14c4303fc9cd9ab06727c9c49845864deb58,0
2,https://github.com/eniocsj/dados,GitHub,eniocsj/dados/master,404,404
3,https://github.com/jupyterlab/jupyterlab-demo,GitHub,jupyterlab/jupyterlab-demo/try.jupyter.org,71b616307dcb9cba18262d8e5fa46ad828ca51e4,0
4,https://github.com/bokeh/bokeh-notebooks,GitHub,bokeh/bokeh-notebooks/master,7c697252c89805390d14f04eb0468162d2a65f24,0
...,...,...,...,...,...
50260,https://github.com/zap1295/my-first-binder,GitHub,zap1295/my-first-binder/master,fb8d541160424ba26e1bd1180dd9dbbde39a850c,0
50261,https://github.com/mgill25/jupyter-playground,GitHub,mgill25/jupyter-playground/master,dabd67884a845e202b689ca695597a156b996833,0
50262,https://github.com/secregister01/datasciencemi...,GitHub,secregister01/DataScienceMiniProject.github.io...,404,404
50263,https://github.com/elewah/2d-3d-radviz-lib,GitHub,elewah/2D-3D-RadViz-lib/master,404,404


In [11]:
repos[repos["resolved_ref_now"].isnull()][["repo_url", "provider", "last_spec", "resolved_ref_now", "fork"]]

Unnamed: 0,repo_url,provider,last_spec,resolved_ref_now,fork


In [12]:
repos[repos["resolved_ref_now"] == "404"][["repo_url", "provider", "last_spec", "resolved_ref_now", "fork"]]

Unnamed: 0,repo_url,provider,last_spec,resolved_ref_now,fork
2,https://github.com/eniocsj/dados,GitHub,eniocsj/dados/master,404,404
15,https://github.com/udm88/home_work_2,GitHub,udm88/home_work_2/master,404,404
17,https://github.com/liviu-anita/calcul-numeric,GitHub,liviu-anita/Calcul-Numeric/master,404,404
38,https://github.com/bengal-tiger/youcandothermo...,GitHub,BENGAL-TIGER/YOUcanDoThermodynamics/master,404,404
39,https://github.com/marlohaering/se1-notebook,GitHub,marlohaering/SE1-Notebook/master,404,404
...,...,...,...,...,...
50243,https://github.com/hwhk/test,GitHub,hwhk/test/master,404,404
50249,https://github.com/hwhk/temtations,GitHub,hwhk/Temtations/master,404,404
50262,https://github.com/secregister01/datasciencemi...,GitHub,secregister01/DataScienceMiniProject.github.io...,404,404
50263,https://github.com/elewah/2d-3d-radviz-lib,GitHub,elewah/2D-3D-RadViz-lib/master,404,404


In [13]:
len(repos[repos["resolved_ref_now"] == "404"])

6027

if `resolved_ref_now` is not available, `image_name` is not available neither

In [14]:
len(repos[repos["image_name"].isnull()])

6032

In [15]:
# select resolved_ref_now from repo where image_name is null group by resolved_ref_now;
repos[repos["image_name"].isnull()].groupby("resolved_ref_now", as_index = False).size()

resolved_ref_now
404    6027
409       3
451       2
dtype: int64

There are repos that have same resolved ref but have different remote ids.

One reason could be that a user creates a new repo with history of another repo (without using github forking)

In [16]:
query = f'select fork, renamed, count(resolved_ref_now) as rrn_count, resolved_ref_now, '\
                 f'GROUP_CONCAT(DISTINCT repo_url) AS repo_urls '\
        f'from {repo_table} '\
        f'where fork=0 AND renamed=0 '\
        f'group by "resolved_ref_now" '\
        f'having rrn_count>1;'
pd.read_sql_query(query, db.conn)
# query = 'select fork, renamed, count(resolved_ref_now) as rrn_count, resolved_ref_now, GROUP_CONCAT(DISTINCT repo_url) AS repo_urls from repo where fork!=1 AND renamed!=1 group by "resolved_ref_now" having rrn_count>1;'
# pd.read_sql_query(query, db.conn)

Unnamed: 0,fork,renamed,rrn_count,resolved_ref_now,repo_urls
0,0,0,2,051d6dc57f10b605485bea0c3565130eaf064ec8,"https://github.com/data-8/materials-su18,https..."
1,0,0,6,20095e168f0d76da5787f914684cc3b6a58ce660,https://github.com/wshuyi/demo-spacy-text-proc...
2,0,0,2,2d2668d3b9a54546c681bc27efbbc9b326af1ab1,https://github.com/deep-diver/smartwork-with-p...
3,0,0,4,36bc4827ea1b53afd099e79a91d73614a6116bfe,"https://github.com/eminentspade/codingai-pub,h..."
4,0,0,2,3a95d118f9df5a86826e1791c5c100817f0fd924,https://github.com/udacity/deep-learning-v2-py...
5,0,0,2,3d0c70ddaa5c63606a043aaca35b64f4272651e0,"https://github.com/jason2249/section-8-a,https..."
6,0,0,117,404,https://github.com/blackarbsceo/mixture_models...
7,0,0,2,407bbe93bbfd2d0a1e3bcd10e2571a61a55661aa,"https://github.com/binder-examples/bokeh,https..."
8,0,0,2,43541f4f7f0d41b7941bc2920c6cb95b3f1ae2d3,"https://github.com/sungkeunlim/markjay4k,https..."
9,0,0,2,43730859d5bc7441de486e8f66219a001e2e13a3,"https://github.com/valquir/labaqua,https://git..."


#### Forks

In [17]:
repos.fillna(-1).groupby("fork").size()
# query = f'select fork, count(*) from {repo_table} group by "fork";'
# forks = pd.read_sql_query(query, db.conn)

fork
-1         5
0      39851
1       4674
404     5735
dtype: int64

fork=404 means that repo doesnt exist anymore.

now check those 5 repos which have no fork info

In [18]:
repos[repos["fork"].isnull()][["repo_url", "fork", "resolved_ref_now"]]

Unnamed: 0,repo_url,fork,resolved_ref_now
1035,https://github.com/gx158/python,,409
8419,https://github.com/amit112amit/notebook,,409
16581,https://github.com/doublelabyrinth/mobaxterm-k...,,451
25415,https://github.com/abhat222/data-science--chea...,,451
49030,https://github.com/juholland/smooth_with_voila2,,409


These 3 repos returns "HTTP 409: Conflict" (because they are empty)
- https://github.com/gx158/python
- https://github.com/amit112amit/notebook
- https://github.com/juholland/smooth_with_voila2

These 2 repos returns "HTTP 451: Unavailable for Legal Reasons"
- https://github.com/doublelabyrinth/mobaxterm-keygen
- https://github.com/abhat222/data-science--cheat-sheet

In [19]:
len(repos[repos["fork"] == "404"])

5735

In [20]:
len(repos[(repos["resolved_ref_now"] == "404") & (repos["fork"] == "404")])

5735

Some repos exists but resolved_ref_now of their spec doesn't

TODO what are reasons for this?
1. spec is not valid anymore, e.g. branch is deleted

In [21]:
# select fork, remote_id, last_spec, resolved_ref_now,refs,repo_url from repo where resolved_ref_now=404 and fork!=404;
repos[(repos["resolved_ref_now"] == "404") & 
      (repos["fork"] != "404")][["repo_url", "remote_id", "fork", "resolved_ref_now", "last_spec"]]

Unnamed: 0,repo_url,remote_id,fork,resolved_ref_now,last_spec
70,https://github.com/delsim/jupyter-plotly-dash,157007576,1,404,delsim/jupyter-plotly-dash/messaging
270,https://github.com/jhamman/dask-examples,156001234,1,404,jhamman/dask-examples/xarray
455,https://github.com/pymlvizard/mlpyviz,147707842,0,404,PyMLVizard/MLPyViz/develop_GD
785,https://github.com/binderhub-ci-repos/requirem...,106039107,1,404,binderhub-ci-repos/requirements/1593550219.663...
1163,https://github.com/blackarbsceo/mixture_models,85614388,0,404,BlackArbsCEO/Mixture_Models/K-Means
...,...,...,...,...,...
49727,https://github.com/ajbozarth/elyra,246361602,1,404,ajbozarth/elyra/poster
49850,https://github.com/betatim/mcl-dsci-011-progra...,275013051,1,404,betatim/MCL-DSCI-011-programming-in-python/pat...
49890,https://github.com/spinicist/quit,37066948,0,404,spinicist/QUIT/mybinder
50105,https://github.com/pykeen/pykeen,242672435,0,404,pykeen/pykeen/update-docs


In [22]:
# if fork is 404, remote_id should be null (not available)
repos[repos["remote_id"].isnull()].fillna(-1).groupby("fork", as_index = False).size()

fork
-1        5
404    5735
dtype: int64

#### Dockerfile repos

These are repos which has dockerfile as binder config.

In [23]:
repos.fillna(-1).groupby("dockerfile").size()

dockerfile
-1.0     6032
 0.0    41969
 1.0     2264
dtype: int64

In [24]:
len(repos[repos["dockerfile"].isnull()])

6032

In [25]:
# query = "select resolved_ref_now from repo where dockerfile is null group by resolved_ref_now;"
repos[repos["dockerfile"].isnull()].groupby("resolved_ref_now", as_index = False).size()
# repos[repos["dockerfile"].isnull()].value_counts("resolved_ref_now")

resolved_ref_now
404    6027
409       3
451       2
dtype: int64

In [26]:
repos[repos["dockerfile"] == 1][["provider", "repo_url", "fork", "resolved_ref_now"]]

Unnamed: 0,provider,repo_url,fork,resolved_ref_now
10,GitHub,https://github.com/annefou/metos_python,0,bbb499dff1e08ae0d2ff4ec92a6a3aa432dc2bce
23,GitHub,https://github.com/underworldcode/underworld2-...,0,0ead44853a3842941c1feacc0dfe08766c954491
27,GitHub,https://github.com/cpgonzal/nanodj,1,7381de4cd04e15c0820cd8e6311549f093ad5718
47,GitHub,https://github.com/binder-examples/jupyter-stacks,0,b33625eac93e25f8433ed56f6ad09f000f2987e8
51,GitHub,https://github.com/ncc-74205/jupyter_octave,0,44ad45b2df4b5b1a5a619f1cde5697fca8ba30be
...,...,...,...,...
50163,GitHub,https://github.com/vmmelo/testing-cin,0,a1b42e9648e60c8a231237455fd35e913f7fbe3a
50189,GitHub,https://github.com/aims/docker-example,0,97a6dae7178ad92bbd2bc4260fa0d6ec22103073
50191,GitHub,https://github.com/dbarneche/docker-example,1,bee61e45fb4f580c11df70aa8c40d84300e01d1c
50201,GitHub,https://github.com/hackfin/hdlplayground,0,f116e658050f7a2a94408d8f25c45282b6a422d6


#### Renamed repos
If repo x is renamed to y, then both rows have same "remote_id" and have "renamed" as 1

In [27]:
repos.fillna(-1).groupby("renamed").size()

renamed
-1.0     5740
 0.0    42577
 1.0     1948
dtype: int64

In [28]:
# if renamed info is not available, this means repo doesnt exists
repos[repos["renamed"].isnull()].fillna(-1).groupby("fork", as_index = False).size()

fork
-1        5
404    5735
dtype: int64

In [29]:
# list renamed repos
query = 'select fork, renamed, remote_id, GROUP_CONCAT(DISTINCT repo_url) AS repo_urls from repo where renamed=1 group by "remote_id";'
pd.read_sql_query(query, db.conn)

Unnamed: 0,fork,renamed,remote_id,repo_urls
0,0,1,102113664,"https://github.com/eamonnmag/cern-csc-2018,htt..."
1,0,1,102488943,https://github.com/belfasttechtraining/python-...
2,0,1,103228562,https://github.com/rambasnet/thinkpythonnotebo...
3,0,1,103370184,"https://github.com/stklik/crest,https://github..."
4,0,1,103943609,"https://github.com/jordan-melendez/gsum,https:..."
...,...,...,...,...
940,0,1,98531561,"https://github.com/quantstack/xwidgets,https:/..."
941,0,1,99154413,"https://github.com/swyddfa/stylo,https://githu..."
942,0,1,99244384,"https://github.com/qiskit/qiskit-tutorial,http..."
943,0,1,d47484ac43a964fcea6547f36a5fdaa0,https://gist.github.com/akuzmanoski/d47484ac43...


In [30]:
# list renamed repos (in pandas)
repos[repos["renamed"] == 1].\
    groupby(["fork", "renamed", "remote_id"])["repo_url"].\
    apply(",".join).reset_index()

Unnamed: 0,fork,renamed,remote_id,repo_url
0,0,1.0,102113664,"https://github.com/eamonnmag/cern-csc-2018,htt..."
1,0,1.0,102488943,https://github.com/belfasttechtraining/python-...
2,0,1.0,103228562,https://github.com/rambasnet/thinkpythonnotebo...
3,0,1.0,103370184,"https://github.com/stklik/crest,https://github..."
4,0,1.0,103943609,"https://github.com/jordan-melendez/gsum,https:..."
...,...,...,...,...
940,1,1.0,273921616,https://github.com/jan-janssen/lammps-tutorial...
941,1,1.0,52304286,"https://github.com/hainm/nglview-notebooks,htt..."
942,1,1.0,90872423,https://github.com/islast/brainnetworksinpytho...
943,1,1.0,95946535,https://github.com/binder-examples/julia-pytho...


### Number of repos

In [31]:
# db[repo_table].count
len(repos)

50265

### Number of unique repos

In [32]:
# renamed repos have the same remote_id
len(repos.groupby("remote_id"))

43522

In [33]:
query = f"select count(*) from (select * from {repo_table} group by remote_id);"
db.conn.execute(query).fetchone()

(43523,)

In [34]:
# unique repos which are not fork
query = f"select count(*) from (select * from {repo_table} group by remote_id) where fork=0;"
db.conn.execute(query).fetchone()

(38906,)

In [35]:
# unique repos, which are not forked or dockerfile
query = f"select count(*) from (select * from {repo_table} group by remote_id) where fork=0 and dockerfile=0;"
db.conn.execute(query).fetchone()

(36971,)