In [15]:
import pandas as pd
import altair as alt
from pathlib import Path

In [21]:
data = []
for ifile in Path("../data").glob("*.json"):
    if ifile.name.startswith("2021"):
        data.append(pd.read_json(ifile))
data = pd.concat(data).reset_index(drop=True)

## Launches by provider

In [22]:
provider = data.groupby(['provider', 'date']).agg({'launches': 'sum'}).reset_index()
alt.Chart(provider).mark_line().encode(
    x="date",
    y=alt.Y("launches", scale=alt.Scale(type="log")),
    color="provider"
)

## Launches by origin

In [26]:
origin = data.groupby(['origin', 'date']).agg({'launches': 'sum'}).reset_index()
alt.Chart(origin).mark_line().encode(
    x="date",
    y=alt.Y("launches", scale=alt.Scale(type="log")),
    color="origin"
)

## Unique repositories

In [27]:
unique = data.groupby("provider").agg({"spec": "nunique"}).reset_index().rename(columns={"spec": "n_repos"})
alt.Chart(unique, width=400, height=200, title="Unique Repositories").mark_bar().encode(
    x='provider',
    y=alt.Y("n_repos", scale=alt.Scale(type="log")),

)

In [10]:
unique

Unnamed: 0,provider,n_repos
0,Dataverse,15
1,Figshare,22
2,Gist,1208
3,Git,40380
4,GitHub,44416
5,GitLab,899
6,Hydroshare,59
7,Zenodo,68


## Most popular repositories

In [28]:
launches = data.groupby(['spec']).agg({'launches': 'sum'}).sort_values('launches', ascending=False)
top100 = launches.head(100)

In [33]:
launches.head(100).sample(10)

Unnamed: 0_level_0,launches
spec,Unnamed: 1_level_1
lewagon/intro-to-data-science-env/master,3857
ELC/8fdc0f490b3058872a7014f01416dfb6/master,69236
spyder-ide/binder-environments/spyder-stable,16613
scikit-image/scikit-image/main,4894
https%3A%2F%2Fgit.tu-berlin.de%2Fdima%2Fisda%2Fisda-ss21.git/HEAD,4962
rationalmatter/juno-demo-notebooks/master,4153
ipydatabook/ipydatabook.github.io/gh-pages,6731
Microsoft/QuantumKatas/main,12262
tfavory/micro-notebooks/HEAD,4658
nlihin/my-binder/HEAD,3780


## Keyword search

In [62]:
bio_words = ["dna", "rna", "bio"]

In [63]:
word_subset = launches[[any(word in ii for word in bio_words) for ii in launches.reset_index()["spec"]]]
print(f"N bio-related repositories: {word_subset.shape[0]}")
word_subset.head(50)

N bio-related repositories: 445


Unnamed: 0_level_0,launches
spec,Unnamed: 1_level_1
applied-bioinformatics/built-iab/master,1586
rafelyall%2Fbiology-on-the-command-line/master,1039
AstrobioMike/binder-unix-intro/master,923
AstrobioMike/Elinne-RNA-binder/master,646
michelbierlaire/biogeme/master,518
symbionts/notebooks/master,347
AstrobioMike/binder-R-basics/master,339
ucla-biostat-257-2020spring/ucla-biostat-257-2020spring.github.io/master,331
LaCTAD/stairway-to-biopython/master,275
biosignalsplux/biosignalsnotebooks/mybinder_complete,256
