# Analysing mybinder.org launches

The first few cells download and massage the data. Later on we answer questions on which repositories are popular and such.

In [22]:
%matplotlib inline
from IPython.display import clear_output
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import timeit
import json
import os

## Update local archive storage
<b>1.</b> Load the up-to-date list of the online archive files

In [23]:
# Read the online index (one JSON record per line) that lists the archive files.
online_index = pd.read_json(
    "https://archive.analytics.mybinder.org/index.jsonl",
    lines=True,
)

# Keep every listed file name except the final entry.
# NOTE(review): presumably the last entry is the current, still incomplete day - confirm.
online_list = online_index["name"].tolist()[:-1]

<b>2.</b> Load the list of already-existing local files

In [24]:
# Create the archive folder on first use so that listing it cannot fail.
os.makedirs('./archive', exist_ok=True)

# List the files that are already stored locally.
# Hidden entries (for example ".DS_Store") are filtered out instead of being
# removed with list.remove(), which raises ValueError when the entry is absent.
# Sorting gives a stable order, because os.listdir() returns names in
# arbitrary order.
local_index = sorted(
    file for file in os.listdir('./archive') if not file.startswith('.')
)

<b>3.</b> Compare the lists and download those that haven't been downloaded yet

In [20]:
# Substitutions applied to the 'spec' column: decode "%2F" and "%3A" and strip
# the URL scheme. They are applied in this order.
spec_replacements = (
    ('%2F', '/'),
    ('%3A', ':'),
    ('http://', ''),
    ('https://', ''),
)

for position, online_file in enumerate(online_list):
    # Skip everything that is already stored locally
    if online_file in local_index:
        continue

    # Tracking of the loop’s progress
    clear_output(wait=True)

    # Download and clean the data BEFORE opening the local file. Opening the
    # file first would leave an empty file behind when the download fails,
    # and the next run would treat that file as already downloaded.
    events = pd.read_json(f"https://archive.analytics.mybinder.org/{online_file}", lines=True)
    for encoded, decoded in spec_replacements:
        # regex=False: the patterns are plain text, not regular expressions
        events['spec'] = events['spec'].str.replace(encoded, decoded, regex=False)

    with open(f"archive/{online_file}", 'w') as local_file:
        local_file.write(events.to_json(orient='index'))

    # Register the new file so that later cells iterating over local_index
    # see it without re-listing the folder.
    local_index.append(online_file)

    # Tracking of the loop’s progress
    print("Current progress:", np.round(position / len(online_list) * 100, 2), "%")

# Final output
clear_output()
print("Current progress: Done!")

Current progress: Done!


## Analysis of the traffic data
<b>1.</b> Define time range for which traffic data should be analyzed and load them

In [27]:
# Define the time range for the analysis
date_start = datetime(2018, 11, 3) # YYYY MM DD
date_end   = datetime(2019, 12, 11) # YYYY MM DD

# Clamp an out-of-bounds date range.
# NOTE(review): 2018-11-03 is presumably the first day with archive data - confirm.
earliest_start = datetime(2018, 11, 3)
if date_start < earliest_start:
    date_start = earliest_start
# An end date in the future is pulled back to yesterday. A datetime can only
# be shifted by a timedelta; subtracting a plain integer raises TypeError.
if date_end > datetime.now():
    date_end = datetime.now() - timedelta(days=1)

# Number of days of archives included in the analysis
date_range = (date_end - date_start).days


def _archive_date(file_name):
    """Return the date encoded in an archive file name, or None.

    The year, month and day are read from the fixed character positions
    7-11, 12-14 and 15-17.
    NOTE(review): this assumes names shaped like "events-YYYY-MM-DD.jsonl" - confirm.
    Names whose characters at those positions do not form a valid date
    yield None instead of raising.
    """
    try:
        return datetime(
            int(file_name[7:11]),
            int(file_name[12:14]),
            int(file_name[15:17]),
        )
    except ValueError:
        return None


# Select the files to load up front, so that progress can be measured against
# the number of files that are actually read.
# Both bounds are exclusive: files dated exactly date_start or date_end are skipped.
selected_files = []
for file_name in local_index:
    file_date = _archive_date(file_name)
    if file_date is not None and date_start < file_date < date_end:
        selected_files.append(file_name)

# Tracking the loop’s progress. The defaults keep the final output valid even
# when no file falls inside the date range.
time_start = timeit.default_timer()
time_stop = time_start
expected_time = "Calculating..."

# Loading loop
frames = []
for loaded, file_name in enumerate(selected_files, start=1):
    # Tracking the loop’s progress
    clear_output(wait=True)

    with open(f'archive/{file_name}', 'r') as json_file:
        frames.append(pd.read_json(json_file, orient='index'))

    # Tracking the loop’s progress
    time_stop = timeit.default_timer()
    elapsed = time_stop - time_start
    fraction_done = loaded / len(selected_files)

    # Only extrapolate the total run time once at least 5 % is done
    if fraction_done * 100 >= 5:
        expected_time = str(timedelta(seconds=elapsed / fraction_done))

    print(f"Currently fetching data for {date_range} days.")
    print("Current progress:", np.round(fraction_done * 100, 2), "%")
    print("Current run time:", str(timedelta(seconds=elapsed)))
    print("Expected run time:", expected_time)

# Final output
clear_output()
print(f"Current progress: Done! Fetched data for {date_range} days")
print(f"Expected run time: {expected_time}")
print(f"Actual run time  : {str(timedelta(seconds=(time_stop - time_start)))}")

KeyboardInterrupt: 

<b>2.</b> Combine all the loaded dataframes, edit and transform some of the data

In [None]:
# Concatenation of the loaded dataframes
if not frames:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate".
    raise ValueError(
        "No archive files were loaded; check the date range and the ./archive folder."
    )

# ignore_index=True gives the combined frame one fresh 0..n-1 index; without
# it the index labels of the individual frames are kept and can repeat.
df = pd.concat(frames, sort=True, ignore_index=True)

# Helper that extracts the repository part of a binder spec
def get_repo(spec):
    """Return `spec` without its last "/"-separated component.

    A trailing ".git" on the remaining text is removed as well. A spec that
    contains no "/" is used as a whole.
    """
    head, separator, _ = spec.rpartition("/")
    repo = head if separator else spec
    return repo[:-4] if repo.endswith('.git') else repo

# Separate the components of the binder specs:
#   repo - the spec without its last "/" component (see get_repo)
#   org  - everything before the first "/"
#   ref  - everything after the last "/"
df = df.assign(
    repo=df['spec'].apply(get_repo),
    org=df['spec'].apply(lambda s: s.partition("/")[0]),
    ref=df['spec'].apply(lambda s: s.rsplit("/", 1)[1]),
)

# Drop all columns that the analysis does not use
df = df.drop(['origin', 'provider', 'schema', 'spec', 'status', 'version'], axis=1)

<b>3.</b> Preview a sample of the data 

In [None]:
# Show 10 randomly chosen rows. No random_state is passed, so the selection
# differs from run to run.
df.sample(10)

## Monthly, weekly, daily active repos

In [None]:
# Index the launches by time. The index is sorted because it is sliced by
# date labels later on: label slicing on an unsorted DatetimeIndex is
# unreliable, and recent pandas versions raise KeyError when a slice bound is
# not an exact index entry.
df_ = df.set_index("timestamp").sort_index()

In [None]:
def n_active_repos(period=30, launches=None, start=None, end=None):
    """Unique active repos over a sliding window of `period` days

    With period=30 this gives monthly active repos
    With period=1 you get daily active repos

    The window moves forward one day at a time, beginning at `start`, and
    stops as soon as it would reach past `end`.

    Parameters
    ----------
    period : int
        Window length in days.
    launches : pandas.DataFrame, optional
        Frame with a datetime index and a "repo" column. Defaults to the
        notebook-level frame `df_`.
    start : datetime, optional
        Start of the first window. Defaults to 2019-01-01.
    end : datetime, optional
        No window may end after this moment. Defaults to the current time.

    Returns
    -------
    pandas.DataFrame
        Indexed by "timestamp" (the end of each window), with one column
        "repos" holding the number of distinct repos inside the window.
        Empty when no complete window fits between `start` and `end`.
    """
    if launches is None:
        launches = df_
    if start is None:
        start = datetime(2019, 1, 1)
    if end is None:
        end = datetime.now()

    # Label slicing needs a sorted index to be reliable.
    if not launches.index.is_monotonic_increasing:
        launches = launches.sort_index()

    records = []
    for offset in range((end - start).days):
        window_start = start + timedelta(days=offset)
        window_end = start + timedelta(days=offset + period)
        if window_end > end:
            break

        # .loc label slicing includes both bounds
        window = launches.loc[window_start.isoformat():window_end.isoformat()]
        records.append({"timestamp": window_end, "repos": window["repo"].nunique()})

    # Naming the columns explicitly keeps set_index working for an empty result
    return pd.DataFrame(records, columns=["timestamp", "repos"]).set_index("timestamp")

In [None]:
# Active repos over 30-day, 7-day and 1-day windows
monthly_active, weekly_active, daily_active = (
    n_active_repos(days) for days in (30, 7, 1)
)

In [None]:
# Line the three series up on their shared timestamps (inner join on the index)
monthly_and_daily = monthly_active.join(
    daily_active, how="inner", lsuffix="_monthly", rsuffix="_daily"
)
activity = monthly_and_daily.join(weekly_active, how="inner")

# Column order after the joins: monthly, daily, weekly
activity.columns = ['30day active', '1d active', '7d active']
activity.plot();

## Total launches

In [None]:
# Sneak peek: total launches!
# `shape` is (number of rows, number of columns); the row count is the first entry.
# NOTE(review): presumably each row of `df` is one launch - confirm.
df.shape

## Launches per day

In [None]:
# Per calendar day, count the non-null entries of every column
daily = (
    df.set_index("timestamp")
      .resample('D')
      .count()
)

In [None]:
# Line plot of the per-day count held in the 'repo' column of `daily`
daily['repo'].plot()

## Estimate number of unique repositories

Expect the raw number of launches to be bigger than the number of unique repositories, which in turn is bigger than the number of repositories launched more than a few times. Repositories launched only a few times might have been accidents.

In [None]:
# Number of distinct values in the 'repo' column
len(set(df.repo))

In [None]:
from collections import Counter

# Count how often each repo appears, then count the repos that appear more than 3 times
launches_per_repo = Counter(df.repo)
len([repo for repo, launches in launches_per_repo.items() if launches > 3])

## Popular repositories and their branches

Twenty most popular repos:

In [None]:
# Per-repo counts, ordered by the 'timestamp' count (largest first), first 20 rows
top20 = (
    df.groupby("repo")
      .count()
      .sort_values(by="timestamp", ascending=False)
      .iloc[:20]
)
top20

In [None]:
# Sum of the top-20 'ref' counts, and that sum as a percentage of all rows in `df`
top20_launches = top20['ref'].sum()
top20_share = 100 * top20_launches / df.shape[0]

print("Cumulative top20 launches:", top20_launches)
print(f"The top20 repos are {top20_share:.1f}% of all launches.")