# Usage of GESIS Notebooks

In [17]:
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import math
import pandas as pd
import requests
import time

In [18]:
# Query window for the launch statistics, as ISO-8601 strings. isoformat() on
# a datetime built from a date only yields midnight, e.g. '2018-10-01T00:00:00'.
from_dt = datetime.datetime(year=2018, month=10, day=1).isoformat()
to_dt = datetime.datetime(year=2020, month=9, day=30).isoformat()
# Gallery API endpoint for all launches between the two timestamps.
url = f'https://notebooks.gesis.org/gallery/api/v1.0/launches/{from_dt}/{to_dt}'

In [None]:
launches = []
# The API is paginated (100 results per page), so every page has to be
# fetched to obtain the complete data set for the analysis.
next_page = 1
start = datetime.datetime.now()
while next_page is not None:
    params = {'origin': 'notebooks.gesis.org',
              'page': next_page}
    # A timeout keeps a stalled connection from hanging the loop forever.
    r = requests.get(url, params=params, timeout=60)

    # HTTP 429 (Too Many Requests): back off and retry the same page.
    if r.status_code == 429:
        time.sleep(1)
        continue
    # Any other HTTP error should fail loudly here instead of surfacing later
    # as a JSON decode error or a KeyError on 'launches'.
    r.raise_for_status()
    response = r.json()

    # The API also reports its per-second / per-minute query limits through a
    # "message" field; in that case wait and retry the same page.
    message = response.get("message", "")
    if message in ["2 per 1 second", "100 per 1 minute"]:
        time.sleep(1)
        continue

    launches.extend(response['launches'])
    if next_page % 100 == 0:
        # total_seconds() instead of .seconds: .seconds drops whole days, so
        # the elapsed time and the average would wrap around after 24 hours.
        elapsed = (datetime.datetime.now() - start).total_seconds()
        minutes, seconds = divmod(int(elapsed), 60)
        hours, minutes = divmod(minutes, 60)
        print(f"finished page {next_page}. Loop going for {hours}:{minutes}:{seconds}. Average time per page: {round(elapsed / next_page, 3)} seconds")
    # The API returns None for 'next_page' on the last page, which ends the loop.
    next_page = response['next_page']

In [None]:
# One row per launch record returned by the API.
data = pd.DataFrame(launches)
# Cache the download on disk so the analysis can be re-run without querying
# the API again. index=False: the default integer index carries no
# information and would otherwise come back as an extra "Unnamed: 0" column
# when the file is read with pd.read_csv.
data.to_csv("launches_data.csv", index=False)
len(data)

In [None]:
# Load the launch records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="launches_data.csv")

In [None]:
def _launches_per_repo(frame):
    """Count the rows of `frame` per 'spec'.

    Returns a DataFrame with the columns 'spec' and 'repo_counts', sorted by
    'repo_counts' in descending order.
    """
    return (frame.groupby(["spec"])
                 .size()
                 .reset_index(name='repo_counts')
                 .sort_values(by='repo_counts', ascending=False))


# The ten most frequently launched repositories.
totals_per_repo = _launches_per_repo(data)
top10 = totals_per_repo.head(10)["spec"]

# Work on a copy: `data_others = data` would only alias the frame, so the
# relabelling below would overwrite data['spec'] itself and every later cell
# that counts distinct repositories would see just the top 10 plus 'others'.
data_others = data.copy()
data_others.loc[~data_others['spec'].isin(list(top10)), 'spec'] = 'others'
totals_per_repo = _launches_per_repo(data_others)

# Move the 'others' bucket to the end so the top 10 stay in ranked order.
others_row = totals_per_repo.loc[totals_per_repo["spec"] == "others"]
totals_per_repo = totals_per_repo.loc[totals_per_repo["spec"] != "others"]
totals_per_repo = pd.concat([totals_per_repo, others_row])
totals_per_repo

In [None]:
# Bar chart of the launch counts per repository, in the row order of
# totals_per_repo.
fig, ax = plt.subplots()
ax.bar(totals_per_repo["spec"], totals_per_repo["repo_counts"])
ax.set_title("Most frequently launched repositories on notebooks.gesis.org")
ax.set_ylabel("Count of launches")
ax.set_xlabel("Repository")
# Rotate the repository names so the tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.show()

## Total launches, number of repositories


In [None]:
# Calendar day of every launch.
launch_ts = pd.to_datetime(data['timestamp'])
data['date'] = launch_ts.dt.date

# Earliest launch of each repository: sort chronologically (stable sort) and
# keep the first row per spec. Grouping those rows by day gives the
# repositories that appeared for the first time on that day.
first_launches = (data.assign(launch_ts=launch_ts)
                      .sort_values('launch_ts', kind='mergesort')
                      .drop_duplicates(subset='spec', keep='first'))
new_repos_by_day = first_launches.groupby('date')['spec'].apply(list).to_dict()

# One entry per distinct day, in chronological order. (Iterating over
# data['date'] directly would visit every launch row instead of every day, so
# the result would not line up with the days on the x-axis.)
days = sorted(data['date'].dropna().unique())
repo_list = [new_repos_by_day.get(day, []) for day in days]

# Running total of distinct repositories seen up to and including each day.
cum_repo_list = []
running_total = 0
for new_repos in repo_list:
    running_total += len(new_repos)
    cum_repo_list.append(running_total)

plt.plot(days, cum_repo_list)
plt.title("Cumulative number of distinct repositories launched")
plt.xlabel("Date")
plt.ylabel("Repositories")
plt.show()

In [12]:
# Number of distinct values in the 'spec' column.
len(pd.unique(data["spec"]))

10413