# Kubernetes activity reflected in GitHub activity

Plot metrics related to GitHub repo activity over time, including number of starred repos, commits etc.

Notebook is meant to be *very* portable - no use of distributed FS or processing.

In [None]:
# All imports required for the notebook - use conda or pip to install missing libs
import requests
import re
import pandas as pd
import sqlite3
import matplotlib.pyplot as plot

In [None]:
# Define functions to be used for plotting

def subset(org_and_repo):
    """Return the rows of the module-level ``data`` frame for one repository.

    org_and_repo: repository name in ``organization/repository`` form.

    Returns the rows of ``data`` whose ``organization`` and ``repository``
    columns equal the two parts of the name; the result is empty when no row
    matches.

    Raises ValueError when ``org_and_repo`` contains no ``/`` at all.
    """
    # Split on the first slash only. Names scraped from markdown links may
    # contain extra slashes; those now simply match no rows instead of
    # raising "too many values to unpack" and aborting the calling cell.
    org, repo = org_and_repo.split('/', 1)
    return data.loc[(data.organization == org) & (data.repository == repo)]

def merge_repos(repos, metric='stargazers'):
    """Combine one metric of several repositories into a single wide table.

    repos:  iterable of ``organization/repository`` names.
    metric: column to pull from each repository's rows.

    Returns a frame with a ``Date`` column plus one column per repository
    (named after the repository), outer-joined on ``Date`` and sorted by it.
    """
    merged = pd.DataFrame(columns=['Date'])
    for name in repos:
        renames = {metric: name, 'date': 'Date'}
        per_repo = subset(name)[['date', metric]].rename(columns=renames)
        # Outer join keeps dates that only some repositories have data for.
        merged = merged.merge(per_repo, on='Date', how='outer')
    return merged.sort_values(by='Date')

def plot_repos(title, repos, metric='stargazers'):
    """Plot ``metric`` over time for each repository in ``repos``.

    title:  plot title.
    repos:  list of ``organization/repository`` names; one series per entry.
    metric: column to plot.
    """
    merge_repos(repos, metric).plot(
        x='Date',
        y=repos,
        title=title,
        figsize=(15, 10),
    )

def plot_all_repos(title, org, metric='stargazers'):
    """Plot ``metric`` over time for every repository of one organization.

    title:  plot title.
    org:    organization name as stored in the ``organization`` column of
            the module-level ``data`` frame.
    metric: column to plot.
    """
    org_rows = data.loc[data.organization == org]
    full_names = []
    for repo_name in org_rows.repository.unique():
        full_names.append('/'.join((org, repo_name)))
    plot_repos(title, full_names, metric)
    


## Load data from last survey

In [None]:
# One row per repository and month: the date is normalised to the first of
# the month and the star/fork counts are the maxima seen within that month.
conn = sqlite3.connect('../../data/github.db')
data = pd.read_sql_query(
    sql="""
SELECT strftime('%Y-%m-01', date) AS date, organization, repository,
MAX(stargazers) AS stargazers, MAX(forks) AS forks
FROM github_stats
GROUP BY organization, repository, strftime('%Y-%m', date)
""",
    con=conn,
)

## Gather the latest in search keywords

"Awesome Operators" is a community-maintained list of K8s operators.

> Operators are Kubernetes native applications. We define native as being both managed using the Kubernetes APIs via kubectl and ran on Kubernetes as containers. Operators take advantage of Kubernetes’s extensibility to deliver the automation advantages of cloud services like provisioning, scaling, and backup/restore while being able to run anywhere that Kubernetes can run.

Let's grab the latest version of that list.

In [None]:
# Download the awesome-operators README and store every "[org/repo]"-style
# link label, one per line, in ../../data/awesome_operators.txt.

# Raw string: the pattern is unchanged, but "\[" and "\]" are invalid escape
# sequences in a normal string literal and trigger warnings.
awesome_re = re.compile(r"\[([^\]]+/[^\]]+)\]")

# Fetch and validate BEFORE opening the output file, so a failed or non-200
# download cannot truncate a previously saved list.
r = requests.get('https://raw.githubusercontent.com/operator-framework/awesome-operators/master/README.md', timeout=30)
r.raise_for_status()
operators = awesome_re.findall(r.text)

# The context manager closes the file even if a write fails.
with open('../../data/awesome_operators.txt', 'w') as f:
    for operator in operators:
        f.write(operator + '\n')

## Top repos by stars (number of stargazers)

In [None]:
# Read the saved operator list; the context manager closes the file handle
# (the previous open(...).readlines() never closed it).
with open('../../data/awesome_operators.txt') as names_file:
    awesome_repo_names = [line.rstrip('\n') for line in names_file]


def name_and_stars(name):
    """Return ``(name, max_stars)`` for one ``organization/repository`` name.

    ``max_stars`` is the largest ``stargazers`` value among the repository's
    rows; for a repository with no rows it is NaN, which the ``dropna()``
    below filters out.
    """
    return (name, subset(name)['stargazers'].max())


awesome_data = pd.DataFrame(
    [name_and_stars(name) for name in awesome_repo_names],
    columns=['repository', 'stargazers'],
).dropna()

# Keep the n repositories with the highest star counts.
n = 10
top_n_operators = awesome_data.sort_values(by='stargazers', ascending=False).head(n)
top_n_operators

## Top stargazers visualized

In [None]:
# Bar chart of the top-n table: one bar per repository, height = stargazers.
top_n_operators.plot(
    kind='bar',
    x='repository',
    y='stargazers',
    title='Github stars for operators listed in awesome-operators',
    figsize=(20, 10),
)

## Analytics over time

The following plots show various categories of activity over time

In [None]:
# Repositories shown in the 'Machine Learning' plot (default metric).
repos = [
    'tensorflow/tensorflow',
    'apache/incubator-mxnet',
    'Microsoft/CNTK',
    'BVLC/caffe',
    'keras-team/keras',
    'Theano/Theano',
    'amzn/amazon-dsstne',
    'pytorch/pytorch',
]
plot_repos(title='Machine Learning', repos=repos)

In [None]:
# Repositories shown in the 'Machine Learning Platforms' plot (default metric).
repos = [
    'kubeflow/kubeflow',
    'tensorflow/tfx',
]
plot_repos(title='Machine Learning Platforms', repos=repos)

In [None]:
# Repositories shown in the 'Big Data' plot (default metric).
repos = [
    'apache/spark',
    'apache/ignite',
    'apache/flink',
    'apache/beam',
    'apache/storm',
    'apache/samza',
]
plot_repos(title='Big Data', repos=repos)

In [None]:
# Repositories shown in the 'Functions as a Service' plot (default metric).
repos = [
    'fnproject/fn',
    'openfaas/faas',
    'apache/incubator-openwhisk',
    'fission/fission',
    'apex/apex',
    'serverless/serverless',
    'kubeless/kubeless',
    'knative/serving',
]

plot_repos(title='Functions as a Service', repos=repos)

In [None]:
# Repositories shown in the 'Container Orchestration' plot (default metric).
repos = [
    'mesosphere/marathon',
    'hashicorp/nomad',
    'kubernetes/kubernetes',
    'docker/swarm',
    'docker/swarmkit',
]

plot_repos(title='Container Orchestration', repos=repos)

In [None]:
# Repositories shown in the 'Data Science Notebooks' plot (default metric).
repos = [
    'jupyter/notebook',
    'apache/zeppelin',
    'jupyterhub/jupyterhub',
]

plot_repos(title='Data Science Notebooks', repos=repos)

In [None]:
# Every repository recorded for the 'kubernetes' organization.
plot_all_repos(title='Kubernetes', org='kubernetes')

In [None]:
# Every repository recorded for the 'operator-framework' organization.
plot_all_repos(title='Operator Framework', org='operator-framework')

In [None]:
# Repositories shown in the 'Operator Tools' plot (default metric).
repos = [
    'operator-framework/operator-sdk',
    'kubernetes-sigs/kubebuilder',
    'kubernetes-sigs/controller-runtime',
    'kudobuilder/kudo',
    'bluek8s/kubedirector',
    'kubedb/operator',
    'rook/operator-kit',
    'GoogleCloudPlatform/metacontroller',
    'zalando-incubator/kopf',
]

plot_repos(title='Operator Tools', repos=repos)

In [None]:
# Every repository recorded for the 'kubernetes-sigs' organization.
plot_all_repos(title='Kubernetes SIGs', org='kubernetes-sigs')

In [None]:
# Column-wise maximum per (organization, repository), ordered by ascending
# stargazer count, so the most-starred repositories are at the bottom.
most_stars = (
    data
    .groupby(['organization', 'repository'])
    .max()
    .sort_values(by=['stargazers'])
)
most_stars

In [None]:
# Every repository recorded for the 'gardener' organization.
plot_all_repos(title='Gardener', org='gardener')

In [None]:
# Repositories shown in the 'Service Mesh' plot (default metric).
repos = [
    'istio/istio',
    'linkerd/linkerd2',
]
plot_repos(title='Service Mesh', repos=repos)

In [None]:
# Fork counts over time for a set of operator tools, shown as a table
# rather than a plot.
repos = [
    'operator-framework/operator-sdk',
    'kubernetes-sigs/kubebuilder',
    'kudobuilder/kudo',
    'bluek8s/kubedirector',
    'kubedb/operator',
    'rook/operator-kit',
    'GoogleCloudPlatform/metacontroller',
    'zalando-incubator/kopf',
]

plot_data = merge_repos(repos, metric='forks')
plot_data