# Fetch all GitHub Release info for CNCF projects

In [1]:
# graphviz - suffers from a bug --> https://github.com/ContinuumIO/anaconda-issues/issues/485
%pip list | grep -E 'pandas|dask|sqlalchemy|psycopg2-binary|PyGitHub|python-dotenv|jsonpath-ng'
%pip install pandas dask sqlalchemy psycopg2-binary PyGitHub python-dotenv jsonpath-ng 

dask                               2023.10.1
dask_labextension                  7.0.0
jsonpath-ng                        1.6.0
pandas                             2.0.3
psycopg2-binary                    2.9.9
python-dotenv                      1.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip list installed -v

Package                            Version      Location                                                                Installer
---------------------------------- ------------ ----------------------------------------------------------------------- ---------
aiohttp                            3.8.6        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
aiohttp-retry                      2.8.3        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
aiosignal                          1.3.1        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
altair                             5.1.2        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
annotated-types                    0.6.0        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
anyio                              4.0.0        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages 

In [3]:
from IPython.display import display, HTML

import os
import pandas as pd
import dask
import dask.dataframe as dd

from typing import Any

import plotly.express as px
import plotly.graph_objects as go

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

from dotenv import load_dotenv
load_dotenv()

True

In [4]:
def safe_set_index(df: pd.DataFrame, idx_wanted: list[str]) -> pd.DataFrame:

    # check to see if the index is already set, else, data loss as set_index can be destructive
    idx_existing = list(df.index.names)

    if idx_wanted == idx_existing:
        print(f'\n*** WARNING: attempt to set index to what it already is thwarted! \n')
    else:
        df.set_index(idx_wanted, verify_integrity=True, inplace=True) # note: index must be unique!
        df.sort_index(inplace=True)
    return df

def split_org_repo(df:      pd.DataFrame, 
                   colname: str,
                   drop:    bool = False,
                   newcol_org_name:  str = 'org_name',
                   newcol_repo_name: str = 'repo_name') -> pd.DataFrame:
    '''split_org_repo(df, colname) - org_name/repo_name --> org_name, repo_name'''
    
    if colname is None:
        raise ValueError('split_org_repo: missing colname!')

    # https://swdevnotes.com/python/2022/extract-data-from-json-in-pandas-dataframe/
    
    # expand=True returns a dataframe  which we can rename columns on
    df_newcols = df[colname].copy().str.split(pat='/', n=1, expand=True)

    df_newcols.rename(columns={0: newcol_org_name, 1: newcol_repo_name}, inplace=True)

    if drop:
        df.drop(colname, axis=1, inplace=True)

    df = pd.concat([df,df_newcols], axis=1)
    return df

#############

def load_repos(fname: str=None, 
               splitcols: bool=False) -> pd.DataFrame:
    '''Load repos from a file'''

    with open(fname, 'r') as f:
        df = pd.DataFrame(f.readlines(), columns=['name'])

    # strip comments (note '~' negation in selector)
    df = df[~df['name'].astype(str).str.startswith('#')]

    # the universe is inperfect, as is input data
    df.name = df.name.str.rstrip(to_strip='\n')
    df.name = df.name.str.rstrip(to_strip='/')

    # {name: someOrg/someRepo} --> { name: 'someOrg/someRepo', org_name = 'someOrg', repo_name = 'someRepo'
    if splitcols:
        df = split_org_repo(df, colname='name')
    return df

In [5]:
from dask.distributed import Client

client = Client(n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 64.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:64179,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 64.00 GiB

0,1
Comm: tcp://127.0.0.1:64190,Total threads: 4
Dashboard: http://127.0.0.1:64192/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:64182,
Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-0crqtzj_,Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-0crqtzj_

0,1
Comm: tcp://127.0.0.1:64191,Total threads: 4
Dashboard: http://127.0.0.1:64195/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:64184,
Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-totdv82m,Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-totdv82m

0,1
Comm: tcp://127.0.0.1:64194,Total threads: 4
Dashboard: http://127.0.0.1:64198/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:64186,
Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-94opi8xt,Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-94opi8xt

0,1
Comm: tcp://127.0.0.1:64197,Total threads: 4
Dashboard: http://127.0.0.1:64200/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:64188,
Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-vddb9kib,Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-vddb9kib


In [6]:
# https://landscape.cncf.io/data/items.json
LANDSCAPE_FNAME = '../../landscape-items.json'

# repo_urls = [f'https://github.com/{org_repo}' for org_repo in osrb.name]
# repo_urls[:5]

use_dask = False


if use_dask:
    ddf = dd.read_json(LANDSCAPE_FNAME, orient='records', blocksize="2MB")
    ddf.compute()
    display(ddf.visualize())
    display(ddf.head())
else:
    df = pd.read_json(LANDSCAPE_FNAME, orient='records')
    display(df.head())



Unnamed: 0,name,homepage_url,repo_url,logo,twitter,crunchbase,github_data,repos,github_start_commit_data,image_data,firstCommitDate,firstCommitLink,latestCommitDate,latestCommitLink,releaseDate,releaseLink,commitsThisYear,contributorsCount,contributorsLink,language,stars,license,headquarters,latestTweetDate,description,organization,crunchbaseData,path,landscape,category,amount,oss,href,bestPracticeBadgeId,bestPracticePercentage,industries,starsPresent,starsAsText,marketCapPresent,marketCapAsText,id,flatName,member,relation,isSubsidiaryProject,project,extra,amountKind,yahoo_finance_data,allow_duplicate_repo,joined,url_for_bestpractices,project_org,open_source,enduser,stock_ticker,second_path,branch,hideLicense,unnamed_organization
0,Airship,https://www.airshipit.org/,https://github.com/airshipit/treasuremap,airship.svg,https://twitter.com/airshipproject,https://www.crunchbase.com/organization/open-i...,"{'languages': [{'name': 'Shell', 'value': 3022...",[{'url': 'https://github.com/airshipit/treasur...,{'start_commit_link': '/airshipit/treasuremap/...,"{'fileName': 'airship.svg', 'hash': 'kS96vnkPA...",2018-08-14T23:50:18Z,https://github.com/airshipit/treasuremap/commi...,2021-11-18T19:35:10Z,https://github.com/airshipit/treasuremap/commi...,2021-12-06T20:04:17Z,https://github.com/airshipit/treasuremap/releases,0,117.0,https://github.com/airshipit/treasuremap/graph...,Shell,49.0,Apache License 2.0,"Austin, Texas","{'text': '7 months ago', 'value': '900', 'orig...","Reference Airship manifests, CICD, and referen...",Open Infrastructure Foundation,"{'name': 'Open Infrastructure Foundation', 'de...",Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,,True,logos/airship.svg,0,,[],True,49.0,False,,airship,Airship,False,False,False,,,,,,,,,,,,,,,
1,Akri,https://docs.akri.sh,https://github.com/project-akri/akri,akri.svg,https://twitter.com/ProjectAkri,https://www.crunchbase.com/organization/cloud-...,"{'languages': [{'name': 'Rust', 'value': 10535...",[{'url': 'https://github.com/project-akri/akri...,{'start_commit_link': '/project-akri/akri/comm...,"{'fileName': 'akri.svg', 'hash': '2nK42JQaM8qF...",2020-10-14T00:42:19Z,https://github.com/project-akri/akri/commit/94...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/project-akri/akri/commit/98...,"{'text': '7 months ago', 'value': '900', 'orig...",https://github.com/project-akri/akri/releases,197,34.0,https://github.com/project-akri/akri/graphs/co...,Rust,996.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",A Kubernetes Resource Interface for the Edge,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,3000000.0,True,logos/akri.svg,5339,99.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,996.0,True,$3M,akri,Akri,False,sandbox,False,sandbox,"{'accepted': '2021-09-14', 'annual_review_date...",funding,,,,,,,,,,,,
2,Ansible,https://www.ansible.com/,https://github.com/ansible/ansible,ansible.svg,https://twitter.com/ansible,https://www.crunchbase.com/organization/red-hat,"{'languages': [{'name': 'Python', 'value': 927...","[{'url': 'https://github.com/ansible/ansible',...",{'start_commit_link': '/ansible/ansible/commit...,"{'fileName': 'ansible.svg', 'hash': 'XHYuAuMuT...",2012-02-23T19:17:24Z,https://github.com/ansible/ansible/commit/f314...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/ansible/ansible/commit/7d98...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/ansible/ansible/releases,959,6780.0,https://github.com/ansible/ansible/graphs/cont...,Python,58659.0,GNU General Public License v3.0,"Raleigh, North Carolina","{'text': '7 months ago', 'value': '900', 'orig...",Ansible is a radically simple IT automation pl...,Red Hat,"{'name': 'Red Hat', 'description': 'Red Hat is...",Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,,True,logos/ansible.svg,2372,66.0,"[Enterprise Software, InsurTech, Linux, Open S...",True,58659.0,True,$M,ansible,Ansible,Platinum,member,False,,,,{'effective_ticker': 'IBM'},,,,,,,,,,,
3,Apollo,https://www.apolloconfig.com/,https://github.com/apolloconfig/apollo,apollo.svg,,https://www.crunchbase.com/organization/ctrip,"{'languages': [{'name': 'Java', 'value': 22675...",[{'url': 'https://github.com/apolloconfig/apol...,{'start_commit_link': '/ctripcorp/apollo/commi...,"{'fileName': 'apollo.svg', 'hash': 'peWJyyZdPz...",2016-03-04T10:24:23Z,https://github.com/ctripcorp/apollo/commit/7f8...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/apolloconfig/apollo/commit/...,"{'text': '10 months ago', 'value': '870', 'ori...",https://github.com/apolloconfig/apollo/releases,96,170.0,https://github.com/apolloconfig/apollo/graphs/...,Java,28360.0,Apache License 2.0,"Singapore, Singapore",,Apollo is a reliable open-source configuration...,Trip.com,"{'name': 'Trip.com', 'description': 'Trip.com ...",Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,,True,logos/apollo.svg,0,,"[Hospitality, Tourism, Travel]",True,28360.0,True,$M,apollo,Apollo,False,False,False,,,,{'effective_ticker': 'CTRP.VI'},,,,,,,,,,,
4,AWS CloudFormation,https://aws.amazon.com/cloudformation/,,aws-cloudformation.svg,https://twitter.com/awscloudformer,https://www.crunchbase.com/organization/amazon...,,,,"{'fileName': 'aws-cloud-formation.svg', 'hash'...",,,,,,,0,,,,,NotOpenSource,"Seattle, Washington","{'text': '7 months ago', 'value': '900', 'orig...",AWS CloudFormation provides a common language ...,Amazon Web Services,"{'name': 'Amazon Web Services', 'description':...",Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,,False,logos/aws-cloud-formation.svg,0,,"[Consulting, DevOps, Information Services, Inf...",True,,True,$M,aws-cloud-formation,AWS CloudFormation,Platinum,member,False,,,,{'effective_ticker': 'AMZN'},,,,,,,,,,,


In [7]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2268 entries, 0 to 2267
Data columns (total 60 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      2268 non-null   object 
 1   homepage_url              2268 non-null   object 
 2   repo_url                  753 non-null    object 
 3   logo                      2268 non-null   object 
 4   twitter                   1870 non-null   object 
 5   crunchbase                2267 non-null   object 
 6   github_data               753 non-null    object 
 7   repos                     753 non-null    object 
 8   github_start_commit_data  753 non-null    object 
 9   image_data                2268 non-null   object 
 10  firstCommitDate           753 non-null    object 
 11  firstCommitLink           753 non-null    object 
 12  latestCommitDate          753 non-null    object 
 13  latestCommitLink          753 non-null    object 
 14  releaseD

In [8]:
ddf.transpose().head()

NameError: name 'ddf' is not defined

## Fetch project release data from GitHub API

In [None]:
import time
import os
import json
import pandas as pd
from typing import List
from datetime import datetime, timezone
from github import Github, GithubException

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

def fetch_repo_data(token: str, 
                    repo_list: List[str], 
                    since: datetime=None, 
                    json_file: str=None, 
                    csv_file: str=None, 
                    state_file: str=None) -> pd.DataFrame:

    # Initialize DataFrame
    df = pd.DataFrame(columns=[
        'repo_name', 'release_name', 'release_date', 
        'language', 'release_notes'
    ])

    # Initialize GitHub client
    g = Github(token)

    # Initialize loop state
    if state_file:
        try:
            with open(state_file, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            state = {'i': 0, 'repos_done': []}
    else:
        state = {'i': 0, 'repos_done': []}

    while state['i'] < len(repo_list):
        repo_str = repo_list[state['i']]

        # Skip repository if already done
        if repo_str in state['repos_done']:
            print(f"Skipping {repo_str}")
            state['i'] += 1
            continue

        # Get repository
        while True:
            try:
                repo = g.get_repo(repo_str)
                break
            except GithubException as e:
                if e.status == 404:
                    print(f"Repository {repo_str} not found")
                    break
                elif e.status == 429:
                    print(f"Rate limit exceeded, waiting for {e.headers['Retry-After']} seconds...")
                    time.sleep(int(e.headers['Retry-After']))
                else:
                    print(f"Error getting repository {repo_str}: {e}")
                    break

        if not repo:
            state['i'] += 1
            continue

        # Get all releases
        releases = repo.get_releases()
        language = repo.language

        for release in releases:
            if since is None or release.created_at >= since:
                df = pd.concat([df, pd.DataFrame({
                    'repo_name': [repo_str],
                    'release_name': [release.title],
                    'release_date': [str(release.published_at)],
                    'language': [language],
                    'release_notes': [release.body]
                })])
                print(f"Added {release.published_at}, {repo_str}::{release.title}  ")

        # Save state
        if state_file:
            state['repos_done'].append(repo_str)
            with open(state_file, 'w') as f:
                json.dump(state, f, indent=4)

        state['i'] += 1

    print (releases)
    
    # Save as CSV
    if csv_file:
        df.to_csv(csv_file, index=False)

    # Save as JSON
    if json_file:
        df.to_json(json_file, orient='records', lines=True)
    return df



In [None]:
# Example usage
token = os.environ['GITHUB_TOKEN']
repos = osrb.name
since_date = datetime(2023, 1, 1, tzinfo=timezone.utc)
json_file = "out/osrb-github-releases.json"
csv_file = "out/osrb-github-releases.csv"
state_file = "out/osrb-github-releases.state.json"

releases = fetch_repo_data(token, 
                           repos, 
                           since=since_date, 
                           json_file=json_file, 
                           csv_file=csv_file, 
                           state_file=state_file)
releases

## Visualize Releases

In [None]:
%pip install ipympl

%matplotlib inline
%matplotlib widget

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

def plot_releases_timeline(releases: pd.DataFrame):
    fig = px.timeline(releases, x_start="release_date", x_end="release_date", y="repo_name", color="language", title="GitHub Releases Timeline")
    fig.update_yaxes(autorange="reversed")
    fig.show()

def plot_releases_scatter_simple(releases: pd.DataFrame):
    # Filter releases by year
    releases_2023 = releases[releases['release_date'].dt.year == 2023]

    # Create scatter plot
    fig = px.scatter(releases_2023, x="release_date", y="repo_name", color="language", title="GitHub Releases Scatter Plot")
    fig.update_yaxes(autorange="reversed")
    fig.show()

def plot_releases_scatter(releases: pd.DataFrame):
    # Filter releases by year
    releases_2023 = releases[releases['release_date'].dt.year == 2023]

    # Group releases by organization
    releases_2023['organization'] = releases_2023['repo_name'].apply(lambda x: x.split('/')[0])

    # Create scatter plot
    fig = px.scatter(releases_2023, x="release_date", y="repo_name", color="organization", symbol="language", title="GitHub Releases Scatter Plot")
    fig.update_yaxes(autorange="reversed")
    
    fig.update_layout(showlegend=True,
                      autosize=True,
                      width=1000,
                      height=2500,
                      ),
    fig.show()

In [None]:
import pandas as pd
import os

debug_releases = releases.copy()
    
# Load releases data from CSV file
csv_file = "out/releases.csv"
if os.path.exists(csv_file):
    releases = pd.read_csv(csv_file)
else:
    print(f"CSV file {csv_file} not found")

releases.release_date = pd.to_datetime(releases.release_date)

plot_releases_scatter(releases)

In [None]:
releases_by_repo = releases[['repo_name', 'release_date']].groupby('repo_name').count()
releases_by_repo