# Fetch all GitHub Release info for CNCF projects

In [4]:
# graphviz - suffers from a bug --> https://github.com/ContinuumIO/anaconda-issues/issues/485
%pip list | grep -E 'pandas|dask|sqlalchemy|psycopg2-binary|PyGitHub|python-dotenv|jsonpath-ng'
%pip install pandas dask sqlalchemy psycopg2-binary PyGitHub python-dotenv jsonpath-ng 

dask                               2023.10.1
dask_labextension                  7.0.0
jsonpath-ng                        1.6.0
pandas                             2.0.3
psycopg2-binary                    2.9.9
python-dotenv                      1.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
%pip list installed -v

Package                            Version      Location                                                                Installer
---------------------------------- ------------ ----------------------------------------------------------------------- ---------
aiohttp                            3.8.6        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
aiohttp-retry                      2.8.3        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
aiosignal                          1.3.1        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
altair                             5.1.2        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
annotated-types                    0.6.0        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages pip
anyio                              4.0.0        /Users/me/gh/cncf/landscape-graph/.venv-nb/lib/python3.11/site-packages 

In [6]:
from IPython.display import display, HTML

import os
import pandas as pd
import dask
import dask.dataframe as dd

from typing import Any

import plotly.express as px
import plotly.graph_objects as go

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

from dotenv import load_dotenv
load_dotenv()

True

In [7]:
def safe_set_index(df: pd.DataFrame, idx_wanted: list[str]) -> pd.DataFrame:

    # check to see if the index is already set, else, data loss as set_index can be destructive
    idx_existing = list(df.index.names)

    if idx_wanted == idx_existing:
        print(f'\n*** WARNING: attempt to set index to what it already is thwarted! \n')
    else:
        df.set_index(idx_wanted, verify_integrity=True, inplace=True) # note: index must be unique!
        df.sort_index(inplace=True)
    return df

def split_org_repo(df:      pd.DataFrame, 
                   colname: str,
                   drop:    bool = False,
                   newcol_org_name:  str = 'org_name',
                   newcol_repo_name: str = 'repo_name') -> pd.DataFrame:
    '''split_org_repo(df, colname) - org_name/repo_name --> org_name, repo_name'''
    
    if colname is None:
        raise ValueError('split_org_repo: missing colname!')

    # https://swdevnotes.com/python/2022/extract-data-from-json-in-pandas-dataframe/
    
    # expand=True returns a dataframe  which we can rename columns on
    df_newcols = df[colname].copy().str.split(pat='/', n=1, expand=True)

    df_newcols.rename(columns={0: newcol_org_name, 1: newcol_repo_name}, inplace=True)

    if drop:
        df.drop(colname, axis=1, inplace=True)

    df = pd.concat([df,df_newcols], axis=1)
    return df

#############

def load_repos(fname: str=None, 
               splitcols: bool=False) -> pd.DataFrame:
    '''Load repos from a file'''

    with open(fname, 'r') as f:
        df = pd.DataFrame(f.readlines(), columns=['name'])

    # strip comments (note '~' negation in selector)
    df = df[~df['name'].astype(str).str.startswith('#')]

    # the universe is inperfect, as is input data
    df.name = df.name.str.rstrip(to_strip='\n')
    df.name = df.name.str.rstrip(to_strip='/')

    # {name: someOrg/someRepo} --> { name: 'someOrg/someRepo', org_name = 'someOrg', repo_name = 'someRepo'
    if splitcols:
        df = split_org_repo(df, colname='name')
    return df

In [8]:
from dask.distributed import Client

client = Client(n_workers=4)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 60084 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:60084/status,

0,1
Dashboard: http://127.0.0.1:60084/status,Workers: 4
Total threads: 16,Total memory: 64.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:60085,Workers: 4
Dashboard: http://127.0.0.1:60084/status,Total threads: 16
Started: Just now,Total memory: 64.00 GiB

0,1
Comm: tcp://127.0.0.1:60096,Total threads: 4
Dashboard: http://127.0.0.1:60102/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:60088,
Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-e0w5fg5a,Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-e0w5fg5a

0,1
Comm: tcp://127.0.0.1:60099,Total threads: 4
Dashboard: http://127.0.0.1:60103/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:60090,
Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-qhv8ys7_,Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-qhv8ys7_

0,1
Comm: tcp://127.0.0.1:60097,Total threads: 4
Dashboard: http://127.0.0.1:60100/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:60092,
Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-bfma3png,Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-bfma3png

0,1
Comm: tcp://127.0.0.1:60098,Total threads: 4
Dashboard: http://127.0.0.1:60101/status,Memory: 16.00 GiB
Nanny: tcp://127.0.0.1:60094,
Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-e9e52o21,Local directory: /var/folders/bk/xpkns7b54cg2xd10jt1m17cm0000gn/T/dask-scratch-space/worker-e9e52o21


In [9]:
# https://landscape.cncf.io/data/items.json
LANDSCAPE_FNAME = '../../landscape-items.json'

# repo_urls = [f'https://github.com/{org_repo}' for org_repo in osrb.name]
# repo_urls[:5]

use_dask = False


if use_dask:
    ddf = dd.read_json(LANDSCAPE_FNAME, orient='records', blocksize="2MB")
    ddf.compute()
    display(ddf.visualize())
    display(ddf.head())
else:
    df = pd.read_json(LANDSCAPE_FNAME, orient='records')
    display(df.head())



ValueError: Expected object or value

In [None]:
df.info(memory_usage='deep')

<class 'dask.dataframe.core.DataFrame'>
Columns: 2077 entries, 0 to 2076
dtypes: string(2077)
memory usage: 8.9 MB


In [None]:
ddf.transpose().head()

AttributeError: 'DataFrame' object has no attribute 'transpose'

## Fetch project release data from GitHub API

In [None]:
import time
import os
import json
import pandas as pd
from typing import List
from datetime import datetime, timezone
from github import Github, GithubException

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

def fetch_repo_data(token: str, 
                    repo_list: List[str], 
                    since: datetime=None, 
                    json_file: str=None, 
                    csv_file: str=None, 
                    state_file: str=None) -> pd.DataFrame:

    # Initialize DataFrame
    df = pd.DataFrame(columns=[
        'repo_name', 'release_name', 'release_date', 
        'language', 'release_notes'
    ])

    # Initialize GitHub client
    g = Github(token)

    # Initialize loop state
    if state_file:
        try:
            with open(state_file, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            state = {'i': 0, 'repos_done': []}
    else:
        state = {'i': 0, 'repos_done': []}

    while state['i'] < len(repo_list):
        repo_str = repo_list[state['i']]

        # Skip repository if already done
        if repo_str in state['repos_done']:
            print(f"Skipping {repo_str}")
            state['i'] += 1
            continue

        # Get repository
        while True:
            try:
                repo = g.get_repo(repo_str)
                break
            except GithubException as e:
                if e.status == 404:
                    print(f"Repository {repo_str} not found")
                    break
                elif e.status == 429:
                    print(f"Rate limit exceeded, waiting for {e.headers['Retry-After']} seconds...")
                    time.sleep(int(e.headers['Retry-After']))
                else:
                    print(f"Error getting repository {repo_str}: {e}")
                    break

        if not repo:
            state['i'] += 1
            continue

        # Get all releases
        releases = repo.get_releases()
        language = repo.language

        for release in releases:
            if since is None or release.created_at >= since:
                df = pd.concat([df, pd.DataFrame({
                    'repo_name': [repo_str],
                    'release_name': [release.title],
                    'release_date': [str(release.published_at)],
                    'language': [language],
                    'release_notes': [release.body]
                })])
                print(f"Added {release.published_at}, {repo_str}::{release.title}  ")

        # Save state
        if state_file:
            state['repos_done'].append(repo_str)
            with open(state_file, 'w') as f:
                json.dump(state, f, indent=4)

        state['i'] += 1

    print (releases)
    
    # Save as CSV
    if csv_file:
        df.to_csv(csv_file, index=False)

    # Save as JSON
    if json_file:
        df.to_json(json_file, orient='records', lines=True)
    return df



In [None]:
# Example usage
token = os.environ['GITHUB_TOKEN']
repos = osrb.name
since_date = datetime(2023, 1, 1, tzinfo=timezone.utc)
json_file = "out/osrb-github-releases.json"
csv_file = "out/osrb-github-releases.csv"
state_file = "out/osrb-github-releases.state.json"

releases = fetch_repo_data(token, 
                           repos, 
                           since=since_date, 
                           json_file=json_file, 
                           csv_file=csv_file, 
                           state_file=state_file)
releases

## Visualize Releases

In [None]:
%pip install ipympl

%matplotlib inline
%matplotlib widget

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

def plot_releases_timeline(releases: pd.DataFrame):
    fig = px.timeline(releases, x_start="release_date", x_end="release_date", y="repo_name", color="language", title="GitHub Releases Timeline")
    fig.update_yaxes(autorange="reversed")
    fig.show()

def plot_releases_scatter_simple(releases: pd.DataFrame):
    # Filter releases by year
    releases_2023 = releases[releases['release_date'].dt.year == 2023]

    # Create scatter plot
    fig = px.scatter(releases_2023, x="release_date", y="repo_name", color="language", title="GitHub Releases Scatter Plot")
    fig.update_yaxes(autorange="reversed")
    fig.show()

def plot_releases_scatter(releases: pd.DataFrame):
    # Filter releases by year
    releases_2023 = releases[releases['release_date'].dt.year == 2023]

    # Group releases by organization
    releases_2023['organization'] = releases_2023['repo_name'].apply(lambda x: x.split('/')[0])

    # Create scatter plot
    fig = px.scatter(releases_2023, x="release_date", y="repo_name", color="organization", symbol="language", title="GitHub Releases Scatter Plot")
    fig.update_yaxes(autorange="reversed")
    
    fig.update_layout(showlegend=True,
                      autosize=True,
                      width=1000,
                      height=2500,
                      ),
    fig.show()

In [None]:
import pandas as pd
import os

debug_releases = releases.copy()
    
# Load releases data from CSV file
csv_file = "out/releases.csv"
if os.path.exists(csv_file):
    releases = pd.read_csv(csv_file)
else:
    print(f"CSV file {csv_file} not found")

releases.release_date = pd.to_datetime(releases.release_date)

plot_releases_scatter(releases)

In [None]:
releases_by_repo = releases[['repo_name', 'release_date']].groupby('repo_name').count()
releases_by_repo