# Fetch all GitHub Release info for CNCF projects

In [1]:
#!wget https://landscape.cncf.io/data/items.json

In [2]:
%%bash 
# array of JSON --> JSONL
# `.[]` iterates the top-level array in items.json; `-c` writes each element
# as compact single-line JSON, so items.jsonl holds one landscape item per line.
jq  -c '.[]' items.json > items.jsonl

# Sanity check: file size and line count (one line per item) of the result.
ls -lahF items.jsonl
wc -l items.jsonl

-rw-r--r--  1 matt  staff   9.7M Nov 16 14:46 items.jsonl
    2267 items.jsonl


In [3]:
%%bash
# landscape | select(CNCF Projects) --> cncf-projects.jsonl
# Keep only the items whose `relation` field is one of the three values
# "graduated", "incubating" or "sandbox"; every other item is filtered out.
jq -c 'select(.relation == "graduated" or .relation == "incubating" or .relation == "sandbox")' items.jsonl > cncf-projects.jsonl 

# Sanity check: file size and line count (one line per kept item) of the result.
ls -lahF cncf-projects.jsonl
wc -l cncf-projects.jsonl

-rw-r--r--  1 matt  staff   3.9M Nov 16 14:46 cncf-projects.jsonl
     178 cncf-projects.jsonl


## helpers

In [4]:
import pandas as pd

pd.set_option('display.max_rows', 1024)
pd.set_option('display.max_columns', 512)
pd.set_option('display.width', 1024)

from dotenv import load_dotenv
load_dotenv()

def safe_set_index(df:         pd.DataFrame, 
                   idx_wanted: list[str]) -> pd.DataFrame:
    '''safe_set_index(df, idx_wanted) - set the index without losing the existing one.

    The frame is modified IN PLACE and also returned for convenience.

    Args:
        df:         frame to re-index.
        idx_wanted: column names that should become the (multi-)index.

    Returns:
        The same ``df`` object, indexed by ``idx_wanted`` and sorted by index.

    Raises:
        ValueError: if the requested index would contain duplicates
                    (``verify_integrity=True``).
        KeyError:   if a name in ``idx_wanted`` is not a column.
    '''

    # set_index discards the current index, so a meaningful one must first be
    # moved back into the columns.
    idx_existing = list(df.index.names)

    if idx_wanted == idx_existing:
        print(f'\n*** WARNING: attempt to set index to what it already is thwarted! \n')
    else:
        # df.index.names is never an empty list (an unnamed index reports
        # [None]), so it cannot be used to tell whether there is anything
        # worth preserving.  A pristine, unnamed 0..n-1 RangeIndex carries no
        # information; resetting it would only add a junk 'index' column.
        is_default_index = (isinstance(df.index, pd.RangeIndex)
                            and df.index.name is None
                            and df.index.start == 0
                            and df.index.step == 1)

        if not is_default_index:
            print(f'*** existing index found: {idx_existing}, resetting to avoid data loss prior to setting to: {idx_wanted}')
            df.reset_index(inplace=True)

        # note: index must be unique!
        df.set_index(idx_wanted, verify_integrity=True, inplace=True)

    df.sort_index(inplace=True)
    return df

def split_org_repo(df:      pd.DataFrame, 
                   colname: str,
                   drop:    bool = False,
                   newcol_org_name:  str = 'org_name',
                   newcol_repo_name: str = 'repo_name') -> pd.DataFrame:
    '''split_org_repo(df, colname) - org_name/repo_name --> org_name, repo_name

    Splits the string column ``colname`` on its first '/' and returns a NEW
    frame with the two halves appended as extra columns.  The input frame is
    never modified.

    Args:
        df:               frame containing ``colname``.
        colname:          column holding 'org/repo' strings.
        drop:             when True, ``colname`` is omitted from the result.
        newcol_org_name:  name for the part before the first '/'.
        newcol_repo_name: name for the part after the first '/'.

    Returns:
        A new DataFrame: ``df`` (minus ``colname`` if ``drop``) plus the two
        new columns.  Values without a '/' get a missing repo name.

    Raises:
        ValueError: if ``colname`` is None.
    '''

    if colname is None:
        raise ValueError('split_org_repo: missing colname!')

    # expand=True returns df which can rename columns on # https://swdevnotes.com/python/2022/extract-data-from-json-in-pandas-dataframe
    # reindex guarantees both output columns exist: when no value contains a
    # '/', the split yields a single column and the repo column would
    # otherwise be silently absent from the result.
    df_newcols = (df[colname]
                  .str.split(pat='/', n=1, expand=True)
                  .reindex(columns=[0, 1])
                  .rename(columns={0: newcol_org_name, 1: newcol_repo_name}))

    # Non-mutating drop: the caller's frame must not change as a side effect.
    df_base = df.drop(columns=colname) if drop else df

    return pd.concat([df_base, df_newcols], axis=1)

## Load the Landscape 

In [5]:
import os

file_path = './cncf-projects.jsonl'

# Fail fast with a clear error: merely printing a message here would let the
# cell continue and crash on the next line with an unrelated NameError for
# the never-assigned df_orig.
if not (os.path.exists(file_path) and os.path.getsize(file_path) > 0):
    raise FileNotFoundError(f"File {file_path} does not exist or is empty.")

df_orig = pd.read_json(file_path, lines=True)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      178 non-null    object 
 1   homepage_url              178 non-null    object 
 2   project                   178 non-null    object 
 3   repo_url                  176 non-null    object 
 4   logo                      178 non-null    object 
 5   twitter                   174 non-null    object 
 6   crunchbase                178 non-null    object 
 7   extra                     176 non-null    object 
 8   github_data               176 non-null    object 
 9   repos                     176 non-null    object 
 10  github_start_commit_data  176 non-null    object 
 11  image_data                178 non-null    object 
 12  firstCommitDate           176 non-null    object 
 13  firstCommitLink           176 non-null    object 
 14  latestComm

In [6]:
df_orig.head(3)

Unnamed: 0,name,homepage_url,project,repo_url,logo,twitter,crunchbase,extra,github_data,repos,github_start_commit_data,image_data,firstCommitDate,firstCommitLink,latestCommitDate,latestCommitLink,releaseDate,releaseLink,commitsThisYear,contributorsCount,contributorsLink,language,stars,license,headquarters,latestTweetDate,description,organization,crunchbaseData,path,landscape,category,amountKind,amount,oss,href,bestPracticeBadgeId,bestPracticePercentage,industries,starsPresent,starsAsText,marketCapPresent,marketCapAsText,id,flatName,member,relation,isSubsidiaryProject,allow_duplicate_repo,project_org,joined,enduser,url_for_bestpractices,open_source,second_path
0,Akri,https://docs.akri.sh,sandbox,https://github.com/project-akri/akri,akri.svg,https://twitter.com/ProjectAkri,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2021-09-14', 'annual_review_date...","{'languages': [{'name': 'Rust', 'value': 10535...",[{'url': 'https://github.com/project-akri/akri...,{'start_commit_link': '/project-akri/akri/comm...,"{'fileName': 'akri.svg', 'hash': '2nK42JQaM8qF...",2020-10-14T00:42:19Z,https://github.com/project-akri/akri/commit/94...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/project-akri/akri/commit/98...,"{'text': '7 months ago', 'value': '900', 'orig...",https://github.com/project-akri/akri/releases,197,34.0,https://github.com/project-akri/akri/graphs/co...,Rust,996.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",A Kubernetes Resource Interface for the Edge,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/akri.svg,5339,99.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,996,True,$3M,akri,Akri,False,sandbox,False,,,,,,,
1,CDK for Kubernetes (CDK8s),https://cdk8s.io/,sandbox,https://github.com/cdk8s-team/cdk8s,cdk8s.svg,https://twitter.com/CloudNativeFdn,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2020-11-10', 'annual_review_url'...","{'languages': [{'name': 'JavaScript', 'value':...",[{'url': 'https://github.com/cdk8s-team/cdk8s'...,{'start_commit_link': '/cdk8s-team/cdk8s/commi...,"{'fileName': 'cdk-for-kubernetes-cdk8s.svg', '...",2019-07-25T12:45:22Z,https://github.com/cdk8s-team/cdk8s/commit/ec7...,"{'text': 'about a month', 'value': '960', 'ori...",https://github.com/cdk8s-team/cdk8s/commit/6b9...,2021-10-13T20:21:32Z,https://github.com/cdk8s-team/cdk8s/releases,434,78.0,https://github.com/cdk8s-team/cdk8s/graphs/con...,JavaScript,3818.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",CDK8s lets you define Kubernetes apps and comp...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/cdk-for-kubernetes-cdk8s.svg,0,,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,3818,True,$3M,cdk-for-kubernetes-cdk8s,CDK for Kubernetes (CDK8s),False,sandbox,False,,,,,,,
2,Cloud Custodian,https://cloudcustodian.io/,incubating,https://github.com/cloud-custodian/cloud-custo...,cloud-custodian.svg,https://twitter.com/CloudNativeFdn,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2020-06-25', 'dev_stats_url': 'h...","{'languages': [{'name': 'Python', 'value': 771...",[{'url': 'https://github.com/cloud-custodian/c...,{'start_commit_link': '/cloud-custodian/cloud-...,"{'fileName': 'cloud-custodian.svg', 'hash': '0...",2015-07-16T14:19:15Z,https://github.com/cloud-custodian/cloud-custo...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/cloud-custodian/cloud-custo...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/cloud-custodian/cloud-custo...,522,441.0,https://github.com/cloud-custodian/cloud-custo...,Python,4951.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...","Rules engine for cloud security, cost optimiza...",Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/cloud-custodian.svg,3402,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,4951,True,$3M,cloud-custodian,Cloud Custodian,False,incubating,False,,,,,,,


In [7]:
# Narrow the landscape frame down to the columns this analysis needs
# (pattern: df2 = df[['B','D','F']].rename({'B':'X','D':'Y','F':'Z'}, axis=1)).
wanted_columns = [
    'relation',
    'category',
    'path',
    'name',
    'repo_url',
    # 'repos' is deliberately left out: this embedded array is in practice
    # always a single element.
    'github_data',
    'extra',
    'commitsThisYear',
    'contributorsCount',
    'stars',
    'headquarters',
    'industries',
    'image_data',
]
df = df_orig.loc[:, wanted_columns].copy()

old_shape = df.shape

# 'https://github.com/<org>/<repo>' --> '<org>/<repo>' --> org_name, repo_name
repo_slug = df['repo_url'].astype('string').str.removeprefix('https://github.com/')
df['repo'] = repo_slug
df = split_org_repo(df, 'repo')

# TODO why do we have nulls...178 to 174
# NOTE(review): dropna() without `subset=` removes a row when ANY selected
# column is null, not only when repo_url is missing.  repo_url alone accounts
# for 2 of the dropped rows (176 non-null of 178); the other 2 presumably
# have a null in some other selected column - confirm.
df = df.dropna()
print(f'*** {old_shape} --> {df.shape}')

df.head(3)

*** (178, 13) --> (174, 16)


Unnamed: 0,relation,category,path,name,repo_url,github_data,extra,commitsThisYear,contributorsCount,stars,headquarters,industries,image_data,repo,org_name,repo_name
0,sandbox,Provisioning,Provisioning / Automation & Configuration,Akri,https://github.com/project-akri/akri,"{'languages': [{'name': 'Rust', 'value': 10535...","{'accepted': '2021-09-14', 'annual_review_date...",197,34.0,996.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'akri.svg', 'hash': '2nK42JQaM8qF...",project-akri/akri,project-akri,akri
1,sandbox,Provisioning,Provisioning / Automation & Configuration,CDK for Kubernetes (CDK8s),https://github.com/cdk8s-team/cdk8s,"{'languages': [{'name': 'JavaScript', 'value':...","{'accepted': '2020-11-10', 'annual_review_url'...",434,78.0,3818.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'cdk-for-kubernetes-cdk8s.svg', '...",cdk8s-team/cdk8s,cdk8s-team,cdk8s
2,incubating,Provisioning,Provisioning / Automation & Configuration,Cloud Custodian,https://github.com/cloud-custodian/cloud-custo...,"{'languages': [{'name': 'Python', 'value': 771...","{'accepted': '2020-06-25', 'dev_stats_url': 'h...",522,441.0,4951.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'cloud-custodian.svg', 'hash': '0...",cloud-custodian/cloud-custodian,cloud-custodian,cloud-custodian


In [8]:
# clean up path to strip out the category (path := category / subpath)
# The separator is ' / ', so splitting on '/' alone leaves a leading space on
# the subpath (e.g. ' Runtimes'); strip it so index lookups by subpath match.
df['subpath'] = df['path'].str.split('/').str[-1].str.strip()
df.head(3)



Unnamed: 0,relation,category,path,name,repo_url,github_data,extra,commitsThisYear,contributorsCount,stars,headquarters,industries,image_data,repo,org_name,repo_name,subpath
0,sandbox,Provisioning,Provisioning / Automation & Configuration,Akri,https://github.com/project-akri/akri,"{'languages': [{'name': 'Rust', 'value': 10535...","{'accepted': '2021-09-14', 'annual_review_date...",197,34.0,996.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'akri.svg', 'hash': '2nK42JQaM8qF...",project-akri/akri,project-akri,akri,Automation & Configuration
1,sandbox,Provisioning,Provisioning / Automation & Configuration,CDK for Kubernetes (CDK8s),https://github.com/cdk8s-team/cdk8s,"{'languages': [{'name': 'JavaScript', 'value':...","{'accepted': '2020-11-10', 'annual_review_url'...",434,78.0,3818.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'cdk-for-kubernetes-cdk8s.svg', '...",cdk8s-team/cdk8s,cdk8s-team,cdk8s,Automation & Configuration
2,incubating,Provisioning,Provisioning / Automation & Configuration,Cloud Custodian,https://github.com/cloud-custodian/cloud-custo...,"{'languages': [{'name': 'Python', 'value': 771...","{'accepted': '2020-06-25', 'dev_stats_url': 'h...",522,441.0,4951.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'cloud-custodian.svg', 'hash': '0...",cloud-custodian/cloud-custodian,cloud-custodian,cloud-custodian,Automation & Configuration


In [11]:
safe_set_index(df, idx_wanted=['relation', 'category', 'subpath', 'name'])

*** existing index found: ['category', 'subpath', 'relation', 'name'], resetting to avoid data loss prior to setting to: ['relation', 'category', 'subpath', 'name']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,index,path,repo_url,github_data,extra,commitsThisYear,contributorsCount,stars,headquarters,industries,image_data,repo,org_name,repo_name
relation,category,subpath,name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
graduated,App Definition and Development,Application Definition & Image Build,Helm,124,App Definition and Development / Application D...,https://github.com/helm/helm,"{'languages': [{'name': 'Go', 'value': 1807584...","{'accepted': '2018-06-01', 'incubating': '2018...",407,766.0,24953.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'helm.svg', 'hash': 'KhA6o4K6XUSq...",helm/helm,helm,helm
graduated,App Definition and Development,Continuous Integration & Delivery,Argo,139,App Definition and Development / Continuous In...,https://github.com/argoproj/argo-cd,"{'languages': [{'name': 'Go', 'value': 5524117...","{'accepted': '2020-03-26', 'incubating': '2020...",1338,1154.0,14145.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'argo.svg', 'hash': 'Xb4OvEH8Blm9...",argoproj/argo-cd,argoproj,argo-cd
graduated,App Definition and Development,Continuous Integration & Delivery,Flux,140,App Definition and Development / Continuous In...,https://github.com/fluxcd/flux2,"{'languages': [{'name': 'Go', 'value': 970748,...","{'accepted': '2019-07-15', 'incubating': '2021...",689,140.0,5300.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'flux.svg', 'hash': 'h7GovC0iLE2t...",fluxcd/flux2,fluxcd,flux2
graduated,App Definition and Development,Database,TiKV,110,App Definition and Development / Database,https://github.com/tikv/tikv,"{'languages': [{'name': 'Rust', 'value': 21095...","{'accepted': '2018-08-28', 'incubating': '2018...",690,412.0,13617.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'ti-kv.svg', 'hash': 'LgnxjW/bCbp...",tikv/tikv,tikv,tikv
graduated,App Definition and Development,Database,Vitess,111,App Definition and Development / Database,https://github.com/vitessio/vitess,"{'languages': [{'name': 'Go', 'value': 2680572...","{'accepted': '2018-02-05', 'incubating': '2018...",2283,379.0,16897.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'vitess.svg', 'hash': '/rYtR/SZOC...",vitessio/vitess,vitessio,vitess
graduated,Observability and Analysis,Logging,Fluentd,167,Observability and Analysis / Logging,https://github.com/fluent/fluentd,"{'languages': [{'name': 'Ruby', 'value': 32604...","{'accepted': '2016-11-08', 'incubating': '2016...",220,261.0,12199.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'fluentd.svg', 'hash': '4rqVszCNG...",fluent/fluentd,fluent,fluentd
graduated,Observability and Analysis,Monitoring,Prometheus,163,Observability and Analysis / Monitoring,https://github.com/prometheus/prometheus,"{'languages': [{'name': 'Go', 'value': 5659944...","{'accepted': '2016-05-09', 'incubating': '2016...",1135,973.0,49987.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'prometheus.svg', 'hash': 'I4EIFd...",prometheus/prometheus,prometheus,prometheus
graduated,Observability and Analysis,Tracing,Jaeger,168,Observability and Analysis / Tracing,https://github.com/jaegertracing/jaeger,"{'languages': [{'name': 'Go', 'value': 2890209...","{'accepted': '2017-09-13', 'incubating': '2017...",404,293.0,18318.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'jaeger.svg', 'hash': 'liGBqZr114...",jaegertracing/jaeger,jaegertracing,jaeger
graduated,Orchestration & Management,Coordination & Service Discovery,CoreDNS,92,Orchestration & Management / Coordination & Se...,https://github.com/coredns/coredns,"{'languages': [{'name': 'Go', 'value': 1675138...","{'accepted': '2017-02-27', 'incubating': '2018...",341,370.0,11071.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'core-dns.svg', 'hash': '/wdaTejm...",coredns/coredns,coredns,coredns
graduated,Orchestration & Management,Coordination & Service Discovery,etcd,93,Orchestration & Management / Coordination & Se...,https://github.com/etcd-io/etcd,"{'languages': [{'name': 'Go', 'value': 5046207...","{'accepted': '2018-12-11', 'incubating': '2018...",1761,968.0,44537.0,"San Francisco, California","[Cloud Computing, Cloud Infrastructure, Non Pr...","{'fileName': 'etcd.svg', 'hash': 'Qwqz65rvhOJl...",etcd-io/etcd,etcd-io,etcd


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 174 entries, ('graduated', 'App Definition and Development', ' Application Definition & Image Build', 'Helm') to ('sandbox', 'Wasm', ' Runtimes', 'WasmEdge (Wasm)')
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              174 non-null    int64  
 1   path               174 non-null    object 
 2   repo_url           174 non-null    object 
 3   github_data        174 non-null    object 
 4   extra              174 non-null    object 
 5   commitsThisYear    174 non-null    int64  
 6   contributorsCount  174 non-null    float64
 7   stars              174 non-null    float64
 8   headquarters       174 non-null    object 
 9   industries         174 non-null    object 
 10  image_data         174 non-null    object 
 11  repo               174 non-null    string 
 12  org_name           174 non-null    string 
 13  repo_name          174 non-

## Just the Observability Domain (for now)

In [None]:
# TODO: make this for all tags
TAG = 'Observability and Analysis'

# The org-list cell that follows reads `df_otag`, so this selection has to
# actually run for a fresh Restart & Run All to work.  It is built from `df`
# (the earlier draft referred to a `df_mini` that is not defined anywhere in
# this notebook).  query() resolves `category` whether it is currently a
# column or a level of the MultiIndex.
df_otag = df.query("category == @TAG").copy()
df_otag.info()

## Generate the full repo list (for each org, enumerate its repos)

In [None]:
orgs = df_otag.org_name.drop_duplicates().tolist()
orgs

## Fetch project release data from GitHub API

In [None]:
import time
import os
import json
import pandas as pd
from typing import List
from datetime import datetime, timezone
from github import Github, GithubException

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

def fetch_repo_data(token: str, 
                    repo_list: List[str], 
                    since: datetime=None, 
                    json_file: str=None, 
                    csv_file: str=None, 
                    state_file: str=None) -> pd.DataFrame:
    """Collect GitHub release metadata for every repository in ``repo_list``.

    Args:
        token:      GitHub API token handed to the PyGithub client.
        repo_list:  repositories as 'org/repo' strings.
        since:      when given, only releases whose ``created_at`` is on or
                    after this moment are kept.  A naive datetime is treated
                    as UTC.
        json_file:  when given, the result is also written here as JSON Lines.
        csv_file:   when given, the result is also written here as CSV.
        state_file: when given, loop progress (current position plus the
                    repositories already processed) is loaded from and saved
                    to this JSON file, and already-processed repos are skipped.

    Returns:
        DataFrame with one row per kept release and the columns
        ``repo_name``, ``release_name``, ``release_date``, ``language``.
        Only releases fetched during THIS call are included; repositories
        skipped because of ``state_file`` contribute no rows.
    """

    def _as_utc(moment):
        # Naive and tz-aware datetimes cannot be compared with each other;
        # treat naive values as UTC so `since` and the API timestamps line up
        # whichever flavour each side happens to be.
        if moment is not None and moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    columns = ['repo_name', 'release_name', 'release_date', 'language']

    # Rows are accumulated in a plain list and turned into a frame once at the
    # end; concatenating a one-row frame per release is quadratic and leaves
    # every row with the same index label 0.
    records = []
    since = _as_utc(since)

    # Initialize GitHub client
    g = Github(token)

    # Initialize loop state
    state = {'i': 0, 'repos_done': []}
    if state_file:
        try:
            with open(state_file, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            pass  # first run: start from scratch

    # Loop over repositories
    while state['i'] < len(repo_list):
        repo_str = repo_list[state['i']]

        if repo_str in state['repos_done']:
            print(f"Skipping: {repo_str}")
            state['i'] += 1
            continue

        # Reset for every repository: without this, a failed lookup would
        # leave `repo` bound to the PREVIOUS repository (or unbound on the
        # first iteration) and its releases would be recorded under the wrong
        # name.
        repo = None
        while True:
            try:
                repo = g.get_repo(repo_str)
                break
            except GithubException as e:
                if e.status == 404:
                    print(f"Repository {repo_str} not found")
                    break
                elif e.status == 429:
                    # The header may be absent, and its key casing depends on
                    # how the client exposes response headers; look up both
                    # spellings and fall back to a 60 second pause.
                    headers = e.headers or {}
                    retry_after = int(headers.get('Retry-After',
                                                  headers.get('retry-after', 60)))
                    print(f"Rate limit exceeded, waiting for {retry_after} seconds...")
                    time.sleep(retry_after)
                else:
                    print(f"Error getting repository {repo_str}: {e}")
                    break

        if repo is None:
            state['i'] += 1
            continue

        # Get all releases
        language = repo.language

        for release in repo.get_releases():
            # NOTE(review): filtering uses created_at while the stored date is
            # published_at - kept as-is, confirm this is intended.
            if since is None or _as_utc(release.created_at) >= since:
                records.append({
                    'repo_name': repo_str,
                    'release_name': release.title,
                    'release_date': str(release.published_at),
                    'language': language,
                })
                print(f"Added {release.published_at}, {repo_str}::{release.title}  ")

        # Save state
        if state_file:
            state['repos_done'].append(repo_str)
            with open(state_file, 'w') as f:
                json.dump(state, f, indent=4)

        state['i'] += 1

    df = pd.DataFrame(records, columns=columns)

    # Save as CSV
    if csv_file:
        df.to_csv(csv_file, index=False)

    # Save as JSON
    if json_file:
        df.to_json(json_file, orient='records', lines=True)
    return df

In [None]:
repos_by_relation

In [None]:
!mkdir -p out

In [None]:
def fetch_one(token, since_date, level, repos):
    """Fetch releases for one group of repositories and persist them under out/.

    Args:
        token:      GitHub API token.
        since_date: only releases created on or after this datetime are kept.
        level:      label used in the output file names,
                    ``out/cncf-{level}-github-releases.{json,csv}``.
        repos:      repositories as 'org/repo' strings.

    Returns:
        The DataFrame of fetched releases, so the caller can keep working with
        the data instead of re-reading the files.
    """

    json_file=f'out/cncf-{level}-github-releases.json' 
    csv_file=f'out/cncf-{level}-github-releases.csv'

    # No state file is passed on: fetch_repo_data starts every call with an
    # empty result, so resuming from saved state would rewrite the files above
    # with only the repositories fetched in the resumed run.

    print(f"Fetching {len(repos)} repositories for {level} projects")

    return fetch_repo_data(token,
                           repos,
                           since=since_date,
                           json_file=json_file,
                           csv_file=csv_file)

## Get Release Info from the REST API

_Note: the GitHub GraphQL API doesn't support filtering by date._

In [None]:
token = os.environ['GITHUB_TOKEN']
since_date = datetime(2022, 11, 7, tzinfo=timezone.utc)

fetch_one(token, since_date, 'otag-graduated', graduated)

In [None]:
fetch_one(token, since_date, 'otag-incubating', incubating)

In [None]:
fetch_one(token, since_date, 'otag-sandbox', sandbox)

In [None]:
%pip install ipympl

%matplotlib inline
%matplotlib widget

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

# def plot_releases_timeline(releases: pd.DataFrame):
#     fig = px.timeline(releases, x_start="release_date", x_end="release_date", y="repo_name", color="language", title="GitHub Releases Timeline")
#     fig.update_yaxes(autorange="reversed")
#     fig.show()

# def plot_releases_scatter_simple(releases: pd.DataFrame):
#     # Filter releases by year
#     releases_2023 = releases[releases['release_date'].dt.year == 2023]

#     # Create scatter plot
#     fig = px.scatter(releases_2023, x="release_date", y="repo_name", color="language")
#     fig.update_yaxes(autorange="reversed")
#     fig.show()

def plot_releases_scatter(releases: pd.DataFrame, title: str=None):
    """Show a scatter plot of releases over time, one row per repository.

    Points are coloured by organization (the part of ``repo_name`` before the
    first '/') and use a different symbol per language.

    Args:
        releases: frame with ``release_date``, ``repo_name`` and ``language``
                  columns.  It is not modified.
        title:    figure title; defaults to "GitHub Releases Timeline".
    """
    if title is None:
        title = "GitHub Releases Timeline"

    # Group releases by organization.  assign() works on a copy, so the
    # caller's frame does not silently gain an 'organization' column.
    plot_data = releases.assign(
        organization=releases['repo_name'].apply(lambda x: x.split('/')[0])
    )

    # Create scatter plot, using the requested title.
    fig = px.scatter(plot_data, x="release_date", y="repo_name", color="organization", symbol="language", title=title)
    fig.update_yaxes(autorange="reversed")

    # Height is left to plotly; a fixed height=2500 was tried previously.
    fig.update_layout(showlegend=True,
                      autosize=True,
                      width=1000)
    fig.show()

In [None]:
import pandas as pd
import os

def json_to_csv(json_file_path: str) -> None:
    """
    Convert a JSON Lines file to CSV.

    The CSV is written next to the input, with the same base name and a
    '.csv' extension, and without the DataFrame index column.
    """
    base_name, _extension = os.path.splitext(json_file_path)
    records = pd.read_json(json_file_path, lines=True)
    records.to_csv(base_name + '.csv', index=False)

In [None]:
# The level labels must match the ones passed to fetch_one() above
# ('otag-graduated', ...); otherwise the JSON files looked for here are never
# written by this notebook.
for level in ['otag-graduated', 'otag-incubating', 'otag-sandbox']:
    json_to_csv(f'out/cncf-{level}-github-releases.json')

In [None]:
def plot_releases_from_csv(csv_file: str, title: str) -> None:
    """Load releases from ``csv_file`` and show them as a scatter plot.

    Args:
        csv_file: path to a CSV with a ``release_date`` column (plus whatever
                  columns plot_releases_scatter needs).
        title:    title passed through to the plot.

    Prints a message and returns without plotting when the file is missing.
    """
    # The path argument is used as given; rebuilding it from the global
    # `level` would ignore the caller's path and raise NameError whenever no
    # such global exists.
    if not os.path.exists(csv_file):
        print(f"CSV file {csv_file} not found")
        return

    df_releases = pd.read_csv(csv_file)
    # CSV stores dates as text; convert to real datetimes before plotting.
    df_releases['release_date'] = pd.to_datetime(df_releases['release_date'])

    plot_releases_scatter(df_releases, title)

In [None]:
import pandas as pd
import os

# The level labels must match the ones passed to fetch_one() above
# ('otag-graduated', ...), since those determine the CSV file names on disk.
for level in ['otag-graduated', 'otag-incubating', 'otag-sandbox']:
    plot_releases_from_csv(f'out/cncf-{level}-github-releases.csv', f'Releases: {level}')


In [None]:
import pandas as pd
import os


# plot_releases_scatter(f'out/cncf-all-github-releases.csv')



In [None]:
releases_by_repo = releases[['repo_name', 'release_date']].groupby('repo_name').count()
releases_by_repo.to_csv('cncf_releases_by_repo.csv')