# Fetch all GitHub Release info for CNCF projects

In [1]:
#!wget https://landscape.cncf.io/data/items.json

In [2]:
%%bash 
# array of JSON --> JSONL
jq  -c '.[]' items.json > items.jsonl

ls -lahF items.jsonl
wc -l items.jsonl

-rw-r--r--  1 me  staff   9.7M Nov 14 11:02 items.jsonl
    2267 items.jsonl


In [3]:
%%bash
# landscape | select(CNCF Projects) --> cncf-projects.jsonl
jq -c 'select(.relation == "graduated" or .relation == "incubating" or .relation == "sandbox")' items.jsonl > cncf-projects.jsonl 

ls -lahF cncf-projects.jsonl
wc -l cncf-projects.jsonl

-rw-r--r--  1 me  staff   3.9M Nov 14 11:02 cncf-projects.jsonl
     178 cncf-projects.jsonl


## helpers

In [4]:
import pandas as pd

pd.set_option('display.max_rows', 512)
pd.set_option('display.max_columns', 512)
pd.set_option('display.width', 512)

# for PAT / token
from dotenv import load_dotenv
load_dotenv()

def safe_set_index(df:         pd.DataFrame, 
                   idx_wanted: list[str]) -> pd.DataFrame:

    # check to see if the index is already set, else, data loss as set_index can be destructive
    idx_existing = list(df.index.names)

    if idx_wanted == idx_existing:
        print(f'\n*** WARNING: attempt to set index to what it already is thwarted! \n')
    else:
        if idx_existing:
            print(f'*** existing index found: {idx_existing}, resetting to avoid data loss prior to setting to: {idx_wanted}')
            df.reset_index(inplace=True)

        # note: index must be unique!
        df.set_index(idx_wanted, verify_integrity=True, inplace=True)
    
    df.sort_index(inplace=True)
    return df

def split_org_repo(df:      pd.DataFrame, 
                   colname: str,
                   drop:    bool = False,
                   newcol_org_name:  str = 'org_name',
                   newcol_repo_name: str = 'repo_name') -> pd.DataFrame:
    '''split_org_repo(df, colname) - org_name/repo_name --> org_name, repo_named'''
    
    if colname is None:
        raise ValueError('split_org_repo: missing colname!')

    # expand=True returns df which can rename columns on # https://swdevnotes.com/python/2022/extract-data-from-json-in-pandas-dataframe
    
    df_newcols = df[colname].copy().str.split(pat='/', n=1, expand=True)
    df_newcols.rename(columns={0: newcol_org_name, 1: newcol_repo_name}, inplace=True)

    if drop:
        df.drop(colname, axis=1, inplace=True)

    df = pd.concat([df,df_newcols], axis=1)
    return df

## Load the Landscape 

In [5]:
import os

file_path = './cncf-projects.jsonl'

if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
    df = pd.read_json(file_path, lines=True)
else:
    print(f"File {file_path} does not exist or is empty.")

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      178 non-null    object 
 1   homepage_url              178 non-null    object 
 2   project                   178 non-null    object 
 3   repo_url                  176 non-null    object 
 4   logo                      178 non-null    object 
 5   twitter                   174 non-null    object 
 6   crunchbase                178 non-null    object 
 7   extra                     176 non-null    object 
 8   github_data               176 non-null    object 
 9   repos                     176 non-null    object 
 10  github_start_commit_data  176 non-null    object 
 11  image_data                178 non-null    object 
 12  firstCommitDate           176 non-null    object 
 13  firstCommitLink           176 non-null    object 
 14  latestComm

In [6]:
df.head(5)

Unnamed: 0,name,homepage_url,project,repo_url,logo,twitter,crunchbase,extra,github_data,repos,github_start_commit_data,image_data,firstCommitDate,firstCommitLink,latestCommitDate,latestCommitLink,releaseDate,releaseLink,commitsThisYear,contributorsCount,contributorsLink,language,stars,license,headquarters,latestTweetDate,description,organization,crunchbaseData,path,landscape,category,amountKind,amount,oss,href,bestPracticeBadgeId,bestPracticePercentage,industries,starsPresent,starsAsText,marketCapPresent,marketCapAsText,id,flatName,member,relation,isSubsidiaryProject,allow_duplicate_repo,project_org,joined,enduser,url_for_bestpractices,open_source,second_path
0,Akri,https://docs.akri.sh,sandbox,https://github.com/project-akri/akri,akri.svg,https://twitter.com/ProjectAkri,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2021-09-14', 'annual_review_date...","{'languages': [{'name': 'Rust', 'value': 10535...",[{'url': 'https://github.com/project-akri/akri...,{'start_commit_link': '/project-akri/akri/comm...,"{'fileName': 'akri.svg', 'hash': '2nK42JQaM8qF...",2020-10-14T00:42:19Z,https://github.com/project-akri/akri/commit/94...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/project-akri/akri/commit/98...,"{'text': '7 months ago', 'value': '900', 'orig...",https://github.com/project-akri/akri/releases,197,34.0,https://github.com/project-akri/akri/graphs/co...,Rust,996.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",A Kubernetes Resource Interface for the Edge,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/akri.svg,5339,99.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,996,True,$3M,akri,Akri,False,sandbox,False,,,,,,,
1,CDK for Kubernetes (CDK8s),https://cdk8s.io/,sandbox,https://github.com/cdk8s-team/cdk8s,cdk8s.svg,https://twitter.com/CloudNativeFdn,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2020-11-10', 'annual_review_url'...","{'languages': [{'name': 'JavaScript', 'value':...",[{'url': 'https://github.com/cdk8s-team/cdk8s'...,{'start_commit_link': '/cdk8s-team/cdk8s/commi...,"{'fileName': 'cdk-for-kubernetes-cdk8s.svg', '...",2019-07-25T12:45:22Z,https://github.com/cdk8s-team/cdk8s/commit/ec7...,"{'text': 'about a month', 'value': '960', 'ori...",https://github.com/cdk8s-team/cdk8s/commit/6b9...,2021-10-13T20:21:32Z,https://github.com/cdk8s-team/cdk8s/releases,434,78.0,https://github.com/cdk8s-team/cdk8s/graphs/con...,JavaScript,3818.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",CDK8s lets you define Kubernetes apps and comp...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/cdk-for-kubernetes-cdk8s.svg,0,,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,3818,True,$3M,cdk-for-kubernetes-cdk8s,CDK for Kubernetes (CDK8s),False,sandbox,False,,,,,,,
2,Cloud Custodian,https://cloudcustodian.io/,incubating,https://github.com/cloud-custodian/cloud-custo...,cloud-custodian.svg,https://twitter.com/CloudNativeFdn,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2020-06-25', 'dev_stats_url': 'h...","{'languages': [{'name': 'Python', 'value': 771...",[{'url': 'https://github.com/cloud-custodian/c...,{'start_commit_link': '/cloud-custodian/cloud-...,"{'fileName': 'cloud-custodian.svg', 'hash': '0...",2015-07-16T14:19:15Z,https://github.com/cloud-custodian/cloud-custo...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/cloud-custodian/cloud-custo...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/cloud-custodian/cloud-custo...,522,441.0,https://github.com/cloud-custodian/cloud-custo...,Python,4951.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...","Rules engine for cloud security, cost optimiza...",Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/cloud-custodian.svg,3402,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,4951,True,$3M,cloud-custodian,Cloud Custodian,False,incubating,False,,,,,,,
3,DevStream,https://www.devstream.io/,sandbox,https://github.com/devstream-io/devstream,devstream.svg,https://twitter.com/CloudNativeFdn,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2022-06-17', 'dev_stats_url': 'h...","{'languages': [{'name': 'Go', 'value': 31401, ...",[{'url': 'https://github.com/devstream-io/devs...,{'start_commit_link': '/devstream-io/devstream...,"{'fileName': 'dev-stream.svg', 'hash': 'R9wN8R...",2021-10-09T03:13:50Z,https://github.com/devstream-io/devstream/comm...,"{'text': '6 months ago', 'value': '910', 'orig...",https://github.com/devstream-io/devstream/comm...,"{'text': '6 months ago', 'value': '910', 'orig...",https://github.com/devstream-io/devstream/rele...,515,62.0,https://github.com/devstream-io/devstream/grap...,Go,815.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",DevStream: the open-source DevOps toolchain ma...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/dev-stream.svg,6202,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,815,True,$3M,dev-stream,DevStream,False,sandbox,False,,,,,,,
4,KCL,https://kcl-lang.io/,sandbox,https://github.com/kcl-lang/kcl,kcl.svg,https://twitter.com/kcl_language,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2023-09-20', 'dev_stats_url': 'h...","{'languages': [{'name': 'Rust', 'value': 28654...","[{'url': 'https://github.com/kcl-lang/kcl', 's...",{'start_commit_link': '/kcl-lang/kcl/commit/68...,"{'fileName': 'kcl.svg', 'hash': 'vzDONy6/3IQev...",2022-05-05T07:02:26Z,https://github.com/kcl-lang/kcl/commit/68d5103...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/kcl-lang/kcl/commit/eeff00e...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/kcl-lang/kcl/releases,329,15.0,https://github.com/kcl-lang/kcl/graphs/contrib...,Rust,808.0,Apache License 2.0,"San Francisco, California",,A constraint-based record & functional languag...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/kcl.svg,7867,99.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,808,True,$3M,kcl,KCL,False,sandbox,False,,,,,,,


In [7]:
# extract a clean org/repo
df['repo'] = df['repo_url'].astype('string').str.removeprefix('https://github.com/').dropna()
df = split_org_repo(df, 'repo')
df.head(1)

Unnamed: 0,name,homepage_url,project,repo_url,logo,twitter,crunchbase,extra,github_data,repos,github_start_commit_data,image_data,firstCommitDate,firstCommitLink,latestCommitDate,latestCommitLink,releaseDate,releaseLink,commitsThisYear,contributorsCount,contributorsLink,language,stars,license,headquarters,latestTweetDate,description,organization,crunchbaseData,path,landscape,category,amountKind,amount,oss,href,bestPracticeBadgeId,bestPracticePercentage,industries,starsPresent,starsAsText,marketCapPresent,marketCapAsText,id,flatName,member,relation,isSubsidiaryProject,allow_duplicate_repo,project_org,joined,enduser,url_for_bestpractices,open_source,second_path,repo,org_name,repo_name
0,Akri,https://docs.akri.sh,sandbox,https://github.com/project-akri/akri,akri.svg,https://twitter.com/ProjectAkri,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2021-09-14', 'annual_review_date...","{'languages': [{'name': 'Rust', 'value': 10535...",[{'url': 'https://github.com/project-akri/akri...,{'start_commit_link': '/project-akri/akri/comm...,"{'fileName': 'akri.svg', 'hash': '2nK42JQaM8qF...",2020-10-14T00:42:19Z,https://github.com/project-akri/akri/commit/94...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/project-akri/akri/commit/98...,"{'text': '7 months ago', 'value': '900', 'orig...",https://github.com/project-akri/akri/releases,197,34.0,https://github.com/project-akri/akri/graphs/co...,Rust,996.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",A Kubernetes Resource Interface for the Edge,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/akri.svg,5339,99.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,996,True,$3M,akri,Akri,False,sandbox,False,,,,,,,,project-akri/akri,project-akri,akri


In [9]:
# REINDEX
df['subpath'] = df['path']
safe_set_index(df, idx_wanted=['relation', 'category', 'path', 'name'])

*** existing index found: ['category', 'relation', 'path', 'name'], resetting to avoid data loss prior to setting to: ['relation', 'category', 'path', 'name']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,index,homepage_url,project,repo_url,logo,twitter,crunchbase,extra,github_data,repos,github_start_commit_data,image_data,firstCommitDate,firstCommitLink,latestCommitDate,latestCommitLink,releaseDate,releaseLink,commitsThisYear,contributorsCount,contributorsLink,language,stars,license,headquarters,latestTweetDate,description,organization,crunchbaseData,landscape,amountKind,amount,oss,href,bestPracticeBadgeId,bestPracticePercentage,industries,starsPresent,starsAsText,marketCapPresent,marketCapAsText,id,flatName,member,isSubsidiaryProject,allow_duplicate_repo,project_org,joined,enduser,url_for_bestpractices,open_source,second_path,repo,org_name,repo_name
relation,category,path,name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
graduated,App Definition and Development,App Definition and Development / Application Definition & Image Build,Helm,124,https://helm.sh/,graduated,https://github.com/helm/helm,helm.svg,https://twitter.com/helmpack,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2018-06-01', 'incubating': '2018...","{'languages': [{'name': 'Go', 'value': 1807584...","[{'url': 'https://github.com/helm/helm', 'star...",{'start_commit_link': '/helm/helm/commit/94db5...,"{'fileName': 'helm.svg', 'hash': 'KhA6o4K6XUSq...",2015-11-02T23:52:56Z,https://github.com/helm/helm/commit/94db53d080...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/helm/helm/commit/46265d8ee3...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/helm/helm/releases,407,766.0,https://github.com/helm/helm/graphs/contributors,Go,24953.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",The Kubernetes Package Manager,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,App Definition and Development / Application D...,funding,3000000.0,True,logos/helm.svg,3131,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,24953.0,True,$3M,helm,Helm,False,False,,,,,,,,helm/helm,helm,helm
graduated,App Definition and Development,App Definition and Development / Continuous Integration & Delivery,Argo,139,https://argoproj.github.io/,graduated,https://github.com/argoproj/argo-cd,argo.svg,https://twitter.com/argoproj,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2020-03-26', 'incubating': '2020...","{'languages': [{'name': 'Go', 'value': 5524117...",[{'url': 'https://github.com/argoproj/argo-cd'...,{'start_commit_link': '/argoproj/argo-cd/commi...,"{'fileName': 'argo.svg', 'hash': 'Xb4OvEH8Blm9...",2018-02-15T00:53:07Z,https://github.com/argoproj/argo-cd/commit/a67...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/argoproj/argo-cd/commit/03c...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/argoproj/argo-cd/releases,1338,1154.0,https://github.com/argoproj/argo-cd/graphs/con...,Go,14145.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...","Kubernetes-native tools to run workflows, mana...",Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,App Definition and Development / Continuous In...,funding,3000000.0,True,logos/argo.svg,4486,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,14145.0,True,$3M,argo,Argo,False,False,,,,,,,,argoproj/argo-cd,argoproj,argo-cd
graduated,App Definition and Development,App Definition and Development / Continuous Integration & Delivery,Flux,140,https://fluxcd.io/,graduated,https://github.com/fluxcd/flux2,flux.svg,https://twitter.com/fluxcd,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2019-07-15', 'incubating': '2021...","{'languages': [{'name': 'Go', 'value': 970748,...","[{'url': 'https://github.com/fluxcd/flux2', 's...",{'start_commit_link': '/fluxcd/flux2/commit/eb...,"{'fileName': 'flux.svg', 'hash': 'h7GovC0iLE2t...",2020-04-24T09:38:22Z,https://github.com/fluxcd/flux2/commit/ebdabaf...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/fluxcd/flux2/commit/d3eacd4...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/fluxcd/flux2/releases,689,140.0,https://github.com/fluxcd/flux2/graphs/contrib...,Go,5300.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",Open and extensible continuous delivery soluti...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,App Definition and Development / Continuous In...,funding,3000000.0,True,logos/flux.svg,4782,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,5300.0,True,$3M,flux,Flux,False,False,,,,,,,,fluxcd/flux2,fluxcd,flux2
graduated,App Definition and Development,App Definition and Development / Database,TiKV,110,https://tikv.org,graduated,https://github.com/tikv/tikv,tikv.svg,https://twitter.com/tikvproject,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2018-08-28', 'incubating': '2018...","{'languages': [{'name': 'Rust', 'value': 21095...","[{'url': 'https://github.com/tikv/tikv', 'star...",{'start_commit_link': '/tikv/tikv/commit/b56db...,"{'fileName': 'ti-kv.svg', 'hash': 'LgnxjW/bCbp...",2016-01-07T05:02:04Z,https://github.com/tikv/tikv/commit/b56dbe686b...,"{'text': 'about a month', 'value': '960', 'ori...",https://github.com/tikv/tikv/commit/384aaeb381...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/tikv/tikv/releases,690,412.0,https://github.com/tikv/tikv/graphs/contributors,Rust,13617.0,Apache License 2.0,"San Francisco, California",2022-11-17T19:02:00.000Z,A distributed transactional key-value database...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,App Definition and Development / Database,funding,3000000.0,True,logos/ti-kv.svg,2574,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,13617.0,True,$3M,ti-kv,TiKV,False,False,,,,,,,,tikv/tikv,tikv,tikv
graduated,App Definition and Development,App Definition and Development / Database,Vitess,111,https://vitess.io/,graduated,https://github.com/vitessio/vitess,vitess.svg,https://twitter.com/vitessio,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2018-02-05', 'incubating': '2018...","{'languages': [{'name': 'Go', 'value': 2680572...","[{'url': 'https://github.com/vitessio/vitess',...",{'start_commit_link': '/vitessio/vitess/commit...,"{'fileName': 'vitess.svg', 'hash': '/rYtR/SZOC...",2012-02-25T07:19:43Z,https://github.com/vitessio/vitess/commit/8d48...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/vitessio/vitess/commit/5b65...,"{'text': '3 months ago', 'value': '940', 'orig...",https://github.com/vitessio/vitess/releases,2283,379.0,https://github.com/vitessio/vitess/graphs/cont...,Go,16897.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...","MySQL-compatible, horizontally scalable, cloud...",Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,App Definition and Development / Database,funding,3000000.0,True,logos/vitess.svg,1724,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,16897.0,True,$3M,vitess,Vitess,False,False,,,,,,,,vitessio/vitess,vitessio,vitess
graduated,Observability and Analysis,Observability and Analysis / Logging,Fluentd,167,https://www.fluentd.org/,graduated,https://github.com/fluent/fluentd,fluentd.svg,https://twitter.com/fluentd,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2016-11-08', 'incubating': '2016...","{'languages': [{'name': 'Ruby', 'value': 32604...","[{'url': 'https://github.com/fluent/fluentd', ...",{'start_commit_link': '/fluent/fluentd/commit/...,"{'fileName': 'fluentd.svg', 'hash': '4rqVszCNG...",2011-06-18T22:36:35Z,https://github.com/fluent/fluentd/commit/5c14d...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/fluent/fluentd/commit/78c91...,"{'text': '4 months ago', 'value': '930', 'orig...",https://github.com/fluent/fluentd/releases,220,261.0,https://github.com/fluent/fluentd/graphs/contr...,Ruby,12199.0,Apache License 2.0,"San Francisco, California",2022-10-18T02:26:16.000Z,Fluentd: Unified Logging Layer (project under ...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Observability and Analysis / Logging,funding,3000000.0,True,logos/fluentd.svg,1189,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,12199.0,True,$3M,fluentd,Fluentd,False,False,,,,,,,,fluent/fluentd,fluent,fluentd
graduated,Observability and Analysis,Observability and Analysis / Monitoring,Prometheus,163,https://prometheus.io/,graduated,https://github.com/prometheus/prometheus,prometheus.svg,https://twitter.com/PrometheusIO,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2016-05-09', 'incubating': '2016...","{'languages': [{'name': 'Go', 'value': 5659944...",[{'url': 'https://github.com/prometheus/promet...,{'start_commit_link': '/prometheus/prometheus/...,"{'fileName': 'prometheus.svg', 'hash': 'I4EIFd...",2012-11-24T11:14:12Z,https://github.com/prometheus/prometheus/commi...,"{'text': 'about a month', 'value': '960', 'ori...",https://github.com/prometheus/prometheus/commi...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/prometheus/prometheus/releases,1135,973.0,https://github.com/prometheus/prometheus/graph...,Go,49987.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",The Prometheus monitoring system and time seri...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Observability and Analysis / Monitoring,funding,3000000.0,True,logos/prometheus.svg,486,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,49987.0,True,$3M,prometheus,Prometheus,False,False,,,,,,,,prometheus/prometheus,prometheus,prometheus
graduated,Observability and Analysis,Observability and Analysis / Tracing,Jaeger,168,https://www.jaegertracing.io/,graduated,https://github.com/jaegertracing/jaeger,jaeger.svg,https://twitter.com/JaegerTracing,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2017-09-13', 'incubating': '2017...","{'languages': [{'name': 'Go', 'value': 2890209...",[{'url': 'https://github.com/jaegertracing/jae...,{'start_commit_link': '/jaegertracing/jaeger/c...,"{'fileName': 'jaeger.svg', 'hash': 'liGBqZr114...",2016-04-18T22:22:11Z,https://github.com/jaegertracing/jaeger/commit...,"{'text': 'about a month', 'value': '960', 'ori...",https://github.com/jaegertracing/jaeger/commit...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/jaegertracing/jaeger/releases,404,293.0,https://github.com/jaegertracing/jaeger/graphs...,Go,18318.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...","CNCF Jaeger, a Distributed Tracing Platform",Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Observability and Analysis / Tracing,funding,3000000.0,True,logos/jaeger.svg,1273,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,18318.0,True,$3M,jaeger,Jaeger,False,False,,,,,,,,jaegertracing/jaeger,jaegertracing,jaeger
graduated,Orchestration & Management,Orchestration & Management / Coordination & Service Discovery,CoreDNS,92,https://coredns.io/,graduated,https://github.com/coredns/coredns,core-dns.svg,https://twitter.com/corednsio,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2017-02-27', 'incubating': '2018...","{'languages': [{'name': 'Go', 'value': 1675138...","[{'url': 'https://github.com/coredns/coredns',...",{'start_commit_link': '/coredns/coredns/commit...,"{'fileName': 'core-dns.svg', 'hash': '/wdaTejm...",2016-03-18T20:57:35Z,https://github.com/coredns/coredns/commit/3ec0...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/coredns/coredns/commit/4592...,"{'text': '3 months ago', 'value': '940', 'orig...",https://github.com/coredns/coredns/releases,341,370.0,https://github.com/coredns/coredns/graphs/cont...,Go,11071.0,Apache License 2.0,"San Francisco, California",2021-10-08T17:32:39.000Z,CoreDNS is a DNS server that chains plugins,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Orchestration & Management / Coordination & Se...,funding,3000000.0,True,logos/core-dns.svg,1250,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,11071.0,True,$3M,core-dns,CoreDNS,False,False,,,,,,,,coredns/coredns,coredns,coredns
graduated,Orchestration & Management,Orchestration & Management / Coordination & Service Discovery,etcd,93,https://etcd.io/,graduated,https://github.com/etcd-io/etcd,etcd.svg,https://twitter.com/etcdio,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2018-12-11', 'incubating': '2018...","{'languages': [{'name': 'Go', 'value': 5046207...","[{'url': 'https://github.com/etcd-io/etcd', 's...",{'start_commit_link': '/etcd-io/etcd/commit/20...,"{'fileName': 'etcd.svg', 'hash': 'Qwqz65rvhOJl...",2013-06-07T00:43:32Z,https://github.com/etcd-io/etcd/commit/20ca21a...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/etcd-io/etcd/commit/e1ebc26...,"{'text': '4 months ago', 'value': '930', 'orig...",https://github.com/etcd-io/etcd/releases,1761,968.0,https://github.com/etcd-io/etcd/graphs/contrib...,Go,44537.0,Apache License 2.0,"San Francisco, California","{'text': '10 months ago', 'value': '870', 'ori...",Distributed reliable key-value store for the m...,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Orchestration & Management / Coordination & Se...,funding,3000000.0,True,logos/etcd.svg,3192,100.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,44537.0,True,$3M,etcd,etcd,False,False,,,,,,,,etcd-io/etcd,etcd-io,etcd


In [None]:
df.shape
df.info()

In [None]:
# df2=df[['B','D','F']].rename({'B':'X','D':'Y','F':'Z'}, axis=1)

# TODO there's a lot of duplicative stuff, trim, for now too small to matter
cols_mini = ['relation',
             'org_name', 
             'repo_name', 
             'repo', 
             'name',
             'category', 
             'path', 
             'repo_url',
             'github_data',
             'extra',
             'language',
             'industries',
             'contributorsCount']

df_mini = df.reset_index()[cols_mini]
safe_set_index(df_mini, idx_wanted=['category', 'relation', 'name'])

df_mini.info()
df_mini

## Just the Observability Domain (for now)

In [None]:
# df_otag = df_mini.query("`category` == 'Observability and Analysis'").copy()

# TODO: make this for all tags
TAG = 'Observability and Analysis'

df_otag = df_mini.loc[TAG, :, :].copy()
df_otag.info()
df_otag

## Generate (OTAG) full repo list

In [None]:
orgs = df_otag.org_name.drop_duplicates().tolist()
orgs

In [None]:
# get_repos(type: Opt[str] = NotSet, sort: Opt[str] = NotSet, direction: Opt[str] = NotSet) → PaginatedList[Repository]¶
# Calls
# GET /orgs/{org}/repos

# Parameters
# type – string (‘all’, ‘public’, ‘private’, ‘forks’, ‘sources’, ‘member’)

# sort – string (‘created’, ‘updated’, ‘pushed’, ‘full_name’)

# direction – string (‘asc’, desc’)

for o in orgs:
    # get a list of all public repositories in each org.
    



## Fetch project release data from GitHub API

In [None]:
import time
import os
import json
import pandas as pd
from typing import List
from datetime import datetime, timezone
from github import Github, GithubException

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

def fetch_repo_data(token: str, 
                    repo_list: List[str], 
                    since: datetime=None, 
                    json_file: str=None, 
                    csv_file: str=None, 
                    state_file: str=None) -> pd.DataFrame:

    # Initialize DataFrame
    # df = pd.DataFrame(columns=[
    #     'repo_name', 'release_name', 'release_date', 
    #     'language', 'release_notes'
    # ])

    df = pd.DataFrame(columns=['repo_name', 'release_name', 'release_date', 'language'])

    # Initialize GitHub client
    g = Github(token)

    # Initialize loop state
    if state_file:
        try:
            with open(state_file, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            state = {'i': 0, 'repos_done': []}
    else:
        state = {'i': 0, 'repos_done': []}

    # Loop over repositories
    while state['i'] < len(repo_list):
        repo_str = repo_list[state['i']]

        if repo_str in state['repos_done']:
            print(f"Skipping: {repo_str}")
            state['i'] += 1
            continue

        while True:
            try:
                repo = g.get_repo(repo_str)
                break
            except GithubException as e:
                if e.status == 404:
                    print(f"Repository {repo_str} not found")
                    break
                elif e.status == 429:
                    print(f"Rate limit exceeded, waiting for {e.headers['Retry-After']} seconds...")
                    time.sleep(int(e.headers['Retry-After']))
                else:
                    print(f"Error getting repository {repo_str}: {e}")
                    break

        if not repo:
            state['i'] += 1
            continue

        # Get all releases
        releases = repo.get_releases()
        language = repo.language

        for release in releases:
            if since is None or release.created_at >= since:
                df = pd.concat([df, pd.DataFrame({
                    'repo_name': [repo_str],
                    'release_name': [release.title],
                    'release_date': [str(release.published_at)],
                    'language': [language],
                    #'release_notes': [release.body]
                })])
                print(f"Added {release.published_at}, {repo_str}::{release.title}  ")

        # Save state
        if state_file:
            state['repos_done'].append(repo_str)
            with open(state_file, 'w') as f:
                json.dump(state, f, indent=4)

        state['i'] += 1

    #print (releases)
    
    # Save as CSV
    if csv_file:
        df.to_csv(csv_file, index=False)

    # Save as JSON
    if json_file:
        df.to_json(json_file, orient='records', lines=True)
    return df

In [None]:
import time
import os
import json
import pandas as pd
from typing import List
from datetime import datetime, timezone
from github import Github, GithubException

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

def fetch_org_data(token: str, 
                   org_list: List[str], 
                   since: datetime=None, 
                   json_file: str=None, 
                   csv_file: str=None, 
                   state_file: str=None) -> pd.DataFrame:

    # Initialize DataFrame
    # df = pd.DataFrame(columns=[
    #     'org_name', 'release_name', 'release_date', 
    #     'language', 'release_notes'
    # ])

    df = pd.DataFrame(columns=['org_name', 'release_name', 'release_date', 'language'])

    # Initialize GitHub client
    g = Github(token)

    # Initialize loop state
    if state_file:
        try:
            with open(state_file, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            state = {'i': 0, 'orgs_done': []}
    else:
        state = {'i': 0, 'orgs_done': []}

    # Loop over orgs
    while state['i'] < len(org_list):
        org_str = org_list[state['i']]

        if org_str in state['orgs_done']:
            print(f"Skipping: {org_str}")
            state['i'] += 1
            continue

        while True:
            try:
                org = g.get_org(org_str)
                break
            except GithubException as e:
                if e.status == 404:
                    print(f"Org {org_str} not found")
                    break
                elif e.status == 429:
                    print(f"Rate limit exceeded, waiting for {e.headers['Retry-After']} seconds...")
                    time.sleep(int(e.headers['Retry-After']))
                else:
                    print(f"Error getting org {org_str}: {e}")
                    break

        if not org:
            state['i'] += 1
            continue

        # releases = org.get_releases()
        # language = org.language

        for release in releases:
            if since is None or release.created_at >= since:
                df = pd.concat([df, pd.DataFrame({
                    'org_name': [org_str],
                    'release_name': [release.title],
                    'release_date': [str(release.published_at)],
                    'language': [language],
                    #'release_notes': [release.body]
                })])
                print(f"Added {release.published_at}, {org_str}::{release.title}  ")

        # Save state
        if state_file:
            state[orgs_done'].append(org_str)
            with open(state_file, 'w') as f:
                json.dump(state, f, indent=4)

        state['i'] += 1

    #print (releases)
    
    # Save as CSV
    if csv_file:
        df.to_csv(csv_file, index=False)

    # Save as JSON
    if json_file:
        df.to_json(json_file, orient='records', lines=True)
    return df

In [None]:
otag_repos_by_relation

In [None]:
!mkdir -p out

In [None]:
def fetch_one(token, since_date, level, repos):

    json_file=f'out/cncf-{level}-github-releases.json' 
    csv_file=f'out/cncf-{level}-github-releases.csv'
    state_file=f'out/.nukeme_state_file_{level}'
    
    print(f"Fetching {len(repos)} repositories for {level} projects")
    
    releases = fetch_repo_data( token, 
                                repos, 
                                since=since_date,
                                json_file=json_file,
                                csv_file=csv_file)

In [None]:
token = os.environ['GITHUB_TOKEN']
since_date = datetime(2022, 11, 7, tzinfo=timezone.utc)

fetch_one(token, since_date, 'otag-graduated', graduated)

In [None]:
fetch_one(token, since_date, 'otag-incubating', incubating)

In [None]:
fetch_one(token, since_date, 'otag-sandbox', sandbox)

In [None]:
%pip install ipympl

%matplotlib inline
%matplotlib widget

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

# def plot_releases_timeline(releases: pd.DataFrame):
#     fig = px.timeline(releases, x_start="release_date", x_end="release_date", y="repo_name", color="language", title="GitHub Releases Timeline")
#     fig.update_yaxes(autorange="reversed")
#     fig.show()

# def plot_releases_scatter_simple(releases: pd.DataFrame):
#     # Filter releases by year
#     releases_2023 = releases[releases['release_date'].dt.year == 2023]

#     # Create scatter plot
#     fig = px.scatter(releases_2023, x="release_date", y="repo_name", color="language")
#     fig.update_yaxes(autorange="reversed")
#     fig.show()

def plot_releases_scatter(releases: pd.DataFrame, title: str=None):
    # # Filter releases by year
    # releases_2023 = releases[releases['release_date'].dt.year == 2023]

    if title is None:
        title = "GitHub Releases Timeline"

    # Group releases by organization
    releases['organization'] = releases['repo_name'].apply(lambda x: x.split('/')[0])

    # Create scatter plot
    fig = px.scatter(releases, x="release_date", y="repo_name", color="organization", symbol="language", title="Project Releases")
    fig.update_yaxes(autorange="reversed")
    
    fig.update_layout(showlegend=True,
                      autosize=True,
                      width=1000)
                    #   height=2500,
                    #   )
    fig.show()

In [None]:
import pandas as pd
import os

def json_to_csv(json_file_path: str) -> None:
    """
    Load a JSON file into a pandas DataFrame and save it as a CSV file with the same name.
    """
    df = pd.read_json(json_file_path, lines=True)
    
    csv_file_path = os.path.splitext(json_file_path)[0] + '.csv'
    df.to_csv(csv_file_path, index=False)

In [None]:
for level in ['graduated', 'incubating', 'sandbox']:
    json_to_csv(f'out/cncf-{level}-github-releases.json')

In [None]:
def plot_releases_from_csv(csv_file: str, title: str) -> None:
    csv_file =f'out/cncf-{level}-github-releases.csv'
    if os.path.exists(csv_file):
        df_releases = pd.read_csv(csv_file)
        df_releases.release_date = pd.to_datetime(df_releases.release_date)
        
        plot_releases_scatter(df_releases, title)
    else:
        print(f"CSV file {csv_file} not found")

In [None]:
import pandas as pd
import os

for level in ['graduated', 'incubating', 'sandbox']:
    plot_releases_from_csv(f'out/cncf-{level}-github-releases.csv', f'Releases: {level}')


In [None]:
import pandas as pd
import os


# plot_releases_scatter(f'out/cncf-all-github-releases.csv')



In [None]:
releases_by_repo = releases[['repo_name', 'release_date']].groupby('repo_name').count()
releases_by_repo.to_csv('cncf_releases_by_repo.csv')