# Fetch all cncf/landscape current full info (items.json)

In [1]:
#!wget https://landscape.cncf.io/data/items.json

In [2]:
%%bash 
# array of JSON --> JSONL
jq  -c '.[]' items.json > items.jsonl

ls -lahF items.jsonl && wc -l items.jsonl

In [3]:
%%bash
# landscape | select(CNCF Projects) --> cncf-projects.jsonl
jq -c 'select(.relation == "graduated" or .relation == "incubating" or .relation == "sandbox")' items.jsonl > cncf-projects.jsonl 

ls -lahF cncf-projects.jsonl
wc -l cncf-projects.jsonl

-rw-r--r--  1 me  staff   9.7M Nov  8 03:37 items.jsonl
    2267 items.jsonl
-rw-r--r--  1 me  staff   3.9M Nov  8 03:37 cncf-projects.jsonl
     178 cncf-projects.jsonl


In [3]:
%%bash
# create list of urls
jq -c '.repo_url' cncf-projects.jsonl | tee repos.txt

ls -lahF cncf-projects.jsonl
wc -l cncf-projects.jsonl

-rw-r--r--  1 me  staff   9.7M Nov  8 03:37 items.jsonl
    2267 items.jsonl
-rw-r--r--  1 me  staff   3.9M Nov  8 03:37 cncf-projects.jsonl
     178 cncf-projects.jsonl


In [4]:
# for PAT / token
from dotenv import load_dotenv
load_dotenv()

True

## helpers

In [5]:
import pandas as pd

pd.set_option('display.max_rows', 512)
pd.set_option('display.max_columns', 512)
pd.set_option('display.width', 512)

def safe_set_index(df:         pd.DataFrame, 
                   idx_wanted: list[str]) -> pd.DataFrame:

    # check to see if the index is already set, else, data loss as set_index can be destructive
    idx_existing = list(df.index.names)

    if idx_wanted == idx_existing:
        print(f'\n*** WARNING: attempt to set index to what it already is thwarted! \n')
    else:
        df.set_index(idx_wanted, verify_integrity=True, inplace=True) # note: index must be unique!
        df.sort_index(inplace=True)
    return df

def split_org_repo(df:      pd.DataFrame, 
                   colname: str,
                   drop:    bool = False,
                   newcol_org_name:  str = 'org_name',
                   newcol_repo_name: str = 'repo_name') -> pd.DataFrame:
    '''split_org_repo(df, colname) - org_name/repo_name --> org_name, repo_named'''
    
    if colname is None:
        raise ValueError('split_org_repo: missing colname!')

    # df['tmp'] = df[colname].copy()
    # df_newcols = df['tmp'].str.split(pat='/', n=1, expand=True)

    # https://swdevnotes.com/python/2022/extract-data-from-json-in-pandas-dataframe/
    # expand=True returns a dataframe  which we can rename columns on
    
    df_newcols = df[colname].copy().str.split(pat='/', n=1, expand=True)
    df_newcols.rename(columns={0: newcol_org_name, 1: newcol_repo_name}, inplace=True)

    if drop:
        df.drop(colname, axis=1, inplace=True)

    df = pd.concat([df,df_newcols], axis=1)
    return df

## Load the Landscape 

In [6]:
import os
import pandas as pd

file_path = './cncf-projects.jsonl'

if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
    df = pd.read_json(file_path, lines=True)
else:
    print(f"File {file_path} does not exist or is empty.")

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      178 non-null    object 
 1   homepage_url              178 non-null    object 
 2   project                   178 non-null    object 
 3   repo_url                  176 non-null    object 
 4   logo                      178 non-null    object 
 5   twitter                   174 non-null    object 
 6   crunchbase                178 non-null    object 
 7   extra                     176 non-null    object 
 8   github_data               176 non-null    object 
 9   repos                     176 non-null    object 
 10  github_start_commit_data  176 non-null    object 
 11  image_data                178 non-null    object 
 12  firstCommitDate           176 non-null    object 
 13  firstCommitLink           176 non-null    object 
 14  latestComm

In [7]:
df.head(1)

Unnamed: 0,name,homepage_url,project,repo_url,logo,twitter,crunchbase,extra,github_data,repos,github_start_commit_data,image_data,firstCommitDate,firstCommitLink,latestCommitDate,latestCommitLink,releaseDate,releaseLink,commitsThisYear,contributorsCount,contributorsLink,language,stars,license,headquarters,latestTweetDate,description,organization,crunchbaseData,path,landscape,category,amountKind,amount,oss,href,bestPracticeBadgeId,bestPracticePercentage,industries,starsPresent,starsAsText,marketCapPresent,marketCapAsText,id,flatName,member,relation,isSubsidiaryProject,allow_duplicate_repo,project_org,joined,enduser,url_for_bestpractices,open_source,second_path
0,Akri,https://docs.akri.sh,sandbox,https://github.com/project-akri/akri,akri.svg,https://twitter.com/ProjectAkri,https://www.crunchbase.com/organization/cloud-...,"{'accepted': '2021-09-14', 'annual_review_date...","{'languages': [{'name': 'Rust', 'value': 10535...",[{'url': 'https://github.com/project-akri/akri...,{'start_commit_link': '/project-akri/akri/comm...,"{'fileName': 'akri.svg', 'hash': '2nK42JQaM8qF...",2020-10-14T00:42:19Z,https://github.com/project-akri/akri/commit/94...,"{'text': '2 months ago', 'value': '950', 'orig...",https://github.com/project-akri/akri/commit/98...,"{'text': '7 months ago', 'value': '900', 'orig...",https://github.com/project-akri/akri/releases,197,34.0,https://github.com/project-akri/akri/graphs/co...,Rust,996.0,Apache License 2.0,"San Francisco, California","{'text': '7 months ago', 'value': '900', 'orig...",A Kubernetes Resource Interface for the Edge,Cloud Native Computing Foundation (CNCF),{'name': 'Cloud Native Computing Foundation (C...,Provisioning / Automation & Configuration,Provisioning / Automation & Configuration,Provisioning,funding,3000000,True,logos/akri.svg,5339,99.0,"[Cloud Computing, Cloud Infrastructure, Non Pr...",True,996,True,$3M,akri,Akri,False,sandbox,False,,,,,,,


In [8]:
# df_landscape = safe_set_index(df, idx_wanted=['relation','category','name'])
# df_landscape

In [18]:
# df2=df[['B','D','F']].rename({'B':'X','D':'Y','F':'Z'}, axis=1)

df_tiny=df[['relation', 
            'category', 
            'path', 
            'name',
            'repo_url']].copy()

df_tiny['repo'] = df_tiny['repo_url'].astype('string').str.removeprefix('https://github.com/')
df_tiny = split_org_repo(df_tiny, 'repo').dropna()
df_tiny.head(20)

Unnamed: 0,relation,category,path,name,repo_url,repo,org_name,repo_name
0,sandbox,Provisioning,Provisioning / Automation & Configuration,Akri,https://github.com/project-akri/akri,project-akri/akri,project-akri,akri
1,sandbox,Provisioning,Provisioning / Automation & Configuration,CDK for Kubernetes (CDK8s),https://github.com/cdk8s-team/cdk8s,cdk8s-team/cdk8s,cdk8s-team,cdk8s
2,incubating,Provisioning,Provisioning / Automation & Configuration,Cloud Custodian,https://github.com/cloud-custodian/cloud-custo...,cloud-custodian/cloud-custodian,cloud-custodian,cloud-custodian
3,sandbox,Provisioning,Provisioning / Automation & Configuration,DevStream,https://github.com/devstream-io/devstream,devstream-io/devstream,devstream-io,devstream
4,sandbox,Provisioning,Provisioning / Automation & Configuration,KCL,https://github.com/kcl-lang/kcl,kcl-lang/kcl,kcl-lang,kcl
5,sandbox,Provisioning,Provisioning / Automation & Configuration,kpt,https://github.com/GoogleContainerTools/kpt,GoogleContainerTools/kpt,GoogleContainerTools,kpt
6,sandbox,Provisioning,Provisioning / Automation & Configuration,KubeDL,https://github.com/kubedl-io/kubedl,kubedl-io/kubedl,kubedl-io,kubedl
7,incubating,Provisioning,Provisioning / Automation & Configuration,KubeEdge,https://github.com/kubeedge/kubeedge,kubeedge/kubeedge,kubeedge,kubeedge
8,sandbox,Provisioning,Provisioning / Automation & Configuration,Meshery,https://github.com/meshery/meshery,meshery/meshery,meshery,meshery
9,sandbox,Provisioning,Provisioning / Automation & Configuration,Metal³,https://github.com/metal3-io/baremetal-operator,metal3-io/baremetal-operator,metal3-io,baremetal-operator


In [23]:
cats = df_tiny['category'].drop_duplicates().tolist()
cats


['Provisioning',
 'Runtime',
 'Orchestration & Management',
 'App Definition and Development',
 'Platform',
 'Serverless',
 'Observability and Analysis',
 'Wasm']

In [32]:
# Visual Python: Data Analysis > Data Info
display(df_tiny)

Unnamed: 0,relation,category,path,name,repo_url,repo,org_name,repo_name
0,sandbox,Provisioning,Provisioning / Automation & Configuration,Akri,https://github.com/project-akri/akri,project-akri/akri,project-akri,akri
1,sandbox,Provisioning,Provisioning / Automation & Configuration,CDK for Kubernetes (CDK8s),https://github.com/cdk8s-team/cdk8s,cdk8s-team/cdk8s,cdk8s-team,cdk8s
2,incubating,Provisioning,Provisioning / Automation & Configuration,Cloud Custodian,https://github.com/cloud-custodian/cloud-custo...,cloud-custodian/cloud-custodian,cloud-custodian,cloud-custodian
3,sandbox,Provisioning,Provisioning / Automation & Configuration,DevStream,https://github.com/devstream-io/devstream,devstream-io/devstream,devstream-io,devstream
4,sandbox,Provisioning,Provisioning / Automation & Configuration,KCL,https://github.com/kcl-lang/kcl,kcl-lang/kcl,kcl-lang,kcl
5,sandbox,Provisioning,Provisioning / Automation & Configuration,kpt,https://github.com/GoogleContainerTools/kpt,GoogleContainerTools/kpt,GoogleContainerTools,kpt
6,sandbox,Provisioning,Provisioning / Automation & Configuration,KubeDL,https://github.com/kubedl-io/kubedl,kubedl-io/kubedl,kubedl-io,kubedl
7,incubating,Provisioning,Provisioning / Automation & Configuration,KubeEdge,https://github.com/kubeedge/kubeedge,kubeedge/kubeedge,kubeedge,kubeedge
8,sandbox,Provisioning,Provisioning / Automation & Configuration,Meshery,https://github.com/meshery/meshery,meshery/meshery,meshery,meshery
9,sandbox,Provisioning,Provisioning / Automation & Configuration,Metal³,https://github.com/metal3-io/baremetal-operator,metal3-io/baremetal-operator,metal3-io,baremetal-operator


In [88]:
# Visual Python: Data Analysis > Subset
otag_repos_by_relation = df_tiny.query("`category` == 'Observability and Analysis'")[['relation','repo', 'name']].copy()
otag_repos_by_relation.groupby('relation')['repo'].agg(lambda x: list(x)).to_dict()

safe_set_index(otag_repos_by_relation, idx_wanted=['relation', 'name'])
otag_repos_by_relation.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,repo
relation,name,Unnamed: 2_level_1
graduated,Fluentd,fluent/fluentd
graduated,Jaeger,jaegertracing/jaeger
graduated,Prometheus,prometheus/prometheus
incubating,Chaos Mesh,chaos-mesh/chaos-mesh
incubating,Cortex,cortexproject/cortex
incubating,Litmus,litmuschaos/litmus
incubating,OpenMetrics,OpenObservability/OpenMetrics
incubating,OpenTelemetry,open-telemetry/opentelemetry-java
incubating,Thanos,thanos-io/thanos
sandbox,Chaosblade,chaosblade-io/chaosblade


In [96]:
graduated  = otag_repos_by_relation.loc['graduated', :]['repo'].tolist()
incubating = otag_repos_by_relation.loc['incubating', :]['repo'].tolist()
sandbox    = otag_repos_by_relation.loc['sandbox', :]['repo'].tolist()

display(graduated, incubating, sandbox)


['fluent/fluentd', 'jaegertracing/jaeger', 'prometheus/prometheus']

['chaos-mesh/chaos-mesh',
 'cortexproject/cortex',
 'litmuschaos/litmus',
 'OpenObservability/OpenMetrics',
 'open-telemetry/opentelemetry-java',
 'thanos-io/thanos']

['chaosblade-io/chaosblade',
 'foniod/foniod',
 'headlamp-k8s/headlamp',
 'inspektor-gadget/inspektor-gadget',
 'sustainable-computing-io/kepler',
 'kuberhealthy/kuberhealthy',
 'opencost/opencost',
 'pixie-io/pixie',
 'skooner-k8s/skooner',
 'trickstercache/trickster']

## Fetch project release data from GitHub API

In [99]:
import time
import os
import json
import pandas as pd
from typing import List
from datetime import datetime, timezone
from github import Github, GithubException

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

def fetch_repo_data(token: str, 
                    repo_list: List[str], 
                    since: datetime=None, 
                    json_file: str=None, 
                    csv_file: str=None, 
                    state_file: str=None) -> pd.DataFrame:

    # Initialize DataFrame
    # df = pd.DataFrame(columns=[
    #     'repo_name', 'release_name', 'release_date', 
    #     'language', 'release_notes'
    # ])

    df = pd.DataFrame(columns=['repo_name', 'release_name', 'release_date', 'language'])

    # Initialize GitHub client
    g = Github(token)

    # Initialize loop state
    if state_file:
        try:
            with open(state_file, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            state = {'i': 0, 'repos_done': []}
    else:
        state = {'i': 0, 'repos_done': []}

    # Loop over repositories
    while state['i'] < len(repo_list):
        repo_str = repo_list[state['i']]

        if repo_str in state['repos_done']:
            print(f"Skipping: {repo_str}")
            state['i'] += 1
            continue

        while True:
            try:
                repo = g.get_repo(repo_str)
                break
            except GithubException as e:
                if e.status == 404:
                    print(f"Repository {repo_str} not found")
                    break
                elif e.status == 429:
                    print(f"Rate limit exceeded, waiting for {e.headers['Retry-After']} seconds...")
                    time.sleep(int(e.headers['Retry-After']))
                else:
                    print(f"Error getting repository {repo_str}: {e}")
                    break

        if not repo:
            state['i'] += 1
            continue

        # Get all releases
        releases = repo.get_releases()
        language = repo.language

        for release in releases:
            if since is None or release.created_at >= since:
                df = pd.concat([df, pd.DataFrame({
                    'repo_name': [repo_str],
                    'release_name': [release.title],
                    'release_date': [str(release.published_at)],
                    'language': [language],
                    #'release_notes': [release.body]
                })])
                print(f"Added {release.published_at}, {repo_str}::{release.title}  ")

        # Save state
        if state_file:
            state['repos_done'].append(repo_str)
            with open(state_file, 'w') as f:
                json.dump(state, f, indent=4)

        state['i'] += 1

    #print (releases)
    
    # Save as CSV
    if csv_file:
        df.to_csv(csv_file, index=False)

    # Save as JSON
    if json_file:
        df.to_json(json_file, orient='records', lines=True)
    return df

In [101]:
otag_repos_by_relation

Unnamed: 0_level_0,Unnamed: 1_level_0,repo
relation,name,Unnamed: 2_level_1
graduated,Fluentd,fluent/fluentd
graduated,Jaeger,jaegertracing/jaeger
graduated,Prometheus,prometheus/prometheus
incubating,Chaos Mesh,chaos-mesh/chaos-mesh
incubating,Cortex,cortexproject/cortex
incubating,Litmus,litmuschaos/litmus
incubating,OpenMetrics,OpenObservability/OpenMetrics
incubating,OpenTelemetry,open-telemetry/opentelemetry-java
incubating,Thanos,thanos-io/thanos
sandbox,Chaosblade,chaosblade-io/chaosblade


In [109]:
!mkdir -p out

In [107]:
def fetch_one(token, since_date, level, repos):

    json_file=f'out/cncf-{level}-github-releases.json' 
    csv_file=f'out/cncf-{level}-github-releases.csv'
    state_file=f'out/.nukeme_state_file_{level}'
    
    print(f"Fetching {len(repos)} repositories for {level} projects")
    
    releases = fetch_repo_data( token, 
                                repos, 
                                since=since_date,
                                json_file=json_file,
                                csv_file=csv_file)

In [110]:
token = os.environ['GITHUB_TOKEN']
since_date = datetime(2022, 11, 7, tzinfo=timezone.utc)

fetch_one(token, since_date, 'otag-graduated', graduated)

Fetching 3 repositories for otag-graduated projects
Added 2023-07-14 08:27:02+00:00, fluent/fluentd::Fluentd v1.16.2  
Added 2023-04-17 07:51:15+00:00, fluent/fluentd::Fluentd v1.16.1  
Added 2023-03-29 04:04:47+00:00, fluent/fluentd::Fluentd v1.16.0  
Added 2023-11-02 23:58:27+00:00, jaegertracing/jaeger::Release v1.51.0  
Added 2023-10-06 16:26:16+00:00, jaegertracing/jaeger::Release v1.50.0  
Added 2023-09-07 14:07:29+00:00, jaegertracing/jaeger::Release v1.49.0  
Added 2023-08-15 11:39:36+00:00, jaegertracing/jaeger::Release 1.48.0  
Added 2023-07-07 01:22:45+00:00, jaegertracing/jaeger::Release 1.47.0  
Added 2023-06-05 02:47:29+00:00, jaegertracing/jaeger::Release v1.46.0  
Added 2023-05-05 21:02:00+00:00, jaegertracing/jaeger::Release v1.45.0  
Added 2023-04-10 19:33:37+00:00, jaegertracing/jaeger::Release v1.44.0  
Added 2023-03-15 18:09:45+00:00, jaegertracing/jaeger::Release 1.43.0  
Added 2023-02-05 21:06:06+00:00, jaegertracing/jaeger::Release v1.42.0  
Added 2023-01-04 06:

In [111]:
fetch_one(token, since_date, 'otag-incubating', incubating)

Fetching 6 repositories for otag-incubating projects
Added 2023-08-23 06:40:12+00:00, chaos-mesh/chaos-mesh::v2.6.2  
Added 2023-06-28 07:54:06+00:00, chaos-mesh/chaos-mesh::v2.6.1  
Added 2023-05-31 03:52:21+00:00, chaos-mesh/chaos-mesh::v2.6.0  
Added 2023-04-13 08:48:24+00:00, chaos-mesh/chaos-mesh::v2.5.2  
Added 2023-01-14 02:32:58+00:00, chaos-mesh/chaos-mesh::v2.5.1  
Added 2022-11-22 11:35:17+00:00, chaos-mesh/chaos-mesh::v2.5.0  
Added 2022-11-21 11:57:01+00:00, chaos-mesh/chaos-mesh::v2.5.0-beta.1  
Added 2022-11-21 11:36:52+00:00, chaos-mesh/chaos-mesh::v2.5.0-beta.0  
Added 2022-11-20 08:20:18+00:00, chaos-mesh/chaos-mesh::v2.4.3  
Added 2022-11-11 11:42:10+00:00, chaos-mesh/chaos-mesh::v2.5.0-alpha.1  
Added 2022-11-10 08:51:07+00:00, chaos-mesh/chaos-mesh::v2.5.0-alpha.0  
Added 2022-11-07 08:19:38+00:00, chaos-mesh/chaos-mesh::v2.4.2  
Added 2022-11-07 08:13:08+00:00, chaos-mesh/chaos-mesh::v2.3.3  
Added 2023-06-23 04:38:47+00:00, cortexproject/cortex::Cortex 1.15.3  
A

In [112]:
fetch_one(token, since_date, 'otag-sandbox', sandbox)

Fetching 10 repositories for otag-sandbox projects
Added 2023-09-15 03:25:14+00:00, chaosblade-io/chaosblade::v1.7.3  
Added 2023-05-18 14:12:20+00:00, chaosblade-io/chaosblade::v1.7.2  
Added 2022-12-15 11:30:23+00:00, chaosblade-io/chaosblade::v1.7.1  
Added 2023-11-07 23:18:14+00:00, headlamp-k8s/headlamp::0.21.0  
Added 2023-10-09 10:52:31+00:00, headlamp-k8s/headlamp::headlamp-helm-0.16.0  
Added 2023-10-05 20:30:37+00:00, headlamp-k8s/headlamp::0.20.1  
Added 2023-09-28 20:52:50+00:00, headlamp-k8s/headlamp::0.20.0  
Added 2023-08-25 16:44:48+00:00, headlamp-k8s/headlamp::headlamp-helm-0.15.0  
Added 2023-08-24 16:48:37+00:00, headlamp-k8s/headlamp::0.19.1  
Added 2023-08-03 14:00:16+00:00, headlamp-k8s/headlamp::0.19.0  
Added 2023-08-03 19:49:14+00:00, headlamp-k8s/headlamp::headlamp-helm-0.14.0  
Added 2023-07-06 15:58:44+00:00, headlamp-k8s/headlamp::headlamp-helm-0.13.0  
Added 2023-06-15 21:17:48+00:00, headlamp-k8s/headlamp::0.18.0  
Added 2023-05-29 10:42:08+00:00, headla

## Visualize Releases

In [None]:
%pip install ipympl

%matplotlib inline
%matplotlib widget

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

# def plot_releases_timeline(releases: pd.DataFrame):
#     fig = px.timeline(releases, x_start="release_date", x_end="release_date", y="repo_name", color="language", title="GitHub Releases Timeline")
#     fig.update_yaxes(autorange="reversed")
#     fig.show()

# def plot_releases_scatter_simple(releases: pd.DataFrame):
#     # Filter releases by year
#     releases_2023 = releases[releases['release_date'].dt.year == 2023]

#     # Create scatter plot
#     fig = px.scatter(releases_2023, x="release_date", y="repo_name", color="language")
#     fig.update_yaxes(autorange="reversed")
#     fig.show()

def plot_releases_scatter(releases: pd.DataFrame, title: str=None):
    # # Filter releases by year
    # releases_2023 = releases[releases['release_date'].dt.year == 2023]

    if title is None:
        title = "GitHub Releases Timeline"

    # Group releases by organization
    releases['organization'] = releases['repo_name'].apply(lambda x: x.split('/')[0])

    # Create scatter plot
    fig = px.scatter(releases, x="release_date", y="repo_name", color="organization", symbol="language", title="Project Releases")
    fig.update_yaxes(autorange="reversed")
    
    fig.update_layout(showlegend=True,
                      autosize=True,
                      width=1000)
                    #   height=2500,
                    #   )
    fig.show()

In [None]:
import pandas as pd
import os

def json_to_csv(json_file_path: str) -> None:
    """
    Load a JSON file into a pandas DataFrame and save it as a CSV file with the same name.
    """
    df = pd.read_json(json_file_path, lines=True)
    
    csv_file_path = os.path.splitext(json_file_path)[0] + '.csv'
    df.to_csv(csv_file_path, index=False)

In [None]:
for level in ['graduated', 'incubating', 'sandbox']:
    json_to_csv(f'out/cncf-{level}-github-releases.json')

In [None]:
def plot_releases_from_csv(csv_file: str, title: str) -> None:
    csv_file =f'out/cncf-{level}-github-releases.csv'
    if os.path.exists(csv_file):
        df_releases = pd.read_csv(csv_file)
        df_releases.release_date = pd.to_datetime(df_releases.release_date)
        
        plot_releases_scatter(df_releases, title)
    else:
        print(f"CSV file {csv_file} not found")

In [None]:
import pandas as pd
import os

for level in ['graduated', 'incubating', 'sandbox']:
    plot_releases_from_csv(f'out/cncf-{level}-github-releases.csv', f'Releases: {level}')


In [None]:
import pandas as pd
import os


# plot_releases_scatter(f'out/cncf-all-github-releases.csv')



In [None]:
releases_by_repo = releases[['repo_name', 'release_date']].groupby('repo_name').count()
releases_by_repo.to_csv('cncf_releases_by_repo.csv')