# CNCF Landscape: {Category, Subcategory, Project } --> Stars, Commits, Contributors, Activity

## Setup and Diagnostics

On JupyterLab Shell Integration(s) and SList

More Info: _http://safaribooksonline.com/blog/2014/02/12/using-shell-commands-effectively-ipython_

---

> SList instances can be used like a regular list, but they provide several methods that are useful when working with shell output. The main properties available in an SList instance are:
> 
> * `.s` returns the elements joined together by spaces. 
>   * _This is useful for building command lines that take many arguments in a single invocation._
> * `.n` returns the elements joined together by a newline. 
>   * _Use this when you need the original output unmodified._
> * `.p` returns the elements as path objects, if they are filenames.
>   * _Use this when doing more advanced path manipulation_
> 
> In addition, SList instances support `grep()` and `fields()` methods. 

---

In [None]:
!pip list

In [None]:
%load_ext jupyter_ai_magics

In [None]:
%ai list

In [None]:
%ai help

### Base imports and variables

In [None]:
import os
import sys

import pandas as pd

pd.set_option('display.max_rows', 512)
pd.set_option('display.max_columns', 512)
pd.set_option('display.width', 512)

from dotenv import load_dotenv
load_dotenv()

In [None]:
# Directory that receives every file this notebook generates.
OUT_DIR = 'generated'

# TODO: factor out landscape ('cncf') so this can be used for landscape(s) generically (https://landscapes.dev)

# Base names (no directory, no extension) of the two data sets.
CNCF_LANDSCAPE_FNAME_BASE = 'cncf-landscape'
CNCF_PROJECTS_FNAME_BASE = 'cncf-projects'

# "Root" = output directory + base name; later cells append the extension.
CNCF_LANDSCAPE_FNAME_ROOT = '/'.join((OUT_DIR, CNCF_LANDSCAPE_FNAME_BASE))
CNCF_PROJECTS_FNAME_ROOT = '/'.join((OUT_DIR, CNCF_PROJECTS_FNAME_BASE))

# Echo the effective configuration so a run documents where its output went.
print('\n'.join([
    f'Jupyter Kernel (venv): {sys.executable}',
    f'Output Location:       {OUT_DIR}  (.json, .jsonl, .csv, .md, .svg, .png, ...)',
    f'Output Landscape root: {CNCF_LANDSCAPE_FNAME_ROOT}',
    f'Output Projects  root: {CNCF_PROJECTS_FNAME_ROOT}',
]))

### Create human friendly JSON (.json) and data friendly JSON Lines (.jsonl) from current landscape

In [None]:
!mkdir -p {OUT_DIR}

!wget -O {CNCF_LANDSCAPE_FNAME_ROOT}.json.compact https://landscape.cncf.io/data/items.json
!ls -lh {CNCF_LANDSCAPE_FNAME_ROOT}.json.compact

In [None]:
# create human friendly file
!jq . {CNCF_LANDSCAPE_FNAME_ROOT}.json.compact > {CNCF_LANDSCAPE_FNAME_ROOT}.json
!ls -lh {CNCF_LANDSCAPE_FNAME_ROOT}.json*
!echo "\n*Yes* indeed, that's 2+ MB of whitespace!\n"

In [None]:
# array of JSON --> JSONL
!jq  -c '.[]'  {CNCF_LANDSCAPE_FNAME_ROOT}.json.compact >  {CNCF_LANDSCAPE_FNAME_ROOT}.jsonl
!ls -lh {CNCF_LANDSCAPE_FNAME_ROOT}.jsonl
!wc -l  {CNCF_LANDSCAPE_FNAME_ROOT}.jsonl

### Filter Landscape: ~2200+ cards (cncf-landscape.jsonl) -->  ~180 CNCF Projects (cncf-projects.jsonl) 

In [None]:
!ls -lahF {CNCF_LANDSCAPE_FNAME_ROOT}.jsonl
!wc -l    {CNCF_LANDSCAPE_FNAME_ROOT}.jsonl
!echo ""

!set -x && jq -c 'select(.relation == "graduated" or .relation == "incubating" or .relation == "sandbox")' {CNCF_LANDSCAPE_FNAME_ROOT}.jsonl > {CNCF_PROJECTS_FNAME_ROOT}.jsonl 

!echo ""
!ls -lahF {CNCF_PROJECTS_FNAME_ROOT}.jsonl
!wc -l {CNCF_PROJECTS_FNAME_ROOT}.jsonl

### DataFrame helpers: safe_set_index(), split_org_repo()

In [None]:
def safe_set_index(df:         pd.DataFrame, 
                   idx_wanted: list[str],
                   sort:       bool = True,
                   inplace:    bool = True) -> pd.DataFrame:
    '''Set ``idx_wanted`` as the index of ``df`` unless it already is the index.

    Calling ``set_index`` a second time with the same keys fails because the
    key columns have already been moved into the index, so the call is skipped
    (with a warning) when the index names already match.

    Args:
        df:         frame to (re)index.
        idx_wanted: column name(s) that should form the index; a single string
                    is treated as a one-element list.
        sort:       sort by the index afterwards.
        inplace:    when True (default) ``df`` itself is modified and returned;
                    when False ``df`` is left untouched and a new frame is
                    returned (``df`` itself if nothing had to change).

    Returns:
        The indexed (and optionally sorted) frame.

    Raises:
        ValueError: from ``set_index(verify_integrity=True)`` when the wanted
                    index would contain duplicates.
    '''

    wanted = [idx_wanted] if isinstance(idx_wanted, str) else list(idx_wanted)
    idx_existing = list(df.index.names)

    if wanted == idx_existing:
        print(f'\n*** WARNING: attempt to set index to what it already is thwarted! \n')
    else:
        if inplace:
            df.set_index(wanted, verify_integrity=True, inplace=True)
        else:
            # set_index(inplace=False) returns the new frame; keep it instead of dropping it
            df = df.set_index(wanted, verify_integrity=True)
        print(f'\t Index changed from {idx_existing} --> {list(df.index.names)}') 

    if sort:
        if inplace:
            df.sort_index(inplace=True)
        else:
            df = df.sort_index()

    return df

def split_org_repo(df:      pd.DataFrame, 
                   colname: str,
                   drop:    bool = False,
                   newcol_org_name:  str = 'org_name',
                   newcol_repo_name: str = 'repo_name') -> pd.DataFrame:
    '''split_org_repo(df, colname) - org_name/repo_name --> org_name, repo_name

    Splits the string column ``colname`` on its first '/' and returns a new
    frame with the two parts appended as ``newcol_org_name`` and
    ``newcol_repo_name``.  Values without a '/' get a missing repo part.

    Args:
        df:               source frame; it is not modified.
        colname:          name of the "org/repo" column.
        drop:             when True the result omits ``colname``.
        newcol_org_name:  name for the part before the first '/'.
        newcol_repo_name: name for the part after the first '/'.

    Returns:
        A new DataFrame: ``df`` (minus ``colname`` if ``drop``) plus the two
        new columns.

    Raises:
        ValueError: if ``colname`` is None.
    '''
    
    if colname is None:
        raise ValueError('split_org_repo: missing colname!')

    # https://swdevnotes.com/python/2022/extract-data-from-json-in-pandas-dataframe/
    # expand=True returns a dataframe whose columns are the split positions (0, 1)
    parts = df[colname].str.split(pat='/', n=1, expand=True)

    # when no value contains '/', only column 0 comes back; always provide both
    parts = parts.reindex(columns=[0, 1])
    parts.columns = [newcol_org_name, newcol_repo_name]

    # drop on a copy so the caller's frame is left untouched
    base = df.drop(columns=colname) if drop else df

    return pd.concat([base, parts], axis=1)

### DataFrame Helper: clean_dataframe(df, categorical_threshold)

## Load and Clean: cncf-projects.jsonl

### Load .jsonl --> df_projects

In [None]:
# %pdb on
file_path = f'{CNCF_PROJECTS_FNAME_ROOT}.jsonl'

# Fail with a real exception rather than `assert`, which is skipped under `python -O`.
if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
    raise FileNotFoundError(f"File {file_path} does not exist or is empty.")

# One JSON object per line --> one row per landscape card.
df_projects = pd.read_json(file_path, lines=True)

# info() prints its report itself and returns None, so wrapping it in print() adds a stray "None".
df_projects.info()

### cols += { subcategory, repo, org_name, repo_name }

In [None]:
# 'path' holds "category / subcategory"; the last '/'-separated piece is the subcategory
df_projects['subcategory'] = df_projects['path'].str.rsplit('/', n=1).str[-1]

# https://github.com/theOrg/theRepo --> repo := theOrg/theRepo
github_prefix = 'https://github.com/'
repo_urls = df_projects['repo_url'].astype('string')
df_projects['repo'] = repo_urls.str.removeprefix(github_prefix)

# theOrg/theRepo --> org_name := theOrg, repo_name := theRepo
df_projects = split_org_repo(df_projects, colname='repo')

In [None]:
# NaN --> -1 sentinel, then float64 --> int64.
# The cast is done once per column; item assignment is used so a column is
# always what gets written (attribute assignment can create a plain attribute).
for count_col in ('stars', 'contributorsCount', 'enduser'):
    df_projects[count_col] = df_projects[count_col].fillna(-1).astype('int64')

# int64 --> bool
# NOTE(review): if open_source can contain NaN, astype('bool') would presumably
# turn those into True -- confirm against the data.
df_projects['open_source'] = df_projects['open_source'].astype('bool')

In [None]:
df_projects.info()

## Create smaller dataframe to work with

In [None]:
# NOTE: this overwrites the global df_projects; we should probably change this to not be destructive.

# df2=df[['B','D','F']].rename({'B':'X','D':'Y','F':'Z'}, axis=1)

# TODO: Remove
# Untouched copy of the full frame, taken before it is narrowed below.
df_projects_full = df_projects.copy()

# Columns kept for the rest of the notebook, in display order.
projects_cols = [
    'relation', 'category', 'subcategory',
    'id', 'name', 'flatName',
    'repo', 'repos', 'repo_name', 'org_name',
    'contributorsCount', 'commitsThisYear', 'stars',
    'github_data', 'extra', 'industries', 'headquarters', 'image_data',
]

# reset_index() restores a plain positional index; the 'index' column it adds
# is not in projects_cols and therefore falls away in the selection.
df_projects = df_projects.reset_index().loc[:, projects_cols].copy()

df_projects.head()


### diff by using "merge with indicator." 

The `_merge` column will be "left_only" for rows that are in `df_projects` but not in `df_projects_dropna`.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html#pandas-dataframe-merge

---

> _... \<snip/\> ..._
> 
> **how{‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}, default ‘inner’**
> Type of merge to be performed.
> 
> * left: use only keys from left frame, similar to a SQL left outer join; preserve key order.
> * right: use only keys from right frame, similar to a SQL right outer join; preserve key order.
> * outer: use union of keys from both frames, similar to a SQL full outer join; sort keys lexicographically.
> * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.
> * cross: creates the cartesian product from both frames, preserves the order of the left keys.
> 
> _... \<snip/\> ..._
> 
>  **indicator: _bool_ or _str_, default False**
>  
> If True, adds a column to the output DataFrame called “_merge” with information on the source of each row. The column can be given a different name by providing a string argument. 
> 
> The column will have a Categorical type with the value of 
>   * “left_only” for observations whose merge key only appears in the left DataFrame
>   * “right_only” for observations whose merge key only appears in the right DataFrame
>   * “both” if the observation’s merge key is found in both DataFrames.
---

### Drop rows with embedded NaNs

In [None]:
print(f'before nulls removed: {df_projects.shape}')
df_projects_dropna = df_projects.dropna().copy()
print(f'after nulls removed: {df_projects_dropna.shape}')

# "merge with indicator" diff, keyed on the original row label.
# The merged frame gets its own fresh index, so the label is carried along as a
# column ('row_label') and used for the lookup below instead of the merge
# result's positions, which do not line up with df_projects.
left_rows = df_projects[['name']].rename_axis('row_label').reset_index()
right_rows = df_projects_dropna[['name']].rename_axis('row_label').reset_index()

difference = left_rows.merge(right_rows,
                             how='left',
                             on=['row_label', 'name'],
                             indicator=True).loc[lambda x : x['_merge']=='left_only']

print(f'*** {difference.shape[0]} rows with nulls excluded from analysis ***')
display(df_projects.loc[difference['row_label']])

# prune / trim
df_projects = df_projects_dropna

In [None]:
# df_projects

### Categorical, numeric width, NaN

In [None]:
# for comparison purposes
df_precleaned = df_projects.copy()

#### DataFrame Helpers: clean_dataframe(), compare_dataframes()

In [None]:

import pandas as pd
import altair as alt

def clean_dataframe(df: pd.DataFrame, categorical_threshold: float = 0.05) -> pd.DataFrame:
    '''Return a copy of ``df`` with narrower numeric dtypes and categorical columns.

    * int64 / float64 columns are downcast with ``pd.to_numeric``.
    * object columns whose first non-null value is a dict or a list are only
      reported, not converted.
    * remaining object / string columns become ``category`` when
      ``unique values / rows`` is below ``categorical_threshold``.

    A summary of the categorical, dict and list columns is printed.

    Args:
        df: frame to clean; the work is done on the frame returned by
            ``df.infer_objects()``.
        categorical_threshold: maximum unique-to-total ratio (exclusive) for a
            column to be converted to ``category``.

    Returns:
        The cleaned DataFrame.
    '''
    # Infer better data types for object columns
    df = df.infer_objects()

    categorical_cols = []
    dict_cols = []
    list_cols = []

    num_total_values = len(df)

    for col in df.columns:
        print(f'processing col: {col}...')
        series = df[col]
        dtype = series.dtype

        if dtype in ['int64', 'float64']:
            df[col] = pd.to_numeric(series, downcast='integer' if dtype == 'int64' else 'float')
            continue

        # text may be stored as object or as a pandas string dtype
        is_string_dtype = isinstance(dtype, pd.StringDtype)
        if not (dtype == 'object' or is_string_dtype):
            continue

        # classify by the first non-null value so an empty column or a leading
        # null neither raises nor hides a dict/list column
        non_null = series.dropna()
        first_value = non_null.iloc[0] if len(non_null) else None

        if isinstance(first_value, dict):
            dict_cols.append(col)
            continue
        if isinstance(first_value, list):
            list_cols.append(col)
            continue

        # TODO: handle date parsing (pd.to_datetime) before trying categorical

        if num_total_values == 0:
            # no rows: the unique/total ratio is undefined
            continue

        # try for categorical: only columns made purely of scalars qualify
        if is_string_dtype or all(isinstance(i, (int, float, str)) for i in series):
            num_unique_values = series.nunique()
            if num_unique_values / num_total_values < categorical_threshold:
                df[col] = series.astype('category')
                categorical_cols.append(col)

    # Print summary of findings
    print(f'Columns({len(df.columns)}) Summary:')
    print(f'Categorical : {categorical_cols}')
    print(f'Dictionary  : {dict_cols}')
    print(f'List        : {list_cols}')

    return df

In [None]:
def compare_dataframes(df1: pd.DataFrame, df2: pd.DataFrame):
    '''Print and chart dtype and memory differences between two frames.

    ``df1`` is treated as the original and ``df2`` as the cleaned frame.  A
    per-column table (dtype and deep memory usage of both, plus the
    reduction) and the totals are printed, then an Altair bar chart of the
    memory figures is displayed.

    Args:
        df1: original frame.
        df2: cleaned frame.
    '''
    # Calculate memory usage
    memory_df1 = df1.memory_usage(deep=True)
    memory_df2 = df2.memory_usage(deep=True)

    # Calculate data types
    dtype_df1 = df1.dtypes.astype(str)
    dtype_df2 = df2.dtypes.astype(str)

    # Create a new dataframe for comparison (rows are aligned on the labels of the four series)
    comparison = pd.DataFrame({
        'Original Dtype': dtype_df1,
        'Cleaned Dtype': dtype_df2,
        'Original Memory': memory_df1,
        'Cleaned Memory': memory_df2
    })

    # Calculate memory reduction
    comparison['Memory Reduction'] = comparison['Original Memory'] - comparison['Cleaned Memory']

    # Print the comparison dataframe
    print(comparison)

    # Print total memory usage and reduction
    total_reduction = comparison['Memory Reduction'].sum()
    print(f"\nTotal memory usage of original dataframe: {memory_df1.sum()}")
    print(f"Total memory usage of cleaned dataframe: {memory_df2.sum()}")
    print(f"Total memory reduction: {total_reduction}")

    # Chart only the numeric memory figures: melting every column would put the
    # dtype strings into the quantitative 'Value' field.
    memory_columns = ['Original Memory', 'Cleaned Memory', 'Memory Reduction']
    chart_data = comparison.reset_index().melt('index',
                                               value_vars=memory_columns,
                                               var_name='Category',
                                               value_name='Value')
    chart = alt.Chart(chart_data).mark_bar().encode(
        x='index:N',
        y='Value:Q',
        color='Category:N',
        tooltip=['index:N', 'Value:Q', 'Category:N']
    ).interactive()

    # Display the chart
    chart.display()

In [None]:
df_projects_cleaned = clean_dataframe(df_projects)

In [None]:
compare_dataframes(df_projects, df_projects_cleaned)

In [None]:
df_projects = df_projects_cleaned

### Generate Files: categories.txt, subcategories.txt, org_names.txt

In [None]:
def list_to_file(itemlist: list, fname: str, title: str = "Unknown List") -> None:
    '''Write ``itemlist`` to ``fname``, one item per line, and echo it.

    Args:
        itemlist: items to write; each is converted with ``str()``.
        fname:    path of the file to create or overwrite.
        title:    label used in the printed echo.

    The lines are joined with a newline; no trailing newline is written.
    '''

    print(f'{title}: {itemlist} ({fname})\n')
    
    # explicit encoding so the output does not depend on the platform default
    with open(fname, "w", encoding="utf-8") as outfile:
        outfile.write('\n'.join(str(item) for item in itemlist))

In [None]:
# Distinct values of each grouping column, in order of first appearance.
categories, subcategories, org_names = (
    df_projects[column].drop_duplicates().tolist()
    for column in ('category', 'subcategory', 'org_name')
)

# One text file per list: (items, file stem, title echoed by list_to_file).
for items, stem, title in (
    (categories,    'categories',    'CATEGORIES'),
    (subcategories, 'subcategories', 'SUBCATEGORIES'),
    (org_names,     'org_names',     'ORG_NAMES'),
):
    list_to_file(items, f'{OUT_DIR}/{stem}.txt', title)

# *** INSERT GHARCHIVE DATASET HERE

## Numeric Aggregations

In [None]:
safe_set_index(df_projects, ['relation', 'category', 'subcategory', 'id'])

# 'number' selects every numeric width.  clean_dataframe() downcasts with
# pd.to_numeric, so a fixed list of int64/int32/float64/float32 would miss
# columns that ended up as int8 or int16.
df_numeric = df_projects.select_dtypes(include='number')
df_numeric

In [None]:
# Sum the numeric columns per index level:
#   level 0 = relation, level 1 = category, level 2 = subcategory
# (swap .sum() for .mean(), .count(), etc. as needed)
df_level0_relation, df_level1_category, df_level2_subcategory = (
    df_numeric.groupby(level=depth).sum() for depth in range(3)
)

# Aggregation at level 3
# df_level3 = df.groupby(level=3).sum()  # or .mean(), .count(), etc.

In [None]:
# Dump each aggregation under a heading that names its variable.
for label, frame in (
    ('df_level0_relation', df_level0_relation),
    ('df_level1_category', df_level1_category),
    ('df_level2_subcategory', df_level2_subcategory),
):
    print(f'Aggregations at {label}\n\n{frame}\n')

In [None]:
# print(f'df.index.names: {df.index.names}\n\n')
# #print(f'df.index.levels: {df.index.levels}\n\n')

# for level in df.index.levels:
#     print(f'level: {level}\n')

## Generate Sunbursts and Treemaps for { Contributors Count, Commits this year, Stars }

### Plotly Imports & Helpers

In [None]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

# https://plotly.com/python/pandas-backend
pd.options.plotting.backend = "plotly"

In [None]:
# Light to Dark Transition:
# color_discrete_map={'sandbox': '#ADD8E6', 'incubating': '#87CEEB', 'graduated': '#000080'}
#
# sandbox: Light Blue (#ADD8E6)
# incubating: Medium Blue (#87CEEB)
# graduated: Dark Blue (#000080)

# Warm to Cool Transition:
# color_discrete_map={'sandbox': '#FFA500', 'incubating': '#FFD700', 'graduated': '#008000'}
#
# sandbox: Orange (#FFA500)
# incubating: Yellow (#FFD700)
# graduated: Green (#008000)

# Warm to Cool 2 Transition:
# color_discrete_map={'sandbox': '#FFD700', 'incubating': '#87CEEB', 'graduated': '#008000'}
#
# sandbox: Yellow (#FFD700)
# incubating: Medium Blue (#87CEEB)
# graduated: Green (#008000)

# Landscape categories, in the order every palette below lists its colors.
_LANDSCAPE_CATEGORIES = (
    "App Definition and Development",
    "Observability and Analysis",
    "Orchestration & Management",
    "Platform",
    "Provisioning",
    "Runtime",
    "Serverless",
)


def _palette(*hex_colors: str) -> dict:
    '''Pair the landscape categories with ``hex_colors`` (same order).'''
    return dict(zip(_LANDSCAPE_CATEGORIES, hex_colors))


# category --> color maps, selectable through the color_discrete_map argument
color_discrete_map_pastel = _palette(
    "#a2cffe", "#8efac1", "#fc9d9a", "#c0eb75", "#f2a2e8", "#fffe7a", "#d3d3d3")

color_discrete_map1 = _palette(
    "#264653", "#2a9d8f", "#e9c46a", "#f4a261", "#e76f51", "#6d6875", "#fca311")

color_discrete_map2 = _palette(
    "#003f5c", "#58508d", "#bc5090", "#ff6361", "#ffa600", "#2f4b7c", "#665191")

color_discrete_map3 = _palette(
    "#165aa7", "#cb495c", "#bb60d5", "#f47915", "#06ab54", "#002070", "#b27d12")

color_discrete_map4 = _palette(
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2")

In [None]:
def create_figure(plotly_func,
                  df,
                  values=None,
                  height: int = 1200,
                  width: int = 1200,
                  title: str = 'Missing Title',
                  path=['category', 'subcategory', 'id'],  # TODO: add 4th level: repo_name
                  color='category',
                  color_discrete_map=color_discrete_map2,
                  branchvalues: str = None) -> go.Figure:
    '''Build a hierarchical figure by calling ``plotly_func`` with shared defaults.

    Args:
        plotly_func:        callable that accepts the keyword arguments below
                            (the notebook passes px.sunburst / px.treemap).
        df:                 data passed as ``data_frame``.
        values:             column that sizes the sectors.
        height, width:      figure size.
        title:              figure title.
        path:               hierarchy columns, outermost first.
        color:              column that selects the color.
        color_discrete_map: mapping from ``color`` value to color.
        branchvalues:       forwarded when given; defaults to ``'total'``.

    Returns:
        Whatever ``plotly_func`` returns.
    '''
    
    fig = plotly_func(data_frame=df,
                      values=values,
                      height=height,
                      width=width,
                      title=title,
                      # pass a copy so the shared default list can never be altered by the callee
                      path=list(path),
                      color=color,
                      color_discrete_map=color_discrete_map,
                      # honour the caller's choice instead of always forcing 'total'
                      branchvalues='total' if branchvalues is None else branchvalues)
    return fig


def create_sunburst(df, **kwargs) -> go.Figure:
    '''Call ``create_figure`` with ``px.sunburst``, forwarding ``df`` and ``**kwargs`` unchanged.'''
    return create_figure(px.sunburst, df, **kwargs)


def create_treemap(df, **kwargs) -> go.Figure:
    '''Call ``create_figure`` with ``px.treemap``, forwarding ``df`` and ``**kwargs`` unchanged.'''
    return create_figure(px.treemap, df, **kwargs)

### Create Figures (Sunbursts, Treemaps) w/ plotly 

WARNING WARNING WARNING: Until the full repo sets are included, these are **HIGHLY INACCURATE**!

Presently they **ONLY** contain contributions for the singular repo listed in the landscape, instead of the full set of repos.  

For example, open-telemetry is not just one single repo (the Java Agent), nor is Kubernetes simply https://github.com/kubernetes/kubernetes

In [None]:
# make index columns accessible for charting as normal columns
df_reset = df_projects.reset_index()

figs = {}

In [None]:

# key prefix --> (title prefix, builder)
chart_builders = {
    'sunb': ('sunburst', create_sunburst),
    'tree': ('treemap', create_treemap),
}

# metric column --> caption shown after the title prefix
metric_captions = {
    'contributorsCount': '🪴 Contributor Count (NOT UNIQUE ACROSS PROJECTS!) 🪴',
    'commitsThisYear':   '📄 Commits This Year 📄',
    'stars':             '⭐ Stars ⭐',
}

# Build all six figures first; figs is only touched once every one of them exists.
new_figs = {
    f'{prefix}_{metric}': build(df_reset, values=metric, title=f'{label}: {caption}')
    for prefix, (label, build) in chart_builders.items()
    for metric, caption in metric_captions.items()
}

# Individual names for each figure.
sunb_contributorsCount = new_figs['sunb_contributorsCount']
sunb_commitsThisYear   = new_figs['sunb_commitsThisYear']
sunb_stars             = new_figs['sunb_stars']
tree_contributorsCount = new_figs['tree_contributorsCount']
tree_commitsThisYear   = new_figs['tree_commitsThisYear']
tree_stars             = new_figs['tree_stars']

figs.update(new_figs)

# Export every figure as SVG and print the markdown that embeds it.
for key, fig in figs.items():
    file_name = f'{OUT_DIR}/fig_{key}.svg'
    fig.write_image(file_name, format='svg')

    # Emit raw markdown for image description
    print(f"```![Image description]({file_name})```")

### Display Sunbursts

In [None]:
figs['sunb_contributorsCount'].show()

In [None]:
figs['sunb_stars'].show()

In [None]:
figs['sunb_commitsThisYear'].show()

### Display Treemaps

In [None]:
figs['tree_contributorsCount'].show()

In [None]:
figs['tree_stars'].show()

In [None]:
figs['tree_commitsThisYear'].show()

### images for github rendering

![Image description](generated/fig_sunb_contributorsCount.svg)
![Image description](generated/fig_sunb_commitsThisYear.svg)
![Image description](generated/fig_sunb_stars.svg)
![Image description](generated/fig_tree_contributorsCount.svg)
![Image description](generated/fig_tree_commitsThisYear.svg)
![Image description](generated/fig_tree_stars.svg)

## Generate Per TAG views.

In [None]:
# NOTE(review): no earlier cell in this notebook defines `df`; df_projects is
# the frame built above, so it is inspected here -- confirm this is the intent.
# info() prints its own report and returns None, so it is not wrapped in print().
df_projects.info()
print(f'index.names: {df_projects.index.names}')
df_projects.head()

In [None]:
# NOTE(review): no earlier cell in this notebook defines `df`; it is derived
# here from df_projects -- confirm this is the intended source.
# reset_index() turns the current index levels back into columns so they can be
# used as keys for the new, category-first index.
df = df_projects.reset_index()
safe_set_index(df, ['category', 'subcategory', 'relation', 'id'])

df.info()
print(f'index.names: {df.index.names}')

In [None]:
# debug - just Observability TAG Projects
#repos_by_relation = df.query("`category` == 'Observability and Analysis'")[['relation','repo', 'name']].copy()

# Independent frame with the index levels back as ordinary columns.
repos_by_relation = df.reset_index()

# relation --> list of repos; evaluated only, the mapping is not stored
repos_by_relation.groupby('relation')['repo'].agg(list).to_dict()

# Re-key by (relation, name); the last expression shows the sorted frame.
safe_set_index(repos_by_relation, idx_wanted=['relation', 'name'])
repos_by_relation.sort_index()

In [None]:
# 'repo' values keyed by (relation, name); selecting a relation yields that level's repos.
repo_column = repos_by_relation['repo']

graduated_single_repos = repo_column.loc['graduated'].tolist()
incubating_single_repos = repo_column.loc['incubating'].tolist()
sandbox_single_repos = repo_column.loc['sandbox'].tolist()

display(graduated_single_repos, incubating_single_repos, sandbox_single_repos)

## Fetch project release data from GitHub API

In [None]:
import time
import os
import json
import pandas as pd
from typing import List
from datetime import datetime, timezone
from github import Github, GithubException

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

def _retry_after_seconds(headers, default: int = 60) -> int:
    '''Return the Retry-After value from ``headers`` in seconds, or ``default`` if absent/invalid.'''
    for key, value in (headers or {}).items():
        if str(key).lower() == 'retry-after':
            try:
                return int(value)
            except (TypeError, ValueError):
                break
    return default


def fetch_repo_data(token: str, 
                    repo_list: List[str], 
                    since: datetime=None, 
                    json_file: str=None, 
                    csv_file: str=None, 
                    state_file: str=None) -> pd.DataFrame:
    '''Collect the releases of every repository in ``repo_list`` into a DataFrame.

    Args:
        token:      token passed to ``Github(token)``.
        repo_list:  repository names in "org/repo" form.
        since:      when given, only releases whose ``created_at`` is
                    ``>= since`` are kept.
        json_file:  if set, the result is also written there as JSON Lines.
        csv_file:   if set, the result is also written there as CSV.
        state_file: if set, progress ({'i': position, 'repos_done': [...]}) is
                    loaded from and saved to this JSON file so that
                    repositories already processed are skipped.

    Returns:
        DataFrame with the columns repo_name, release_name, release_date
        (``published_at`` as a string) and language.

    NOTE(review): only releases fetched during this call end up in the result
    and the output files; releases of repositories skipped through
    ``state_file`` are not re-added.
    NOTE(review): the filter uses ``created_at`` while ``published_at`` is what
    gets recorded -- confirm that is intended.
    '''

    columns = ['repo_name', 'release_name', 'release_date', 'language']

    # Rows are gathered in a list and turned into a frame once at the end;
    # concatenating a frame per release re-copies all earlier rows every time.
    rows = []

    # Initialize GitHub client
    g = Github(token)

    # Initialize loop state
    state = {'i': 0, 'repos_done': []}
    if state_file:
        try:
            with open(state_file, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            pass  # first run: keep the fresh state

    # Loop over repositories
    while state['i'] < len(repo_list):
        repo_str = repo_list[state['i']]

        if repo_str in state['repos_done']:
            print(f"Skipping: {repo_str}")
            state['i'] += 1
            continue

        # Reset for every repository: after a failed lookup the name would
        # otherwise still hold the previous repository (or be unbound).
        repo = None
        while True:
            try:
                repo = g.get_repo(repo_str)
                break
            except GithubException as e:
                if e.status == 404:
                    print(f"Repository {repo_str} not found")
                    break
                elif e.status == 429:
                    # the header may be missing; fall back to a fixed wait instead of raising
                    wait_seconds = _retry_after_seconds(e.headers)
                    print(f"Rate limit exceeded, waiting for {wait_seconds} seconds...")
                    time.sleep(wait_seconds)
                else:
                    print(f"Error getting repository {repo_str}: {e}")
                    break

        if repo is None:
            state['i'] += 1
            continue

        #
        # Get all releases
        #
        releases = repo.get_releases()
        language = repo.language

        for release in releases:
            if since is None or release.created_at >= since:
                rows.append({
                    'repo_name': repo_str,
                    'release_name': release.title,
                    'release_date': str(release.published_at),
                    'language': language,
                    #'release_notes': release.body
                })
                print(f"Added {release.published_at}, {repo_str}::{release.title}  ")

        # Save state
        if state_file:
            state['repos_done'].append(repo_str)
            with open(state_file, 'w') as f:
                json.dump(state, f, indent=4)

        state['i'] += 1

    df = pd.DataFrame(rows, columns=columns)

    # Save as CSV
    if csv_file:
        df.to_csv(csv_file, index=False)

    # Save as JSON
    if json_file:
        df.to_json(json_file, orient='records', lines=True)
    return df

In [None]:
!mkdir -p out

In [None]:
def fetch_one(token, since_date, level, repos) -> pd.DataFrame:
    '''Fetch the releases of ``repos`` and store them under out/ for ``level``.

    Writes ``out/{level}-github-releases.json`` and
    ``out/{level}-github-releases.csv`` through ``fetch_repo_data``.

    Args:
        token:      GitHub token handed to ``fetch_repo_data``.
        since_date: lower bound handed on as ``since``.
        level:      label used in the file names (e.g. 'cncf-graduated').
        repos:      list of "org/repo" names.

    Returns:
        The DataFrame produced by ``fetch_repo_data``, so callers can keep
        working with the releases instead of re-reading the files.

    NOTE(review): no state file is passed to ``fetch_repo_data``, so runs are
    never resumed; the unused ``state_file`` local was removed -- confirm that
    resuming is not wanted here.
    '''

    json_file=f'out/{level}-github-releases.json' 
    csv_file=f'out/{level}-github-releases.csv'
    
    print(f"Fetching {len(repos)} repositories for {level} projects")
    
    releases = fetch_repo_data( token, 
                                repos, 
                                since=since_date,
                                json_file=json_file,
                                csv_file=csv_file)
    return releases

In [None]:
token = os.environ['GITHUB_TOKEN']
since_date = datetime(2022, 11, 7, tzinfo=timezone.utc)

In [None]:
fetch_one(token, since_date, 'cncf-graduated', graduated_single_repos)

In [None]:
fetch_one(token, since_date, 'cncf-incubating', incubating_single_repos)

In [None]:
fetch_one(token, since_date, 'cncf-sandbox', sandbox_single_repos)

## Visualize Releases

In [None]:
%pip install ipympl

%matplotlib inline
%matplotlib widget

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

# def plot_releases_timeline(releases: pd.DataFrame):
#     fig = px.timeline(releases, x_start="release_date", x_end="release_date", y="repo_name", color="language", title="GitHub Releases Timeline")
#     fig.update_yaxes(autorange="reversed")
#     fig.show()

# def plot_releases_scatter_simple(releases: pd.DataFrame):
#     # Filter releases by year
#     releases_2023 = releases[releases['release_date'].dt.year == 2023]

#     # Create scatter plot
#     fig = px.scatter(releases_2023, x="release_date", y="repo_name", color="language")
#     fig.update_yaxes(autorange="reversed")
#     fig.show()

def plot_releases_scatter(releases: pd.DataFrame, title: str=None):
    '''Show a scatter plot of releases over time, one row per repository.

    Points are colored by organization (the part of ``repo_name`` before the
    first '/') and use the ``language`` column for their symbol.

    Args:
        releases: frame with the columns release_date, repo_name and language;
                  it is not modified.
        title:    figure title; defaults to "GitHub Releases Timeline".
    '''
    if title is None:
        title = "GitHub Releases Timeline"

    # Group releases by organization.  assign() works on a new frame, so the
    # caller's DataFrame does not gain an 'organization' column, and the .str
    # accessor leaves missing repo names missing instead of raising.
    plot_data = releases.assign(
        organization=releases['repo_name'].str.split('/', n=1).str[0])

    # Create scatter plot, using the requested title
    fig = px.scatter(plot_data, x="release_date", y="repo_name", color="organization", symbol="language", title=title)
    fig.update_yaxes(autorange="reversed")
    
    fig.update_layout(showlegend=True,
                      autosize=True,
                      width=1000)
    fig.show()

In [None]:
import pandas as pd
import os

def json_to_csv(json_file_path: str) -> None:
    """
    Convert a JSON Lines file to CSV next to it.

    The file is read with one JSON object per line and written, without the
    index column, to the same path with its extension replaced by '.csv'.
    """
    stem, _extension = os.path.splitext(json_file_path)
    target_path = stem + '.csv'

    records = pd.read_json(json_file_path, lines=True)
    records.to_csv(target_path, index=False)

In [None]:
for level in ['graduated', 'incubating', 'sandbox']:
    json_to_csv(f'out/cncf-{level}-github-releases.json')

In [None]:
def plot_releases_from_csv(csv_file: str, title: str) -> None:
    '''Plot the releases stored in ``csv_file`` with ``plot_releases_scatter``.

    Args:
        csv_file: path of a releases CSV that has a release_date column.
        title:    title handed on to ``plot_releases_scatter``.

    Prints a message and does nothing else when ``csv_file`` does not exist.
    '''
    # Use the path that was passed in; it must not be rebuilt from a global loop variable.
    if os.path.exists(csv_file):
        df_releases = pd.read_csv(csv_file)
        df_releases.release_date = pd.to_datetime(df_releases.release_date)
        
        plot_releases_scatter(df_releases, title)
    else:
        print(f"CSV file {csv_file} not found")

In [None]:
import pandas as pd
import os

for level in ['graduated', 'incubating', 'sandbox']:
    plot_releases_from_csv(f'out/cncf-{level}-github-releases.csv', f'Releases: {level}')


In [None]:
import pandas as pd
import os


# plot_releases_scatter(f'out/cncf-all-github-releases.csv')



In [None]:
# `releases` exists only as a local inside fetch_one(), so it is rebuilt here
# from the per-level CSV files written by the fetch cells.
# NOTE(review): assumes the combined graduated/incubating/sandbox releases are
# what should be counted -- confirm.
release_csv_files = [f'out/cncf-{level}-github-releases.csv'
                     for level in ('graduated', 'incubating', 'sandbox')]
release_frames = [pd.read_csv(path) for path in release_csv_files if os.path.exists(path)]

if not release_frames:
    raise FileNotFoundError(f'no release CSV files found: {release_csv_files}')

releases = pd.concat(release_frames, ignore_index=True)

# number of recorded releases per repository
releases_by_repo = releases[['repo_name', 'release_date']].groupby('repo_name').count()
releases_by_repo.to_csv('cncf_releases_by_repo.csv')