# CNCF Landscape: clean data

## Setup and Diagnostics

### Base imports and variables

In [37]:
import os
import sys

import pandas as pd

pd.set_option('display.max_rows', 512)
pd.set_option('display.max_columns', 512)
pd.set_option('display.width', 512)

In [38]:
# all generated output files land here
OUT_DIR='generated'

# TODO: factor out landscape ('cncf') so this can be used for landscape(s) generically (https://landscapes.dev) 
 
CNCF_LANDSCAPE_FNAME_BASE='cncf-landscape'
CNCF_LANDSCAPE_FNAME_ROOT=f'{OUT_DIR}/{CNCF_LANDSCAPE_FNAME_BASE}'

CNCF_PROJECTS_FNAME_BASE=f'cncf-projects'
CNCF_PROJECTS_FNAME_ROOT=f'{OUT_DIR}/{CNCF_PROJECTS_FNAME_BASE}'

print(f'Jupyter Kernel (venv): {sys.executable}')
print(f'Output Location:       {OUT_DIR}  (.json, .jsonl, .csv, .md, .svg, .png, ...)')
print(f'Output Landscape root: {CNCF_LANDSCAPE_FNAME_ROOT}')
print(f'Output Projects  root: {CNCF_PROJECTS_FNAME_ROOT}')

Jupyter Kernel (venv): /Users/matt/gh/cncf/landscape-graph/.venv-ipynb/bin/python
Output Location:       generated  (.json, .jsonl, .csv, .md, .svg, .png, ...)
Output Landscape root: generated/cncf-landscape
Output Projects  root: generated/cncf-projects


### DataFrame helpers: safe_set_index(), split_org_repo()

In [39]:
def safe_set_index(df:         pd.DataFrame, 
                   idx_wanted: list[str],
                   sort:       bool = True,
                   inplace:    bool = True) -> pd.DataFrame:
    '''check to see if the index is already set, else, data loss as set_index can be destructive'''
    
    idx_existing = list(df.index.names)

    if idx_wanted == idx_existing:
        print(f'\n*** WARNING: attempt to set index to what it already is thwarted! \n')
    else:
        df.set_index(idx_wanted, verify_integrity=True)
        print(f'\t Index changed from {idx_existing} --> {list(df.index.names)}') 

    if sort:
        df.sort_index(inplace=inplace)

    return df

def split_org_repo(df:      pd.DataFrame, 
                   colname: str,
                   drop:    bool = False,
                   newcol_org_name:  str = 'org_name',
                   newcol_repo_name: str = 'repo_name') -> pd.DataFrame:
    '''split_org_repo(df, colname) - org_name/repo_name --> org_name, repo_name'''
    
    if colname is None:
        raise ValueError('split_org_repo: missing colname!')

    # https://swdevnotes.com/python/2022/extract-data-from-json-in-pandas-dataframe/
    # expand=True returns a dataframe  which we can rename columns on
    
    df_newcols = df[colname].copy().str.split(pat='/', n=1, expand=True)
    df_newcols.rename(columns={0: newcol_org_name, 1: newcol_repo_name}, inplace=True)

    if drop:
        df.drop(colname, axis=1, inplace=True)

    df = pd.concat([df,df_newcols], axis=1)
    return df

#### DataFrame Helper: clean_dataframe()

In [40]:

import pandas as pd
import altair as alt

def clean_dataframe(df: pd.DataFrame, categorical_threshold: float = 0.05) -> pd.DataFrame:
    # Infer better data types for object columns
    df = df.infer_objects()

    categorical_cols = []
    # date_cols = []
    dict_cols = []
    list_cols = []
    
    for col in df.columns:
        #print(f'processing col: {col}...')
        if df[col].dtype in ['int64', 'float64']:
            df[col] = pd.to_numeric(df[col], downcast='integer' if df[col].dtype == 'int64' else 'float')
        elif df[col].dtype == 'object':
            if isinstance(df[col].iloc[0], dict):
                dict_cols.append(col)
                continue
            elif isinstance(df[col].iloc[0], list):
                list_cols.append(col)
                continue
            else:

                # TODO: handle date parsing
                # try:
                #     df[col] = pd.to_datetime(df[col])
                #     date_cols.append(col)
                # except ValueError:
                #     pass

                # try for categorical
                if all(isinstance(i, (int, float, str)) for i in df[col]): 
                    num_unique_values = df[col].nunique()
                    num_total_values = len(df[col])
                    if num_unique_values / num_total_values < categorical_threshold:
                        df[col] = df[col].astype('category')
                        categorical_cols.append(col)


    # Print summary of findings
    print(f'Columns({len(df.columns)}) Summary:')
    print(f'Categorical : {categorical_cols}')
    # print(f'Date        : {date_cols}')
    print(f'Dictionary  : {dict_cols}')
    print(f'List        : {list_cols}')

    # # Create a visual summary using Altair
    # for col in categorical_cols:
    #     chart = alt.Chart(df).mark_bar().encode(
    #         x=alt.X(col, type='nominal'),
    #         y='count()',
    #     )
    #     chart.display()

    return df

#### DataFrame Helper: compare_dataframe()

In [41]:
def compare_dataframes(df1: pd.DataFrame, df2: pd.DataFrame):
    # Calculate memory usage
    memory_df1 = df1.memory_usage(deep=True, index=False)
    memory_df2 = df2.memory_usage(deep=True, index=False)

    # Calculate data types
    dtype_df1 = df1.dtypes.astype(str)
    dtype_df2 = df2.dtypes.astype(str)

    # Create a new dataframe for comparison
    comparison = pd.DataFrame({
        'Original Dtype': dtype_df1,
        'Cleaned Dtype': dtype_df2,
        'Original Memory': memory_df1,
        'Cleaned Memory': memory_df2
    })

    # Calculate memory reduction
    comparison['Memory Reduction'] = comparison['Original Memory'] - comparison['Cleaned Memory']
    comparison['Memory Reduction Ratio'] = comparison['Memory Reduction'] / comparison['Original Memory']

    total_reduction = comparison['Memory Reduction'].sum()

    print(f"\nTotal memory usage of original dataframe: {memory_df1.sum()}")
    print(f"Total memory usage of cleaned dataframe: {memory_df2.sum()}")
    print(f"Total memory reduction: {total_reduction}")

    comparison_melted = comparison[['Cleaned Memory', 'Memory Reduction']].reset_index().melt(id_vars='index')

    num_bars = comparison_melted['index'].nunique()
    bar_height = 20
    chart_height = num_bars * bar_height
    
    chart = alt.Chart(comparison_melted).mark_bar().encode(
        x=alt.X('value:Q', title='Memory (bytes)'),
        y=alt.Y('index:N', title='Column'),
        color='variable:N',
        order=alt.Order(
        'variable:N',
        sort='ascending'
        ),
        tooltip=['index:N', 'variable:N', 'value:Q']

    ).properties(
        width=800,
        height=chart_height,
        title='Original Memory Footprint := Cleaned Footprint + Reduction'
    ).interactive()

    chart.display()
    display(comparison)
    #display(comparison_melted)

    

## Load and Clean: cncf-projects.jsonl

### Load .jsonl --> df_projects

In [42]:
# %pdb on

file_path = f'{CNCF_PROJECTS_FNAME_ROOT}.jsonl'

assert os.path.exists(file_path) and os.path.getsize(file_path) > 0, f"File {file_path} does not exist or is empty."

df_projects = pd.read_json(file_path, lines=True)
df_projects = df_projects.reset_index(drop=True)


In [43]:
# for comparison purposes
df_projects_precleaned = df_projects.copy()

### cols += { subcategory, repo, org_name, repo_name }

In [44]:
# pull out subcategory from path (category / subcategory)
df_projects['subcategory'] = df_projects['path'].str.split('/').str[-1]

# https://github.com/theOrg/theRepo --> repo := theOrg/theRepo
df_projects['repo'] = df_projects['repo_url'].astype('string').str.removeprefix('https://github.com/')

# theOrg/theRepo --> org_name := theOrg, repo_name := theRepo
df_projects = split_org_repo(df_projects, 'repo')

In [45]:
# replace NaN --> -1
df_projects.stars = df_projects.stars.fillna(-1).astype('int64')
df_projects.contributorsCount = df_projects.contributorsCount.fillna(-1).astype('int64')
df_projects.enduser = df_projects.enduser.fillna(-1).astype('int64')

# float64 --> int64
df_projects.stars = df_projects.stars.astype('int64')
df_projects.contributorsCount = df_projects.contributorsCount.astype('int64')
df_projects.enduser = df_projects.enduser.astype('int64')

# int64 --> bool
df_projects.open_source = df_projects.open_source.astype('bool')

In [46]:
df_projects = clean_dataframe(df_projects)

Columns(59) Summary:
Categorical : ['project', 'crunchbase', 'license', 'headquarters', 'organization', 'category', 'amountKind', 'amount', 'marketCapAsText', 'member', 'relation', 'project_org', 'joined', 'url_for_bestpractices']
Dictionary  : ['extra', 'github_data', 'github_start_commit_data', 'image_data', 'latestCommitDate', 'releaseDate', 'latestTweetDate', 'crunchbaseData']
List        : ['repos', 'industries']


In [47]:
compare_dataframes(df_projects_precleaned, df_projects)


Total memory usage of original dataframe: 899467
Total memory usage of cleaned dataframe: 794360
Total memory reduction: 156339.0


Unnamed: 0,Original Dtype,Cleaned Dtype,Original Memory,Cleaned Memory,Memory Reduction,Memory Reduction Ratio
allow_duplicate_repo,float64,float32,1424.0,712,712.0,0.5
amount,object,category,6504.0,382,6122.0,0.941267
amountKind,object,category,11264.0,350,10914.0,0.968928
bestPracticeBadgeId,int64,int16,1424.0,356,1068.0,0.75
bestPracticePercentage,float64,float32,1424.0,712,712.0,0.5
category,object,category,13559.0,1057,12502.0,0.922044
commitsThisYear,int64,int16,1424.0,356,1068.0,0.75
contributorsCount,float64,int16,1424.0,356,1068.0,0.75
contributorsLink,object,object,20311.0,20311,0.0,0.0
crunchbase,object,category,23057.0,917,22140.0,0.960229


## Create smaller dataframe to work with

In [48]:
# NOTE: this overwrite's a global df, we should probably change this to not be destructive.

# df2=df[['B','D','F']].rename({'B':'X','D':'Y','F':'Z'}, axis=1)

# TODO: Remove
df_projects_full = df_projects.copy()

df_projects.reset_index(inplace=True, drop=True)
df_projects = df_projects[[
                    'relation',
                    'category',
                    'subcategory',
                    'id',
                    'name',
                    'flatName',
                    'repo',
                    'repos',
                    'repo_name',
                    'org_name',
                    'contributorsCount',
                    'commitsThisYear',
                    'stars',
                    'github_data',
                    'extra',
                    'industries',
                    'headquarters',
                    'open_source',
                    'image_data']].copy()


print(f"Total memory size of df_projects_full: {df_projects_full.memory_usage().sum()} bytes")
print(f"Total memory size of df_projects: {df_projects.memory_usage().sum()} bytes")


Total memory size of df_projects_full: 56544 bytes
Total memory size of df_projects: 20064 bytes


### diff by using "merge with indicator." 

The _merge column will be "left_only" for rows that are in df but not in df_dropna.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html#pandas-dataframe-merge

---

> _... \<snip/\> ..._
> 
> **how{‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}, default ‘inner’**
> Type of merge to be performed.
> 
> * left: use only keys from left frame, similar to a SQL left outer join; preserve key order.
> * right: use only keys from right frame, similar to a SQL right outer join; preserve key order.
> * outer: use union of keys from both frames, similar to a SQL full outer join; sort keys lexicographically.
> * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.
> * cross: creates the cartesian product from both frames, preserves the order of the left keys.
> 
> _... \<snip/\> ..._
> 
>  **indicator: _bool_ or _str_, default False**
>  
> If True, adds a column to the output DataFrame called “_merge” with information on the source of each row. The column can be given a different name by providing a string argument. 
> 
> The column will have a Categorical type with the value of 
>   * “left_only” for observations whose merge key only appears in the left DataFrame
>   * “right_only” for observations whose merge key only appears in the right DataFrame
>   * “both” if the observation’s merge key is found in both DataFrames.
---

### Drop rows with embedded NaNs

In [49]:
print(f'before nulls removed: {df_projects.shape}')
df_projects_dropna = df_projects.dropna().copy()
print(f'after nulls removed: {df_projects_dropna.shape}')

difference = df_projects[['name']].merge(df_projects_dropna[['name']], 
                                         how='outer', 
                                         indicator=True).loc[lambda x : x['_merge']=='left_only']

print(f'*** {difference.shape[0]} rows with nulls excluded from analysis ***')
display(df_projects.iloc[difference.index])

# prune / trim
df_projects = df_projects_dropna

before nulls removed: (178, 19)
after nulls removed: (174, 19)
*** 4 rows with nulls excluded from analysis ***


Unnamed: 0,relation,category,subcategory,id,name,flatName,repo,repos,repo_name,org_name,contributorsCount,commitsThisYear,stars,github_data,extra,industries,headquarters,open_source,image_data
148,sandbox,Serverless,Tools,serverless-devs-serverless,Serverless Devs (Serverless),Serverless Devs (Serverless),,,,,-1,0,-1,,"{'accepted': '2022-09-14', 'clomonitor_name': ...","[Cloud Computing, Cloud Infrastructure, Non Pr...","San Francisco, California",True,"{'fileName': 'serverless-devs-serverless.svg',..."
153,sandbox,Serverless,Installable Platform,virtual-kubelet-serverless,Virtual Kubelet (Serverless),Virtual Kubelet (Serverless),,,,,-1,0,-1,,"{'accepted': '2018-12-04', 'dev_stats_url': 'h...","[Cloud Computing, Cloud Infrastructure, Non Pr...","San Francisco, California",True,"{'fileName': 'virtual-kubelet-serverless.svg',..."
176,sandbox,Wasm,Embedded Functions,kubewarden-wasm,Kubewarden (Wasm),Kubewarden (Wasm),kubewarden/kubewarden-controller,[{'url': 'https://github.com/kubewarden/kubewa...,kubewarden-controller,kubewarden,15,198,146,"{'languages': [{'name': 'Go', 'value': 268474,...",,"[Cloud Computing, Cloud Infrastructure, Non Pr...","San Francisco, California",True,"{'fileName': 'kubewarden-wasm.svg', 'hash': 'J..."
177,sandbox,Wasm,Embedded Functions,open-function-wasm,OpenFunction (Wasm),OpenFunction (Wasm),OpenFunction/OpenFunction,[{'url': 'https://github.com/OpenFunction/Open...,OpenFunction,OpenFunction,30,125,1266,"{'languages': [{'name': 'Go', 'value': 466650,...",,"[Cloud Computing, Cloud Infrastructure, Non Pr...","San Francisco, California",True,"{'fileName': 'open-function-wasm.svg', 'hash':..."


In [50]:
# df_projects

### Generate Files: categories.txt, subcategories.txt, org_names.txt

In [51]:
def list_to_file(itemlist: list, fname: str, title: "Unknown List") -> None:
    '''write list to file'''

    print(f'{title}: {itemlist} ({fname})\n')
    
    with open(fname, "w") as outfile:
        outfile.write('\n'.join(str(item) for item in itemlist))

In [52]:
categories    = df_projects['category'].drop_duplicates().tolist()
subcategories = df_projects['subcategory'].drop_duplicates().tolist()
org_names     = df_projects['org_name'].drop_duplicates().tolist()

list_to_file(categories,    f'{OUT_DIR}/categories.txt',    'CATEGORIES')
list_to_file(subcategories, f'{OUT_DIR}/subcategories.txt', 'SUBCATEGORIES')
list_to_file(org_names,     f'{OUT_DIR}/org_names.txt',     'ORG_NAMES')

!ls -lh generated/*.txt
!wc -l generated/*.txt

CATEGORIES: ['Provisioning', 'Runtime', 'Orchestration & Management', 'App Definition and Development', 'Platform', 'Serverless', 'Observability and Analysis', 'Wasm'] (generated/categories.txt)

SUBCATEGORIES: [' Automation & Configuration', ' Container Registry', ' Security & Compliance', ' Key Management', ' Cloud Native Storage', ' Container Runtime', ' Cloud Native Network', ' Scheduling & Orchestration', ' Coordination & Service Discovery', ' Remote Procedure Call', ' Service Proxy', ' API Gateway', ' Service Mesh', ' Database', ' Streaming & Messaging', ' Application Definition & Image Build', ' Continuous Integration & Delivery', ' Certified Kubernetes - Distribution', ' Certified Kubernetes - Installer', ' Framework', ' Installable Platform', ' Monitoring', ' Feature Flagging', ' Logging', ' Tracing', ' Chaos Engineering', ' Continuous Optimization', ' Runtimes'] (generated/subcategories.txt)

ORG_NAMES: ['project-akri', 'cdk8s-team', 'cloud-custodian', 'devstream-io', 'kcl-la

In [53]:
# write to feather files (so we don't lose dtypes)
df_projects.to_feather(f'{CNCF_PROJECTS_FNAME_ROOT}.feather')
!ls -lh {CNCF_PROJECTS_FNAME_ROOT}.*

-rw-r--r--  1 matt  staff   419K Nov 28 20:57 generated/cncf-projects.feather
-rw-r--r--  1 matt  staff   4.0M Nov 28 20:55 generated/cncf-projects.jsonl
