In [1]:
import json
import networkx as nx
import pandas as pd

# Settings

In [2]:
# Where the dependency table exported from BigQuery is read from (CSV) and
# where the resulting network graph is written to (JSON).
LOCAL_CSV_PATH = '../graph/unweighted_graph.csv'
LOCAL_JSON_PATH = '../graph/unweighted_graph.json'

# Seed repositories, written as `owner/name` slugs and grouped into three
# lists. The slugs are matched against `namespace/name` in the SBOM query, so
# they must use the same spelling/casing as that dataset.
CONSENSUS = [
    'prysmaticlabs/prysm',
    'sigp/lighthouse',
    'consensys/teku',
    'status-im/nimbus-eth2',
    'chainsafe/lodestar',
    'grandinetech/grandine'
]
EXECUTION = [
    'ethereum/go-ethereum',
    'nethermindeth/nethermind',
    'hyperledger/besu',
    'erigontech/erigon',
    'paradigmxyz/reth'
]
OTHER = [
    'ethereum/solidity',
    'ethereum/remix-project',
    'vyperlang/vyper',
    'ethereum/web3.py',
    'ethereum/py-evm',
    'eth-infinitism/account-abstraction',
    'safe-global/safe-smart-account',
    'a16z/helios',
    'web3/web3.js', # prev. 'ethereum/web3.js',
    'ethereumjs/ethereumjs-monorepo'    
]
# Every seed repo, in the order consensus -> execution -> other.
SEED_REPOS = CONSENSUS + EXECUTION + OTHER
# Package sources kept by the SBOM query; also the groups iterated over when
# listing the most popular packages.
PACKAGE_SERVERS = ['NPM', 'RUST', 'GO', 'PIP']

# Get SBOMs for repos we care about

- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Run the next cell to print the query, then paste it into your [console](https://console.cloud.google.com/bigquery) and run it to get a fresh copy of the graph
- Save the query results as a CSV file to `../graph/unweighted_graph.csv`

In [3]:
def stringify_array(arr):
    """Render strings as the body of a SQL ``IN (...)`` list.

    Every item is wrapped in single quotes and the quoted items are joined
    with commas (no spaces), e.g. ``['NPM', 'GO']`` becomes ``'NPM','GO'``.
    Backslashes and single quotes inside an item are backslash-escaped so a
    value containing a quote cannot end its string literal early.

    Args:
        arr: Iterable of strings to quote.

    Returns:
        One string holding the quoted, comma-separated items. An empty
        iterable yields ``''`` (a single empty string literal).
    """
    # Escape the backslash first; otherwise the backslash added in front of a
    # quote would itself be doubled by the second replacement.
    escaped = (item.replace('\\', '\\\\').replace("'", "\\'") for item in arr)
    return "'" + "','".join(escaped) + "'"

# Build the BigQuery SQL that lists, for every seed repo, the packages found in
# its SBOM together with the repo that owns each package. Rows are kept only
# when the package source is one of PACKAGE_SERVERS, the owning namespace is
# not null, and `namespace/name` of the seed repo is one of SEED_REPOS.
# The query is not executed here: it is printed so it can be pasted into the
# BigQuery console.
query = f"""
  -- COPY THIS INTO YOUR BIGQUERY CONSOLE
  
  select distinct
    sboms.from_artifact_namespace as seed_repo_owner,
    sboms.from_artifact_name as seed_repo_name,
    sboms.to_package_artifact_name as package_name,
    package_owners.package_owner_artifact_namespace as package_repo_owner,
    package_owners.package_owner_artifact_name as package_repo_name,
    sboms.to_package_artifact_source as package_source
  from `oso_production.sboms_v0` sboms
  join `oso_production.package_owners_v0` package_owners
    on
      sboms.to_package_artifact_name = package_owners.package_artifact_name
      and sboms.to_package_artifact_source = package_owners.package_artifact_source
  where
    sboms.to_package_artifact_source in ({stringify_array(PACKAGE_SERVERS)})
    and package_owners.package_owner_artifact_namespace is not null
    and concat(sboms.from_artifact_namespace, '/', sboms.from_artifact_name)
      in ({stringify_array(SEED_REPOS)})
"""
print(query)



  -- COPY THIS INTO YOUR BIGQUERY CONSOLE
  
  select distinct
    sboms.from_artifact_namespace as seed_repo_owner,
    sboms.from_artifact_name as seed_repo_name,
    sboms.to_package_artifact_name as package_name,
    package_owners.package_owner_artifact_namespace as package_repo_owner,
    package_owners.package_owner_artifact_name as package_repo_name,
    sboms.to_package_artifact_source as package_source
  from `oso_production.sboms_v0` sboms
  join `oso_production.package_owners_v0` package_owners
    on
      sboms.to_package_artifact_name = package_owners.package_artifact_name
      and sboms.to_package_artifact_source = package_owners.package_artifact_source
  where
    sboms.to_package_artifact_source in ('NPM','RUST','GO','PIP')
    and package_owners.package_owner_artifact_namespace is not null
    and concat(sboms.from_artifact_namespace, '/', sboms.from_artifact_name)
      in ('prysmaticlabs/prysm','sigp/lighthouse','consensys/teku','status-im/nimbus-eth2','chainsafe

# Load the graph from the CSV

In [4]:
# Load the dependency rows exported from BigQuery.
df = pd.read_csv(LOCAL_CSV_PATH)

gh = 'https://github.com/'
# Derive a GitHub URL for both ends of every dependency row. The URLs are
# built with whole-column string concatenation instead of a row-wise `apply`,
# which avoids one Python-level call per row. `.map(str)` formats each value
# the same way the f-string interpolation did.
df['seed_repo_url'] = gh + df['seed_repo_owner'].map(str) + '/' + df['seed_repo_name'].map(str)
df['package_repo_url'] = gh + df['package_repo_owner'].map(str) + '/' + df['package_repo_name'].map(str)

df.tail()

Unnamed: 0,seed_repo_owner,seed_repo_name,package_name,package_repo_owner,package_repo_name,package_source,seed_repo_url,package_repo_url
13531,ethereumjs,ethereumjs-monorepo,@msgpackr-extract/msgpackr-extract-darwin-x64,kriszyp,msgpackr-extract,NPM,https://github.com/ethereumjs/ethereumjs-monorepo,https://github.com/kriszyp/msgpackr-extract
13532,ethereumjs,ethereumjs-monorepo,@types/aria-query,definitelytyped,definitelytyped,NPM,https://github.com/ethereumjs/ethereumjs-monorepo,https://github.com/definitelytyped/definitelyt...
13533,ethereumjs,ethereumjs-monorepo,trim-lines,wooorm,trim-lines,NPM,https://github.com/ethereumjs/ethereumjs-monorepo,https://github.com/wooorm/trim-lines
13534,ethereumjs,ethereumjs-monorepo,moment,moment,moment,NPM,https://github.com/ethereumjs/ethereumjs-monorepo,https://github.com/moment/moment
13535,ethereumjs,ethereumjs-monorepo,socks,joshglazebrook,socks,NPM,https://github.com/ethereumjs/ethereumjs-monorepo,https://github.com/joshglazebrook/socks


In [8]:
# Number of distinct package names seen for each package source.
(
    df
    .groupby('package_source')['package_name']
    .nunique()
)

package_source
GO       386
NPM     4661
PIP      126
RUST    1037
Name: package_name, dtype: int64

In [5]:
# Fraction of the ranked repo list that is printed for each package source.
# NPM gets a smaller fraction than the other sources.
top_fraction_by_source = {'NPM': 0.025}
default_top_fraction = 0.1

for pkg in PACKAGE_SERVERS:
    print(f"\n### Most Popular {pkg} Packages ###")
    owner = df['package_repo_owner']
    # pandas loads blank CSV fields as NaN rather than '', so a plain `!= ''`
    # comparison never excludes a missing owner; test for both.
    has_owner = owner.notna() & (owner != '')
    # Rank owning repos by how many dependency rows point at them.
    # NOTE(review): the tally is keyed on `package_repo_name` alone, so
    # same-named repos under different owners are counted together -- confirm
    # that this is intended.
    pkg_lst = df[(df['package_source'] == pkg) & has_owner]['package_repo_name'].value_counts()
    # int() truncates, so a source with only a few repos can print an empty list.
    nth = int(len(pkg_lst) * top_fraction_by_source.get(pkg, default_top_fraction))
    pkg_lst_top = list(pkg_lst.head(nth).index)
    print(pkg_lst_top)


### Most Popular NPM Packages ###
['babel', 'definitelytyped', 'ethers.js', 'lerna', 'lodash', 'cssnano', 'jest', 'web3.js', 'ethereumjs-monorepo', 'esbuild', 'micromark', 'cspell-dicts', 'docusaurus', 'webassemblyjs', 'svgr', 'primitives', 'typescript-eslint', 'rollup', 'proxy-agents', 'change-case', 'nx', 'solidity-analyzer', 'node-rs', 'sentry-javascript', 'conventional-changelog', 'algoliasearch-client-javascript', 'graphql-tools', 'swc', 'vitest', 'istanbuljs', 'it', 'bases', 'cli', 'js-libp2p', 'cspell', 'nodelib', 'stablelib', 'react', 'watcher', 'webdriverio', 'walletconnect-utils', 'core', 'snappy', 'emotion', 'forge', 'js', 'hardhat', 'lodestar', 'babel-polyfills', 'acorn', 'eslint-plugin-import', 'undici', 'graphql-code-generator', 'protobuf.js', 'cliui', 'sigstore-js', 'core-js', 'web3modal', 'inquirer.js', 'js-yaml', 'eslint', 'plugins', 'node-source-map-support', 'formatjs', 'once', 'remix-plugin', 'source-map', 'react-router', 'biome', 'floating-ui', 'ieee754', 'json-st

In [6]:
# Distinct package-owner namespaces and how many of them there are.
owners = df['package_repo_owner'].unique().tolist()
len(owners)

1719

# Create a network graph

In [7]:
# Directed dependency graph: an edge points from a seed repo to a repo that
# owns one of the packages it depends on.
G = nx.DiGraph()

# Seed repos are tagged level 1.
for repo_url in df['seed_repo_url'].unique():
    G.add_node(repo_url, level=1)

# Package-owning repos are tagged level 2, unless the repo is already in the
# graph as a seed repo, in which case it keeps level 1.
for repo_url in df['package_repo_url'].unique():
    if repo_url not in G.nodes:
        G.add_node(repo_url, level=2)

# Walk the three needed columns in lockstep instead of `iterrows()`, which
# builds a Series object for every row. A DiGraph stores one edge per
# (source, target) pair, so when several rows share a pair the edge keeps the
# `relation` of the last such row.
for seed_url, package_url, source in zip(
    df['seed_repo_url'], df['package_repo_url'], df['package_source']
):
    G.add_edge(seed_url, package_url, relation=source)

total_edges = G.number_of_edges()
print(total_edges)

# Every edge starts with the same placeholder weight.
global_weight = 0
nx.set_edge_attributes(G, global_weight, 'weight')

# Serialise in node-link form and write it next to the CSV.
graph_json = nx.node_link_data(G)
with open(LOCAL_JSON_PATH, "w") as f:
    json.dump(graph_json, f, indent=2)

9896
