In [1]:
import json
import networkx as nx
import os
import pandas as pd

# Settings

In [2]:
# True: run the BigQuery query and overwrite the local CSV.
# False: only print the query so it can be run by hand.
REFRESH_DATA = True
# CSV file the query result is written to and the dependency table is read from.
LOCAL_CSV_PATH = '../graph/unweighted_graph.csv'
# Destination of the node-link JSON export of the graph.
LOCAL_JSON_PATH = '../graph/unweighted_graph.json'

In [3]:
# Seed repositories, written as "owner/name" and grouped by category.
# The groups are concatenated into SEED_REPOS, which restricts the SBOM query
# and supplies the level-1 nodes of the graph.
CONSENSUS = [
    'prysmaticlabs/prysm',
    'sigp/lighthouse',
    'consensys/teku',
    'status-im/nimbus-eth2',
    'chainsafe/lodestar',
    'grandinetech/grandine'
]
EXECUTION = [
    'ethereum/go-ethereum',
    'nethermindeth/nethermind',
    'hyperledger/besu',
    'erigontech/erigon',
    'paradigmxyz/reth'
]
OTHER = [
    'ethereum/py-evm',
    'eth-infinitism/account-abstraction',
    'safe-global/safe-smart-account',
    'a16z/helios',
    'ethereumjs/ethereumjs-monorepo'    
]
# Entries are separated into sub-groups by blank lines; "# new" / "# removed"
# record changes to this list.
DEV_GUILD = [
    'ethereum/web3.py',
    'ethers-io/ethers.js', # new
    'hyperledger-web3j/web3j', # new
    'alloy-rs/alloy', # new
    'nethereum/nethereum', # new
    'wevm/viem', # new
#    'web3/web3.js', # removed    
    
    'nomicfoundation/hardhat', # new
    'foundry-rs/foundry', # new
    'ethereum/remix-project',
    'apeworx/ape', # new
    'vyperlang/titanoboa', # new
    'ethereum-lists/chains', # new
    
    'ethereum/solidity', # includes yul
    'vyperlang/vyper',
    'ethereum/fe', # new
    
    'ethereum/sourcify', # new
    
    'openzeppelin/openzeppelin-contracts', # new
    'scaffold-eth/scaffold-eth-2' # new
]
# Every seed repo, in category order (this is also the node insertion order).
SEED_REPOS = CONSENSUS + EXECUTION + OTHER + DEV_GUILD
# Package sources kept by the query; also iterated when listing popular packages.
PACKAGE_SERVERS = ['NPM', 'CARGO', 'GOLANG', 'PYPI']

# Get SBOMs for repos we care about

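- Subscribe to the OSO Production dataset on BigQuery (see the docs [here](https://docs.opensource.observer/docs/get-started/bigquery)).
- With `REFRESH_DATA = True`, the next cell runs the query and saves the result for you. With `REFRESH_DATA = False`, it only prints the query: paste it into your [console](https://console.cloud.google.com/bigquery) to get a fresh copy of the graph.
- If you ran the query manually, save the result as a CSV file at `../graph/unweighted_graph.csv`.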

In [4]:
def stringify_array(arr):
    """Render a sequence of strings as single-quoted, comma-separated values.

    Example: ['a', 'b'] -> "'a','b'". An empty sequence yields "''".
    Values are inserted verbatim (no escaping), so they must not contain
    single quotes.
    """
    joined = "','".join(arr)
    return f"'{joined}'"

# SQL for the dependency table: one distinct row per
# (seed repo, package, package-owner repo, package source).
# SBOM rows are joined to package owners on package name + package source,
# then restricted to the sources in PACKAGE_SERVERS, to packages whose owner
# namespace is not null, and to seed repos listed in SEED_REPOS.
query = f"""
  -- COPY THIS INTO YOUR BIGQUERY CONSOLE
  
  select distinct
    sboms.from_artifact_namespace as seed_repo_owner,
    sboms.from_artifact_name as seed_repo_name,
    sboms.to_package_artifact_name as package_name,
    package_owners.package_owner_artifact_namespace as package_repo_owner,
    package_owners.package_owner_artifact_name as package_repo_name,
    sboms.to_package_artifact_source as package_source
  from `oso_production.sboms_v0` sboms
  join `oso_production.package_owners_v0` package_owners
    on
      sboms.to_package_artifact_name = package_owners.package_artifact_name
      and sboms.to_package_artifact_source = package_owners.package_artifact_source
  where
    sboms.to_package_artifact_source in ({stringify_array(PACKAGE_SERVERS)})
    and package_owners.package_owner_artifact_namespace is not null
    and concat(sboms.from_artifact_namespace, '/', sboms.from_artifact_name)
      in ({stringify_array(SEED_REPOS)})
"""

# Either refresh the local CSV from BigQuery or print the query for manual use.
if REFRESH_DATA:

    # Imported here so the notebook still runs without the BigQuery client
    # installed when REFRESH_DATA is False.
    from google.cloud import bigquery

    # replace with your path to credentials; a GOOGLE_APPLICATION_CREDENTIALS
    # value that is already set in the environment is kept, not overwritten
    os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '../oso_gcp_credentials.json')

    # replace with your project name
    client = bigquery.Client(project='opensource-observer')

    # execute the query and load the result into a DataFrame
    results = client.query(query)
    df = results.to_dataframe()

    # make sure the target folder exists so the write cannot fail on a fresh checkout
    csv_dir = os.path.dirname(LOCAL_CSV_PATH)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    # the default index column is written too; the loader reads it back with index_col=0
    df.to_csv(LOCAL_CSV_PATH)
    print("Query saved to local CSV file.")

else:
    print(query)

Query saved to local CSV file.


# Load the graph from the CSV

In [5]:
# Load the saved dependency table; the first CSV column is the row index.
df = pd.read_csv(LOCAL_CSV_PATH, index_col=0)

gh = 'https://github.com/'


def _repo_urls(owners, names):
    """Return one GitHub URL string per (owner, name) pair, in row order."""
    return [f"{gh}{owner}/{name}" for owner, name in zip(owners, names)]


# Built column-wise instead of with a row-wise DataFrame.apply(axis=1):
# the strings produced are the same, but this avoids creating a Series per row
# and also works when the table has no rows (apply then returns a DataFrame,
# which cannot be assigned to a single column).
df['seed_repo_url'] = _repo_urls(df['seed_repo_owner'], df['seed_repo_name'])
df['package_repo_url'] = _repo_urls(df['package_repo_owner'], df['package_repo_name'])

df.tail()

Unnamed: 0,seed_repo_owner,seed_repo_name,package_name,package_repo_owner,package_repo_name,package_source,seed_repo_url,package_repo_url
20587,openzeppelin,openzeppelin-contracts,min-indent,thejameskyle,min-indent,NPM,https://github.com/openzeppelin/openzeppelin-c...,https://github.com/thejameskyle/min-indent
20588,openzeppelin,openzeppelin-contracts,fill-range,jonschlinkert,fill-range,NPM,https://github.com/openzeppelin/openzeppelin-c...,https://github.com/jonschlinkert/fill-range
20589,openzeppelin,openzeppelin-contracts,punycode,mathiasbynens,punycode.js,NPM,https://github.com/openzeppelin/openzeppelin-c...,https://github.com/mathiasbynens/punycode.js
20590,openzeppelin,openzeppelin-contracts,he,mathiasbynens,he,NPM,https://github.com/openzeppelin/openzeppelin-c...,https://github.com/mathiasbynens/he
20591,openzeppelin,openzeppelin-contracts,@types/http-cache-semantics,definitelytyped,definitelytyped,NPM,https://github.com/openzeppelin/openzeppelin-c...,https://github.com/definitelytyped/definitelyt...


In [6]:
# Number of distinct package names seen for each package source
packages_per_source = df.groupby('package_source')['package_name'].nunique()
packages_per_source

package_source
CARGO     1363
GOLANG     387
NPM       5575
PYPI       162
Name: package_name, dtype: int64

In [7]:
# For each package source, print the repo names that occur most often in the
# table: the top 2.5% of distinct names for NPM, the top 10% for the others.
for source in PACKAGE_SERVERS:
    print(f"\n### Most Popular {source} Packages ###")
    from_source = df['package_source'] == source
    has_owner = df['package_repo_owner'] != ''
    repo_counts = df.loc[from_source & has_owner, 'package_repo_name'].value_counts()
    share = 0.025 if source == 'NPM' else 0.1
    top_n = int(len(repo_counts) * share)
    # value_counts is sorted by descending count, so the first top_n are the most frequent
    print(list(repo_counts.index[:top_n]))


### Most Popular NPM Packages ###
['babel', 'definitelytyped', 'ethers.js', 'esbuild', 'lodash', 'lerna', 'ethereumjs-monorepo', 'cspell-dicts', 'cssnano', 'rollup', 'web3.js', 'micromark', 'webassemblyjs', 'jest', 'typescript-eslint', 'solidity-analyzer', 'sentry-javascript', 'changesets', 'proxy-agents', 'smithy-typescript', 'edr', 'change-case', 'nx', 'next.js', 'cspell', 'storybook', 'stablelib', 'conventional-changelog', 'aws-sdk-js-v3', 'hardhat', 'istanbuljs', 'protobuf.js', 'walletconnect-utils', 'primitives', 'bases', 'react', 'vitest', 'nodelib', 'watcher', 'node-rs', 'emotion', 'svgr', 'biome', 'swc', 'opentelemetry-js', 'algoliasearch-client-javascript', 'docusaurus', 'js', 'graphql-tools', 'core', 'it', 'acorn', 'undici', 'js-libp2p', 'cliui', 'cli', 'shiki', 'node-source-map-support', 'eslint', 'sigstore-js', 'webdriverio', 'once', 'inquirer.js', 'vercel', 'find-up', 'punycode.js', 'forge', 'ms', 'js-yaml', 'source-map', 'eslint-plugin-import', 'path-exists', 'locate-pat

In [8]:
# Distinct package-owner namespaces, in order of first appearance
owners = df['package_repo_owner'].drop_duplicates().tolist()
len(owners)

2046

# Create a network graph

In [9]:
# Directed graph: an edge points from a seed repo to a repo that owns one of
# the packages it depends on.
G = nx.DiGraph()

# Level 1: the seed repositories, in SEED_REPOS order
G.add_nodes_from((f"{gh}{seed_repo}" for seed_repo in SEED_REPOS), level=1)

# Level 2: package-owning repos that are not already present as seed nodes
dependency_urls = [url for url in df['package_repo_url'].unique() if url not in G.nodes]
G.add_nodes_from(dependency_urls, level=2)

# One edge per table row, added in row order. A repeated (seed, package repo)
# pair collapses into a single edge whose 'relation' comes from the last row.
edge_rows = zip(df['seed_repo_url'], df['package_repo_url'], df['package_source'])
G.add_edges_from(
    (seed_url, package_url, {'relation': source})
    for seed_url, package_url, source in edge_rows
)

total_edges = G.number_of_edges()
print(total_edges)

# Every edge gets the same placeholder weight
global_weight = 0
nx.set_edge_attributes(G, global_weight, 'weight')

# Export in node-link format
graph_json = nx.node_link_data(G)
with open(LOCAL_JSON_PATH, "w") as f:
    json.dump(graph_json, f, indent=2)

14927
