In [1]:
import json
import networkx as nx
import os
import pandas as pd
import sys

sys.path.append(os.path.abspath("../"))
from utils.serialize_graph import convert_graph_to_serializable

In [2]:
# When True, the metrics cell re-runs the BigQuery query and overwrites
# LOCAL_PARQUET_FILE; when False the query is skipped and the cached file is read.
REFRESH_DATA = False
# Input: node-link JSON of the full graph.
ORIGINAL_GRAPH = '../graph/unweighted_graph.json'
# Input: node-link JSON of the pruned graph that receives the metrics.
PRUNED_GRAPH = '../graph/unweighted_graph_pruned_version.json'
# Local cache of the query result (written on refresh, always read afterwards).
LOCAL_PARQUET_FILE = '../datasets/oso/repo_metrics_and_metadata.parquet'
# Output: pruned graph with per-node metrics attached, written by the last cell.
PRUNED_GRAPH_METADATA = "../graph/unweighted_graph_pruned_with_metadata.json"

# Load the full graph

In [3]:
# Read the node-link JSON from disk and rebuild the full graph from it.
with open(ORIGINAL_GRAPH, 'r') as graph_file:
    graph_data = json.load(graph_file)

G_original = nx.node_link_graph(graph_data)

# Node ids are reused later as the repo URLs to filter the metrics query by.
repo_urls = list(G_original.nodes)

print("Nodes:", G_original.number_of_nodes())
print("Edges:", G_original.number_of_edges())

Nodes: 4931
Edges: 14308


# Grab high-level metrics for all relevant repos

- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Run the query below (set `REFRESH_DATA = True`); the result is cached to a local Parquet file
- Metrics and metadata:
   - Repo path
   - Language
   - Is fork?
   - Created at
   - Updated at
   - Stars
   - Forks
   - Maintainer of (list of packages)
   - Grant recipient of (list of funders)
   - Used by (list of dependents in OSO)
   - Percentile rank by number of OSO dependents (overall and within the repo's language)

In [4]:
def stringify_array(arr):
    """Render strings as a comma-separated list of single-quoted SQL literals.

    The result is meant to be spliced into an ``IN (...)`` clause. Backslashes
    and single quotes inside each value are backslash-escaped so that a value
    containing them cannot terminate the literal early or change the query.

    Args:
        arr: Iterable of strings (e.g. repo URLs).

    Returns:
        A string such as ``'a','b'``. An empty iterable yields ``''`` (one
        empty literal), so the surrounding ``IN (...)`` stays valid SQL.
    """
    def quote(value):
        # Escape the backslash first so the escape added for quotes is not doubled.
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        return "'" + escaped + "'"

    return ",".join(quote(item) for item in arr) or "''"

# SQL text sent to BigQuery when REFRESH_DATA is True. Structure of the query:
#   repos          - every row of `oso.repositories_v0`.
#   package_owners - packages whose owner artifact is one of those repos, tagged
#                    as "<source>/<name>".
#   oso_dependents - SBOM rows joined to package_owners on package name and
#                    source; per owner: distinct package count, distinct
#                    dependent-namespace count, and both lists.
#   grants         - per recipient project: funder display names, total amount,
#                    and the amount with funding.time > '2023-01-01', limited to
#                    four named funding sources.
#   combined       - repos LEFT JOINed to oso_dependents and grants, with missing
#                    counts and amounts coalesced to 0.
# The final SELECT adds an overall percent rank, a per-language repo count and a
# per-language percent rank, and keeps only the repos in `repo_urls`, which are
# interpolated as quoted literals via stringify_array().
# NOTE(review): the window functions sit in the same SELECT as the WHERE filter,
# so the ranks/counts appear to be relative to the filtered repos only - confirm
# this is intended.
query = f"""
WITH repos AS (
  SELECT *
  FROM `oso.repositories_v0`
),
package_owners AS (
  SELECT
    package_owner_artifact_id,
    package_artifact_source,
    package_artifact_name,
    CONCAT(package_artifact_source, '/', package_artifact_name) AS package_tag
  FROM `oso.package_owners_v0`
  WHERE package_owner_artifact_id IN (SELECT artifact_id FROM repos)
),
oso_dependents AS (
  SELECT
    package_owners.package_owner_artifact_id,
    COUNT(DISTINCT package_owners.package_tag) AS num_packages,
    COUNT(DISTINCT sboms.from_artifact_namespace) AS num_dependents_in_oso,
    ARRAY_AGG(DISTINCT package_owners.package_tag) AS list_of_packages,
    ARRAY_AGG(DISTINCT sboms.from_artifact_namespace) AS list_of_dependents_in_oso
  FROM `oso.sboms_v0` AS sboms
  JOIN package_owners
    ON sboms.to_package_artifact_name = package_owners.package_artifact_name
    AND sboms.to_package_artifact_source = package_owners.package_artifact_source
  GROUP BY 1
),
grants AS (
  SELECT
    funding.to_project_id AS project_id,
    ARRAY_AGG(DISTINCT projects.display_name) AS list_of_funders,
    SUM(funding.amount) AS total_funding_usd,
    SUM(CASE WHEN funding.time > '2023-01-01' THEN funding.amount ELSE 0 END) AS total_funding_usd_since_2023
  FROM `oso.oss_funding_v0` AS funding
  JOIN `oso.projects_v1` AS projects
    ON funding.from_project_id = projects.project_id
  WHERE funding.from_project_name IN ('gitcoin', 'octant-golemfoundation', 'opencollective', 'optimism')
  GROUP BY 1
),
combined AS (
  SELECT
    repos.artifact_url AS repo_url,
    repos.artifact_namespace AS maintainer,
    repos.language,
    repos.is_fork,
    DATE(repos.created_at) as created_at,
    DATE(repos.updated_at) as updated_at,    
    repos.star_count, 
    repos.fork_count,
    COALESCE(oso_dependents.num_packages, 0) AS num_packages,
    COALESCE(oso_dependents.num_dependents_in_oso, 0) AS num_dependents_in_oso,
    oso_dependents.list_of_dependents_in_oso,
    oso_dependents.list_of_packages,
    grants.list_of_funders,
    COALESCE(grants.total_funding_usd, 0) AS total_funding_usd,
    COALESCE(grants.total_funding_usd_since_2023, 0) AS total_funding_usd_since_2023
  FROM repos
  LEFT JOIN oso_dependents
    ON repos.artifact_id = oso_dependents.package_owner_artifact_id
  LEFT JOIN grants
    ON repos.project_id = grants.project_id
)
SELECT
  *,
  PERCENT_RANK() OVER (ORDER BY num_dependents_in_oso) AS oso_dependency_rank,
  COUNT(*) OVER (PARTITION BY language) AS num_repos_in_same_language,  
  PERCENT_RANK() OVER (PARTITION BY language ORDER BY num_dependents_in_oso) AS oso_dependency_rank_for_language
FROM combined
WHERE repo_url IN ({stringify_array(repo_urls)})

"""

if REFRESH_DATA:

    from google.cloud import bigquery

    # replace with your path to credentials
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../oso_gcp_credentials.json'

    # replace with your project name
    client = bigquery.Client(project='opensource-observer')

    # Run the query, convert both date columns to pandas datetimes, and cache
    # the resulting frame as a local Parquet file.
    results = client.query(query)
    df = results.to_dataframe().assign(
        created_at=lambda frame: pd.to_datetime(frame['created_at']),
        updated_at=lambda frame: pd.to_datetime(frame['updated_at']),
    )
    df.to_parquet(LOCAL_PARQUET_FILE)
    print("Query saved to local Parquet file.")

# Whether or not the data was refreshed, work from the cached Parquet file.
df = pd.read_parquet(LOCAL_PARQUET_FILE)
df.tail(5)

Unnamed: 0,repo_url,maintainer,language,is_fork,created_at,updated_at,star_count,fork_count,num_packages,num_dependents_in_oso,list_of_dependents_in_oso,list_of_packages,list_of_funders,total_funding_usd,total_funding_usd_since_2023,oso_dependency_rank,num_repos_in_same_language,oso_dependency_rank_for_language
1900,https://github.com/juliangruber/brace-expansion,juliangruber,JavaScript,False,2013-10-13,2024-11-28,186,36,1,3131,"[1inch, dexkit, google, filswan, ethereum, fac...",[NPM/brace-expansion],[],0.0,0.0,0.995273,978,0.992835
1901,https://github.com/isaacs/minimatch,isaacs,JavaScript,False,2011-07-16,2025-01-17,3327,252,1,3134,"[npm, web3, yearn, witnet, curvefi, dig-dao, f...",[NPM/minimatch],[],0.0,0.0,0.997374,978,0.996929
1902,https://github.com/debug-js/debug,debug-js,JavaScript,False,2011-11-29,2025-01-15,11200,944,1,3150,"[andris9, mahadao, rarible, electron, ethereum...",[NPM/debug],[Open Collective],9107.2,4228.2,0.998424,978,0.997953
1903,https://github.com/lodash/lodash,lodash,JavaScript,False,2012-04-07,2025-01-18,60049,7050,263,3152,"[npm, ssbc, web3, 1inch, yearn, broxus, c-atts...","[NPM/lodash._getnative, NPM/lodash.template, N...",[],0.0,0.0,0.99895,978,0.998976
1904,https://github.com/isaacs/inherits,isaacs,JavaScript,False,2011-04-07,2024-12-11,353,86,1,3162,"[daocoa, itheum, zondax, p2p-org, getclave, ai...",[NPM/inherits],[],0.0,0.0,0.999475,978,1.0


# Add metrics just for the pruned version of the graph

In [5]:
# Read the pruned graph's node-link JSON and rebuild it as a networkx graph.
with open(PRUNED_GRAPH, 'r') as graph_file:
    pruned_graph_data = json.load(graph_file)

G_pruned = nx.node_link_graph(pruned_graph_data)

# Keep the node ids (repo URLs) of the pruned graph as a plain list.
pruned_repo_urls = list(G_pruned.nodes)

print("Nodes:", G_pruned.number_of_nodes())
print("Edges:", G_pruned.number_of_edges())

Nodes: 360
Edges: 740


In [6]:
def underscore_to_camelcase(string):
    """Convert a snake_case identifier to lowerCamelCase.

    The text before the first underscore is kept as-is; every following
    underscore-separated piece is title-cased and appended.
    """
    head, *tail = string.split('_')
    return head + ''.join(map(str.title, tail))

# Per-repo metrics keyed by repo URL. The dropped columns are the ones that
# should not be copied onto graph nodes by the annotation loop.
columns_to_drop = ['maintainer', 'list_of_dependents_in_oso', 'list_of_packages']
metrics_df = df.set_index('repo_url').drop(columns=columns_to_drop)

# Re-assign the funders column from a plain Python list of its values.
metrics_df = metrics_df.assign(list_of_funders=list(metrics_df['list_of_funders']))

# Attach the metrics to every node of the pruned graph. Nodes without a row in
# metrics_df are flagged 'not_indexed' and lose any pre-existing 'language'
# attribute; all other nodes are flagged 'indexed' and get one camelCase
# attribute per metrics column.
for node in G_pruned.nodes():
    node_attrs = G_pruned.nodes[node]

    if node not in metrics_df.index:
        node_attrs.pop('language', None)
        node_attrs['status'] = 'not_indexed'
        continue

    node_attrs['status'] = 'indexed'
    for col in metrics_df.columns:
        node_attrs[underscore_to_camelcase(col)] = metrics_df.at[node, col]

# Print the attributes of one annotated node as a sanity check. The index is
# arbitrary; it is clamped to the last node so the preview does not raise
# IndexError if the pruned graph has fewer nodes than the index.
SAMPLE_NODE_INDEX = 101
pruned_nodes = list(G_pruned.nodes())
if pruned_nodes:
    sample_node = pruned_nodes[min(SAMPLE_NODE_INDEX, len(pruned_nodes) - 1)]
    attrs = G_pruned.nodes[sample_node]
    print("Node attributes for", sample_node)
    for k, v in attrs.items():
        print(f"-{k}: {v}")
else:
    print("Pruned graph has no nodes to preview.")

Node attributes for https://github.com/ipython/ipython
-level: 2
-language: Python
-status: indexed
-isFork: False
-createdAt: 2010-05-10 00:00:00
-updatedAt: 2025-01-18 00:00:00
-starCount: 16342
-forkCount: 4447
-numPackages: 1
-numDependentsInOso: 100
-listOfFunders: []
-totalFundingUsd: 0.0
-totalFundingUsdSince2023: 0.0
-osoDependencyRank: 0.21533613445378152
-numReposInSameLanguage: 57
-osoDependencyRankForLanguage: 0.5714285714285714


In [7]:
# Convert the annotated graph with the project helper, then turn it into
# node-link data for JSON output.
G_serializable = convert_graph_to_serializable(G_pruned)
graph_json = nx.node_link_data(G_serializable)

# Write the annotated graph to disk as indented JSON.
with open(PRUNED_GRAPH_METADATA, "w") as out_file:
    json.dump(graph_json, out_file, indent=2)