In [1]:
import json
import networkx as nx
import numpy as np
import os
import pandas as pd

# Load the unweighted graph

In [2]:
# Read the node-link JSON export and rebuild the NetworkX graph from it.
with open('../graph/unweighted_graph.json', 'r') as graph_file:
    graph_data = json.load(graph_file)

G_original = nx.node_link_graph(graph_data)

# Every node identifier is kept as the list of repo URLs used by the funding query below.
repo_urls = list(G_original.nodes)

print("Nodes:", G_original.number_of_nodes())
print("Edges:", G_original.number_of_edges())

Nodes: 4303
Edges: 9896


# Grab OSS funding event data for all relevant repos

- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Run the query below and export the results (this notebook reads them back from the Parquet file `../datasets/oso/oss_funding.parquet`)

In [3]:
from google.cloud import bigquery

# Local settings: replace both values with your own credentials path and project name.
GCP_CREDENTIALS_PATH = '../oso_gcp_credentials.json'
GCP_PROJECT = 'opensource-observer'

# Expose the credentials file through the environment, then create the
# BigQuery client bound to the chosen project.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GCP_CREDENTIALS_PATH
client = bigquery.Client(project=GCP_PROJECT)

In [4]:
def stringify_array(arr):
    """Render an iterable of strings as a comma-separated list of SQL string literals.

    The result is meant to be dropped inside an ``IN (...)`` clause, e.g.
    ``['a', 'b']`` becomes ``'a','b'``.

    Backslashes and single quotes inside a value are backslash-escaped so that
    a value containing them cannot terminate the literal early and corrupt
    (or inject into) the surrounding query. Values without those characters
    are emitted unchanged.

    Args:
        arr: Iterable of ``str`` values.

    Returns:
        A single string of quoted literals joined by commas. An empty iterable
        yields ``''`` (one empty literal), which keeps ``IN (...)`` syntactically
        valid.
    """
    def _escape(value):
        # Escape the backslash first so the backslash added for quotes is not doubled.
        return value.replace("\\", "\\\\").replace("'", "\\'")

    return "'" + "','".join(_escape(item) for item in arr) + "'"

# SQL sent to BigQuery to collect funding data for every repo in the graph.
# The repo URLs from `repo_urls` are interpolated via `stringify_array` into
# the `IN (...)` filter of the `repos` CTE.
#
#   funding           - funding amounts from `oss_funding_v0`, summed per
#                       quarter / funder / grant pool / recipient project.
#   repos             - rows of `repositories_v0` whose `artifact_url` is one
#                       of the graph's repo URLs.
#   top_funded_repos  - inner join of `funding` and `repos` on project id, so
#                       one row per repo per funding group.
#   unfunded_repos    - repos with no matching funding row (LEFT JOIN +
#                       IS NULL anti-join); funding columns are NULL and
#                       `total_funding_usd` is 0. `project_name` comes from
#                       `projects_v1`.
#
# The final result is the UNION ALL of the funded and unfunded sets, which
# share the same column order.
query = f"""
WITH funding AS (
    SELECT
        FORMAT_TIMESTAMP('%Y-%m', date_trunc(time, QUARTER)) AS quarter,        
        from_project_name as funder,
        grant_pool_name,
        to_project_name,
        to_project_id,
        CAST(SUM(amount) AS INT64) AS total_funding_usd
    FROM `oso_production.oss_funding_v0`
    GROUP BY 1, 2, 3, 4, 5
),
repos AS (
    SELECT
        project_id,
        artifact_url,
        star_count,
        language
    FROM `oso_production.repositories_v0`
    WHERE artifact_url IN ({stringify_array(repo_urls)})
),
top_funded_repos AS (
    SELECT
        funding.quarter,
        funding.funder,
        funding.grant_pool_name,
        funding.to_project_name as project_name,
        repos.artifact_url AS git_repo,
        repos.star_count,
        repos.language,
        funding.total_funding_usd
    FROM funding
    JOIN repos
    ON funding.to_project_id = repos.project_id
),
unfunded_repos AS (
    SELECT
        CAST(NULL AS STRING) AS quarter,
        CAST(NULL AS STRING) AS funder,
        CAST(NULL AS STRING) AS grant_pool_name,
        project_name,
        repos.artifact_url AS git_repo,
        repos.star_count,
        repos.language,
        0 AS total_funding_usd
    FROM repos
    LEFT JOIN funding
    ON repos.project_id = funding.to_project_id
    LEFT JOIN `oso_production.projects_v1` projects
    ON repos.project_id = projects.project_id
    WHERE funding.to_project_id IS NULL
)
SELECT * FROM top_funded_repos
UNION ALL
SELECT * FROM unfunded_repos
"""

# Location of the cached query result. Delete this file to force a fresh
# BigQuery run on the next execution.
FUNDING_CACHE_PATH = '../datasets/oso/oss_funding.parquet'

# Only hit BigQuery when no cached export exists, so a "Restart & Run All"
# works both on a fresh checkout (runs the query once) and on later runs
# (reads the cache) without editing commented-out code.
if not os.path.exists(FUNDING_CACHE_PATH):
    client.query(query).to_dataframe().to_parquet(FUNDING_CACHE_PATH)

# Always load from the Parquet file so `df` has the same dtypes whether or
# not the query was just executed.
df = pd.read_parquet(FUNDING_CACHE_PATH)
df.tail(5)

Unnamed: 0,quarter,funder,grant_pool_name,project_name,git_repo,star_count,language,total_funding_usd
9202,,,,remyoudompheng,https://github.com/remyoudompheng/bigfft,73,Go,0
9203,,,,bits-and-blooms,https://github.com/bits-and-blooms/bitset,1364,Go,0
9204,,,,dalek-cryptography,https://github.com/dalek-cryptography/subtle,254,Rust,0
9205,,,,dalek-cryptography,https://github.com/dalek-cryptography/x25519-d...,331,Rust,0
9206,,,,microsoft,https://github.com/microsoft/vscode-codicons,898,Handlebars,0


# Create a few heuristics for prioritizing repos

1. Repo is owned by a project that receives funding from Gitcoin, Optimism, Open Collective, etc.
2. Repo shares the same language as the dependent repo
3. Repo is the most-starred repo owned by the project

In [5]:
# Collapse the funding rows to one row per (repo, project, language) combination.
repo_groups = df.groupby(['git_repo', 'project_name', 'language'], as_index=False)

# Per group: the distinct funders, how many there are, and the highest star count seen.
repo_summary = repo_groups.agg(
    funder_list=pd.NamedAgg(column='funder', aggfunc='unique'),
    num_funders=pd.NamedAgg(column='funder', aggfunc='nunique'),
    star_count=pd.NamedAgg(column='star_count', aggfunc='max'),
)

# Rank by number of funders first, then by stars (both descending), and key the table by repo URL.
ranked_repos_df = (
    repo_summary
    .sort_values(by=['num_funders', 'star_count'], ascending=False)
    .set_index('git_repo')
)
ranked_repos_df

Unnamed: 0_level_0,project_name,language,funder_list,num_funders,star_count
git_repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
https://github.com/ethers-io/ethers.js,ethers-io,TypeScript,"[gitcoin, opencollective, clrfund, dao-drops-d...",5,8032
https://github.com/web3/web3.js,web3,TypeScript,"[octant-golemfoundation, gitcoin, optimism]",3,19457
https://github.com/prettier-solidity/prettier-plugin-solidity,prettier-solidity,Solidity,"[gitcoin, clrfund, optimism]",3,729
https://github.com/prettier-solidity/solidity-comments-extractor,prettier-solidity,JavaScript,"[gitcoin, clrfund, optimism]",3,6
https://github.com/ethereum/go-ethereum,go-ethereum,Go,"[gitcoin, optimism]",2,47933
...,...,...,...,...,...
https://github.com/npm/move-file,npm,JavaScript,[None],0,0
https://github.com/silentcicero/is-hex-prefixed,silentcicero,JavaScript,[None],0,0
https://github.com/syntax-tree/unist-util-position-from-estree,syntax-tree,JavaScript,[None],0,0
https://github.com/wealdtech/go-bytesutil,wealdtech,Go,[None],0,0


In [6]:
# Upper bound on the number of dependencies kept per seed repo.
MAX_DEP_COUNT = 30

# Label used for repos that do not appear in `ranked_repos_df`.
UNINDEXED_LANGUAGE = 'n/a (not indexed yet)'

# Seed languages for which dependencies are restricted to the same language.
SAME_LANGUAGE_ONLY = ('TypeScript', 'Rust', 'Python')

# Every language present in the ranking table; computed once instead of per seed.
ALL_LANGUAGES = list(ranked_repos_df['language'].unique())


def _repo_language(repo_url):
    """Return the language recorded for `repo_url`, or a placeholder if the repo is not in `ranked_repos_df`."""
    if repo_url in ranked_repos_df.index:
        return ranked_repos_df.at[repo_url, 'language']
    return UNINDEXED_LANGUAGE


def _allowed_languages(seed_language):
    """Return the dependency languages considered relevant for a seed written in `seed_language`."""
    if seed_language == 'Solidity':
        return ['Solidity', 'TypeScript']
    if seed_language in SAME_LANGUAGE_ONLY:
        return [seed_language]
    # Any other seed language (including the "not indexed" placeholder) keeps all languages.
    return ALL_LANGUAGES


def _select_dependencies(seed_language, all_dependencies):
    """Choose which of a seed's dependencies to keep.

    Args:
        seed_language: Language label of the seed repo.
        all_dependencies: Set of dependency repo URLs of the seed in the original graph.

    Returns:
        A list of repo URLs. If the seed has at most MAX_DEP_COUNT dependencies,
        all of them are returned (sorted, so the result does not depend on set
        iteration order). Otherwise only dependencies present in
        `ranked_repos_df` with an allowed language are considered, in ranking
        order; if more than MAX_DEP_COUNT remain, only the top-ranked repo per
        project is kept and the list is cut to MAX_DEP_COUNT.
    """
    if len(all_dependencies) <= MAX_DEP_COUNT:
        return sorted(all_dependencies)

    candidates = ranked_repos_df[ranked_repos_df.index.isin(all_dependencies)]
    candidates = candidates[candidates['language'].isin(_allowed_languages(seed_language))]

    if len(candidates) > MAX_DEP_COUNT:
        # The table is already ranked, so keep='first' retains each project's best-ranked repo.
        candidates = candidates.drop_duplicates(subset=['project_name'], keep='first')

    return list(candidates.head(MAX_DEP_COUNT).index)


G_pruned = nx.DiGraph()

for seed_node, node_data in G_original.nodes(data=True):
    if node_data.get('level') != 1:
        continue

    # Seeds use the same guarded lookup as dependencies, so a seed missing
    # from the ranking table gets the placeholder instead of raising KeyError.
    seed_language = _repo_language(seed_node)
    G_pruned.add_node(seed_node, level=1, language=seed_language)

    all_dependencies = set(G_original.successors(seed_node))
    for dep in _select_dependencies(seed_language, all_dependencies):
        # Do not overwrite attributes of a node that is already in the pruned graph.
        if dep not in G_pruned.nodes:
            G_pruned.add_node(dep, level=2, language=_repo_language(dep))

        G_pruned.add_edge(seed_node, dep)

print("Kept", len(G_pruned.nodes()), "nodes out of", len(G_original.nodes()))
print("Kept", len(G_pruned.edges()), "edges out of", len(G_original.edges()))

Kept 245 nodes out of 4303
Kept 434 edges out of 9896


# Export the pruned graph to Markdown and JSON

In [7]:
def repo_path(url):
    """Return the `owner/name` part of a GitHub repo URL by removing the host prefix."""
    return url.replace('https://github.com/','')


def git_owner(url):
    """Return the owner (first path segment) of a GitHub repo URL."""
    return repo_path(url).split('/')[0]


# Lines of the generated README; joined with newlines when written out.
output_lines = []

# Static header. The Python snippet imports `json` as well as `networkx`,
# because it calls `json.load`.
output_lines.append("""
# Deep Funding - Dependency Graph

The full unfiltered dependency graph is available [here](./unweighted_graph.json).

To import this graph into Python, you can use the `networkx` library.

```python
import json
import networkx as nx
with open('./unweighted_graph.json', 'r') as f:
    graph_data = json.load(f)
G = nx.node_link_graph(graph_data)
```

We have also done some pruning of the graph for some initial experiments using pairwise voting. This list may change and is purely for testing purposes.

## Quick Prune of Dependencies for Pairwise Testing
""")

output_lines.append("- **ethereum/**")

for seed_node, node_data in G_pruned.nodes(data=True):
    # Only level-1 (seed) nodes get their own section.
    if node_data.get('level') != 1:
        continue
    
    seed_language = node_data.get('language', 'Unknown')
    neighbors = sorted(G_pruned.successors(seed_node))
    num_neighbors = len(neighbors)
    original_neighbors = sorted(G_original.successors(seed_node))
    original_num_neighbors = len(original_neighbors)
    
    # Number of unordered pairs, n * (n - 1) / 2, after and before pruning.
    pairwise_combinations = num_neighbors * (num_neighbors - 1) // 2
    original_pairwise_combinations = original_num_neighbors * (original_num_neighbors - 1) // 2
    
    output_lines.append(" | ".join([
        f"  - **[{repo_path(seed_node)}]({seed_node})**",
        f"{seed_language}",
        f"{num_neighbors}/{original_num_neighbors} dependencies",
        f"{pairwise_combinations} pairwise combinations (vs {original_pairwise_combinations} previously)"
    ]))
    
    for position, leaf_node in enumerate(neighbors, start=1):
        leaf_data = G_pruned.nodes[leaf_node]
        leaf_language = leaf_data.get('language', 'Unknown')

        # Count the seed's original dependencies that share this leaf's owner;
        # subtract one so the leaf itself is not counted.
        leaf_owner = git_owner(leaf_node)
        related_neighbors = [n for n in original_neighbors if git_owner(n) == leaf_owner]
        num_related_neighbors = len(related_neighbors) - 1
        prefix = "    - "
        output_lines.append(" ".join([
            f"{prefix}[{position}] [{repo_path(leaf_node)}]({leaf_node})",
            # The namespace note is only shown when the kept list is exactly MAX_DEP_COUNT long.
            f" [+{num_related_neighbors} other relevant repos in same namespace]" if num_neighbors == MAX_DEP_COUNT and num_related_neighbors else "",
            f"| {leaf_language}",
        ]))

# Explicit encoding so the README bytes do not depend on the platform default.
with open("../graph/README.md", "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print("Output exported to README.md")

Output exported to README.md


In [8]:
# Serialise the pruned graph in node-link form and save it next to the original export.
output_path = "../graph/unweighted_graph_pruned_version.json"
graph_json = nx.node_link_data(G_pruned)

with open(output_path, "w") as pruned_file:
    pruned_file.write(json.dumps(graph_json, indent=2))