In [1]:
from collections import defaultdict
from itertools import combinations
import json
import networkx as nx
import numpy as np
import os
import pandas as pd

# Load the unweighted graph

In [2]:
with open('../../graph/unweighted_graph.json', 'r') as f:
    graph_data = json.load(f)

G = nx.node_link_graph(graph_data)
print("Nodes:", len(G.nodes))
repo_urls = [x for x in G.nodes]
print("Edges:", len(G.edges))

Nodes: 4303
Edges: 9896


# Grab OSS funding event data for all relevant repos

- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Run the query below and export it to a CSV file

In [3]:
from google.cloud import bigquery

# replace with your path to credentials
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../oso_gcp_credentials.json'

# replace with your project name
client = bigquery.Client(project='opensource-observer')

In [4]:
def stringify_array(arr):
    return "'" + "','".join(arr) + "'"

query = f"""
WITH funding AS (
    SELECT
        date_trunc(time, QUARTER) AS quarter,        
        from_project_name,
        grant_pool_name,
        to_project_name,
        to_project_id,
        CAST(sum(amount) AS INT) AS total_funding_usd
    FROM `oso_production.oss_funding_v0`
    GROUP BY 1, 2, 3, 4, 5
),
repos as (
    SELECT
        project_id,
        MAX_BY(artifact_url, star_count) AS git_repo_with_most_stars
    FROM `oso_production.repositories_v0`
    WHERE artifact_url IN ({stringify_array(repo_urls)})
    GROUP BY project_id    
)
SELECT
    funding.* EXCEPT (to_project_id),
    repos.git_repo_with_most_stars
FROM funding
JOIN repos
    ON funding.to_project_id = repos.project_id
"""

# results = client.query(query)
# df = results.to_dataframe()
# df.to_parquet('../../datasets/oso/oss_funding.parquet')

df = pd.read_parquet('../../datasets/oso/oss_funding.parquet')
df.tail(5)

Unnamed: 0,quarter,from_project_name,grant_pool_name,to_project_name,total_funding_usd,git_repo_with_most_stars
1424,2020-04-01 00:00:00+00:00,opencollective,contributions,dcodeio,15000,https://github.com/dcodeio/long.js
1425,2019-10-01 00:00:00+00:00,opencollective,contributions,motdotla,258,https://github.com/motdotla/dotenv
1426,2024-01-01 00:00:00+00:00,optimism,retropgf3,lighthouse-sigp,1043482,https://github.com/sigp/lighthouse
1427,2018-07-01 00:00:00+00:00,opencollective,contributions,gregberge,233,https://github.com/gregberge/svgr
1428,2024-01-01 00:00:00+00:00,optimism,retropgf3,web3,869569,https://github.com/web3/web3.js


In [92]:
query = f"""
SELECT
    artifact_url,
    star_count,
    fork_count,
    language
FROM `oso_production.repositories_v0`
WHERE artifact_url IN ({stringify_array(repo_urls)})
ORDER BY star_count DESC
"""

# results = client.query(query)
# df_repos = results.to_dataframe()
# df_repos.to_csv('../../datasets/oso/repo_basic_stats.csv')

df_repos = pd.read_csv('../../datasets/oso/repo_basic_stats.csv', index_col=0).set_index('artifact_url')
df_repos.tail(5)

Unnamed: 0_level_0,star_count,fork_count,language
artifact_url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://github.com/es-shims/arraybuffer.prototype.slice,0,0,JavaScript
https://github.com/inspect-js/typed-array-byte-length,0,0,JavaScript
https://github.com/syntax-tree/unist-util-position-from-estree,0,0,JavaScript
https://github.com/silentcicero/is-hex-prefixed,0,1,JavaScript
https://github.com/prysmaticlabs/prombbolt,0,0,Starlark


# Simulate Pairwise comparisons and create an ELO rating

- First, we organize funding decisions into quarterly groups and compares projects within the same round and quarter based on their relative funding amounts. This allows for a direct comparison of projects that were evaluated at the same time.
- Next, we use a basic ELO model and assign a rating for each project
- Finally, we run the analysis for web2 and web3 projects separately

In [93]:
def pairwise_simulation(dataframe):

    data = []
    
    funding_rounds = dataframe.groupby(['from_project_name', 'quarter'])['to_project_name'].nunique()
    funding_rounds = funding_rounds[funding_rounds > 1]
    
    for funder, quarter in funding_rounds.keys():
        dff = dataframe[(dataframe['from_project_name'] == funder) & (dataframe['quarter'] == quarter)]
        projects = list(dff['to_project_name'].unique())
        comparisons = combinations(projects, 2)
        for (project_a,project_b) in comparisons:
            amount_a = dff[dff['to_project_name'] == project_a]['total_funding_usd'].sum()
            amount_b = dff[dff['to_project_name'] == project_b]['total_funding_usd'].sum()
            repo_a = dff[dff['to_project_name'] == project_a]['git_repo_with_most_stars'].unique()[0]
            repo_b = dff[dff['to_project_name'] == project_b]['git_repo_with_most_stars'].unique()[0]
            amount_total = amount_a + amount_b
            data.append({
                'funder': funder,
                'quarter': quarter,
                'project_a': repo_a,
                'project_b': repo_b,
                'weight_a': amount_a / amount_total,
                'weight_b': amount_b / amount_total
            })

    return pd.DataFrame(data)

df_pairwise = pairwise_simulation(df)
df_pairwise.tail()

Unnamed: 0,funder,quarter,project_a,project_b,weight_a,weight_b
20646,optimism,2024-10-01 00:00:00+00:00,https://github.com/bluealloy/revm,https://github.com/ethereumjs/ethereumjs-monorepo,0.514987,0.485013
20647,optimism,2024-10-01 00:00:00+00:00,https://github.com/bluealloy/revm,https://github.com/eth-infinitism/account-abst...,0.447073,0.552927
20648,optimism,2024-10-01 00:00:00+00:00,https://github.com/status-im/nimbus-eth2,https://github.com/ethereumjs/ethereumjs-monorepo,0.460606,0.539394
20649,optimism,2024-10-01 00:00:00+00:00,https://github.com/status-im/nimbus-eth2,https://github.com/eth-infinitism/account-abst...,0.394038,0.605962
20650,optimism,2024-10-01 00:00:00+00:00,https://github.com/ethereumjs/ethereumjs-monorepo,https://github.com/eth-infinitism/account-abst...,0.432302,0.567698


In [94]:
def expected_score(rating_a, rating_b):
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def update_elo(rating_a, rating_b, score_a, k_factor):
    expected_a = expected_score(rating_a, rating_b)
    rating_a_new = rating_a + k_factor * (score_a - expected_a)
    rating_b_new = rating_b + k_factor * ((1 - score_a) - (1 - expected_a))
    return rating_a_new, rating_b_new
    
def elo_simulation(dataframe):
    
    elo_ratings = defaultdict(lambda: 1500) # Default ELO rating for all projects
    appearances = defaultdict(int)
    
    for _,row in dataframe.iterrows():
        project_a = row['project_a']
        project_b = row['project_b']
        share_a = row['weight_a']
        score_a = 1 if share_a > 0.5 else 0 if share_a < 0.5 else 0.5

        k_a = 40 / (1 + appearances[project_a] / 5)
        k_b = 40 / (1 + appearances[project_b] / 5)
        k_factor = (k_a + k_b) / 2
        
        elo_ratings[project_a], elo_ratings[project_b] = update_elo(
            elo_ratings[project_a], elo_ratings[project_b], score_a, k_factor
        )
        appearances[project_a] += 1
        appearances[project_b] += 1
    
    margin_of_error = {project: 400 / np.sqrt(appearances[project]) for project in appearances}

    return (
        pd.DataFrame([
            {
                'project': project,
                'head-to-head_comparisons': appearances[project],        
                'elo_rating': rating,        
                'margin_of_error': margin_of_error[project]
            }
            for project, rating in elo_ratings.items()
        ])
        .sort_values(by='elo_rating', ascending=False)
        .set_index('project', drop=True)
    )

In [95]:
web3 = elo_simulation(df_pairwise[df_pairwise['funder'] != 'opencollective'])
web3_projects = list(web3.index)
web3

Unnamed: 0_level_0,head-to-head_comparisons,elo_rating,margin_of_error
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://github.com/ethers-io/ethers.js,197,1702.260766,28.49882
https://github.com/prysmaticlabs/prysm,118,1676.748565,36.822985
https://github.com/sigp/lighthouse,207,1675.604412,27.801922
https://github.com/libp2p/go-libp2p,36,1609.248967,66.666667
https://github.com/ethereum/solidity,80,1592.043199,44.72136
https://github.com/wevm/viem,119,1587.453234,36.66794
https://github.com/ethereum/go-ethereum,119,1579.410411,36.66794
https://github.com/web3/web3.js,33,1556.478705,69.631062
https://github.com/erigontech/erigon,14,1528.076996,106.904497
https://github.com/nomicfoundation/hardhat,153,1526.576219,32.338083


In [96]:
web2 = elo_simulation(
    df_pairwise[(~df_pairwise['project_a'].isin(web3_projects)) & (~df_pairwise['project_b'].isin(web3_projects))]
)
web2.head(40)

Unnamed: 0_level_0,head-to-head_comparisons,elo_rating,margin_of_error
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://github.com/webpack/webpack,974,1859.270668,12.816827
https://github.com/babel/babel,969,1831.594509,12.849851
https://github.com/eslint/eslint,899,1794.178747,13.340747
https://github.com/vuejs/vue,964,1771.325874,12.883133
https://github.com/mochajs/mocha,976,1672.830486,12.803688
https://github.com/salesforce/tough-cookie,401,1671.403054,19.975047
https://github.com/dcodeio/long.js,366,1664.616819,20.908335
https://github.com/zloirock/core-js,899,1663.900092,13.340747
https://github.com/tokio-rs/tokio,667,1659.227687,15.488062
https://github.com/mafintosh/pump,150,1657.89597,32.659863


# Use funding data to help prune the tree

In [123]:
all_elo = pd.concat([web2, web3])
all_elo = all_elo.join(df_repos)[['language']]
df_repos_filtered = df_repos.loc[~df_repos.index.isin(all_elo.index)][['language']]
repo_rank = pd.concat([all_elo, df_repos_filtered]).reset_index()
repo_rank['owner'] = repo_rank['index'].apply(lambda x: x.split('/')[-2])
repo_rank = repo_rank.drop_duplicates(subset=['owner'], keep='first').set_index('index')

In [155]:
MAX_DEP_COUNT = 20

g_pruned = []
node_levels = {}

for seed_node, node_data in G.nodes(data=True):
    if node_data.get('level') == 1:
        node_levels[seed_node] = 1        
        all_dependencies = set(G.successors(seed_node))
        if len(all_dependencies) < MAX_DEP_COUNT:
            top_dependencies = all_dependencies
        else:
            top_dependencies = [x for x in all_dependencies if x in repo_rank.index]
            if len(top_dependencies) > MAX_DEP_COUNT:
                top_dependencies = top_dependencies[:MAX_DEP_COUNT]
        
        for dep_node in top_dependencies:
            node_levels[dep_node] = 2            
            g_pruned.append((seed_node, dep_node))

last_node = seed_node
G_pruned = nx.from_edgelist(g_pruned)
nx.set_node_attributes(G_pruned, node_levels, 'level')

print("Kept", len(G_pruned.nodes()), "nodes out of", len(G.nodes()))
print("Kept", len(G_pruned.edges()), "edges out of", len(G.edges()))

Kept 160 nodes out of 4303
Kept 269 edges out of 9896


In [156]:
print("- ethereum/")
for seed_node, node_data in G_pruned.nodes(data=True):
    if node_data.get('level') != 1:
        continue
    print("    ├")        
    print("    ├──", seed_node)
    for i, leaf_node in enumerate(sorted(G_pruned.neighbors(seed_node))):
        if seed_node != last_node:
            print("    │   ├── ", f"[{i+1}]", leaf_node)
        else:
            print("        ├── ", f"[{i+1}]", leaf_node)

- ethereum/
    ├
    ├── https://github.com/ethereum/solidity
    │   ├──  [1] https://github.com/readthedocs/sphinx_rtd_theme
    │   ├──  [2] https://github.com/sphinx-doc/sphinx
    │   ├──  [3] https://github.com/taminomara/sphinx-a4doc
    ├
    ├── https://github.com/vyperlang/vyper
    │   ├──  [1] https://github.com/agronholm/cbor2
    │   ├──  [2] https://github.com/executablebooks/sphinx-copybutton
    │   ├──  [3] https://github.com/gristlabs/asttokens
    │   ├──  [4] https://github.com/legrandin/pycryptodome
    │   ├──  [5] https://github.com/lepture/shibuya
    │   ├──  [6] https://github.com/pypa/packaging
    │   ├──  [7] https://github.com/pypa/wheel
    │   ├──  [8] https://github.com/python/importlib_metadata
    │   ├──  [9] https://github.com/sphinx-doc/sphinx
    ├
    ├── https://github.com/ethereum/py-evm
    │   ├──  [1] https://github.com/ethereum/eth-hash
    │   ├──  [2] https://github.com/hypothesisworks/hypothesis
    │   ├──  [3] https://github.com/ipyt

# Export the graph to JSON

In [11]:
print("\nLevel 1 Nodes (Sources) and Their Summed Edge Weights:")
source_weights = validate_weights(G)
pd.Series(source_weights)


Level 1 Nodes (Sources) and Their Summed Edge Weights:


https://github.com/ethereum/solidity                     0.000000
https://github.com/vyperlang/vyper                       0.000000
https://github.com/ethereum/py-evm                       0.200000
https://github.com/ethereum/web3.py                      0.800000
https://github.com/paradigmxyz/reth                      0.800000
https://github.com/consensys/teku                        0.200000
https://github.com/prysmaticlabs/prysm                   0.800000
https://github.com/erigontech/erigon                     0.800000
https://github.com/web3/web3.js                          0.800000
https://github.com/grandinetech/grandine                 0.200000
https://github.com/chainsafe/lodestar                    0.800000
https://github.com/sigp/lighthouse                       0.771429
https://github.com/ethereum/go-ethereum                  0.684211
https://github.com/status-im/nimbus-eth2                 0.400000
https://github.com/ethereum/remix-project                0.776471
https://gi

In [12]:
G_serializable = convert_graph_to_serializable(G)
graph_json = nx.node_link_data(G_serializable)
output_path = "../../graph/weighting_examples/oso_p2p_contributions_weighting.json"
with open(output_path, "w") as f:
    json.dump(graph_json, f, indent=2)