In [1]:
from collections import defaultdict
from itertools import combinations
import json
import networkx as nx
import numpy as np
import os
import pandas as pd

# Load the unweighted graph

In [2]:
# JSON is UTF-8 by specification; pin the encoding so the load does not
# depend on the platform's locale default.
with open('../../graph/unweighted_graph.json', 'r', encoding='utf-8') as f:
    graph_data = json.load(f)

# Rebuild the graph from its node-link JSON form.
G = nx.node_link_graph(graph_data)

# Node identifiers are used below as the repository URLs to look up.
repo_urls = list(G.nodes)

print("Nodes:", len(G.nodes))
print("Edges:", len(G.edges))

Nodes: 4303
Edges: 9896


# Grab OSS funding event data for all relevant repos

- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Run the queries below

In [3]:
from google.cloud import bigquery

# replace with your path to credentials
GCP_CREDENTIALS_PATH = '../../oso_gcp_credentials.json'

# replace with your project name
GCP_PROJECT = 'opensource-observer'

# Export the credentials location before the client is constructed.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GCP_CREDENTIALS_PATH
client = bigquery.Client(project=GCP_PROJECT)

In [4]:
def stringify_array(arr):
    """Render an iterable of strings as a comma-separated list of SQL string literals.

    Each value is wrapped in single quotes, e.g. ``['a', 'b']`` -> ``'a','b'``,
    so the result can be placed inside an ``IN (...)`` clause. An empty input
    yields ``''`` (a single empty-string literal), matching the previous behaviour.

    Backslashes, single quotes, and line breaks inside a value are
    backslash-escaped so that a value cannot terminate its own literal and
    corrupt (or inject into) the surrounding query.
    """
    # Backslash must be part of the same single-pass translation so that the
    # escapes we add are not themselves re-escaped.
    escapes = str.maketrans({
        '\\': '\\\\',
        "'": "\\'",
        '\n': '\\n',
        '\r': '\\r',
    })
    return "'" + "','".join(item.translate(escapes) for item in arr) + "'"

# Funding totals per (quarter, funder, grant pool, recipient), joined to the
# recipient project's most-starred repository among the graph's nodes.
# The repo list is bound as an array query parameter instead of being spliced
# into the SQL text, so URL contents can never alter the query itself.
query = """
WITH funding AS (
    SELECT
        date_trunc(time, QUARTER) AS quarter,
        from_project_name,
        grant_pool_name,
        to_project_name,
        to_project_id,
        CAST(sum(amount) AS INT) AS total_funding_usd
    FROM `oso_production.oss_funding_v0`
    GROUP BY 1, 2, 3, 4, 5
),
repos as (
    SELECT
        project_id,
        MAX_BY(artifact_url, star_count) AS git_repo_with_most_stars
    FROM `oso_production.repositories_v0`
    WHERE artifact_url IN UNNEST(@repo_urls)
    GROUP BY project_id
)
SELECT
    funding.* EXCEPT (to_project_id),
    repos.git_repo_with_most_stars
FROM funding
JOIN repos
    ON funding.to_project_id = repos.project_id
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter('repo_urls', 'STRING', repo_urls),
    ]
)

results = client.query(query, job_config=job_config)
df = results.to_dataframe()
df.tail(5)

Unnamed: 0,quarter,from_project_name,grant_pool_name,to_project_name,total_funding_usd,git_repo_with_most_stars
1537,2024-07-01 00:00:00+00:00,opencollective,contributions,mikemcl,1,https://github.com/mikemcl/bignumber.js
1538,2021-10-01 00:00:00+00:00,opencollective,contributions,wooorm,4620,https://github.com/wooorm/markdown-table
1539,2024-04-01 00:00:00+00:00,opencollective,contributions,alexeyraspopov,5,https://github.com/alexeyraspopov/picocolors
1540,2024-01-01 00:00:00+00:00,optimism,retropgf3,ipfs,790515,https://github.com/ipfs/js-ipfs
1541,2021-04-01 00:00:00+00:00,gitcoin,GG-09,ethers-io,5650,https://github.com/ethers-io/ethers.js


In [5]:
# Star/fork counts and primary language for every repository in the graph.
# The repo list is bound as an array query parameter instead of being spliced
# into the SQL text, so URL contents can never alter the query itself.
query = """
SELECT
    artifact_url,
    star_count,
    fork_count,
    language
FROM `oso_production.repositories_v0`
WHERE artifact_url IN UNNEST(@repo_urls)
ORDER BY star_count DESC
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter('repo_urls', 'STRING', repo_urls),
    ]
)

results = client.query(query, job_config=job_config)
df_repos = results.to_dataframe()
df_repos.tail(5)

Unnamed: 0,artifact_url,star_count,fork_count,language
1726,https://github.com/silentcicero/is-hex-prefixed,0,1,JavaScript
1727,https://github.com/prysmaticlabs/prombbolt,0,0,Starlark
1728,https://github.com/wealdtech/go-eth2-util,0,4,Go
1729,https://github.com/wealdtech/go-bytesutil,0,0,Go
1730,https://github.com/anacrolix/upnp,0,2,Go


# Simulate Pairwise comparisons and create an ELO rating

- First, we organize funding decisions into quarterly groups and compare projects within the same funding round (same funder and quarter) based on their relative funding amounts. This allows for a direct comparison of projects that were evaluated at the same time.
- Next, we use a basic ELO model and assign a rating for each project
- Finally, we run the analysis for web2 and web3 projects separately

In [6]:
def pairwise_simulation(dataframe):
    """Turn funding rows into head-to-head comparisons between co-funded projects.

    Rows are grouped by ``(from_project_name, quarter)``. Within every group
    that funds more than one distinct ``to_project_name``, each unordered pair
    of projects produces one output row whose weights are the two projects'
    shares of their combined ``total_funding_usd`` in that group.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``from_project_name``, ``quarter``,
        ``to_project_name``, ``total_funding_usd`` and
        ``git_repo_with_most_stars``.

    Returns
    -------
    pandas.DataFrame
        Columns ``funder``, ``quarter``, ``project_a``, ``project_b``,
        ``weight_a``, ``weight_b``. ``project_a``/``project_b`` hold the
        repository URL of each project, not its name. The columns are present
        even when no comparison could be generated.
    """
    columns = ['funder', 'quarter', 'project_a', 'project_b', 'weight_a', 'weight_b']
    data = []

    # Default groupby sorting yields rounds in ascending (funder, quarter) order.
    for (funder, quarter), round_df in dataframe.groupby(['from_project_name', 'quarter']):
        # Aggregate once per round instead of re-filtering the frame for every
        # pair; sort=False keeps projects in order of first appearance.
        per_project = round_df.groupby('to_project_name', sort=False).agg(
            amount=('total_funding_usd', 'sum'),
            repo=('git_repo_with_most_stars', 'first'),
        )
        if len(per_project) < 2:
            continue

        entries = list(zip(per_project['repo'], per_project['amount']))
        for (repo_a, amount_a), (repo_b, amount_b) in combinations(entries, 2):
            amount_total = amount_a + amount_b
            if amount_total == 0:
                # Neither project received anything: record an even split
                # rather than dividing by zero.
                weight_a = weight_b = 0.5
            else:
                weight_a = amount_a / amount_total
                weight_b = amount_b / amount_total
            data.append({
                'funder': funder,
                'quarter': quarter,
                'project_a': repo_a,
                'project_b': repo_b,
                'weight_a': weight_a,
                'weight_b': weight_b
            })

    return pd.DataFrame(data, columns=columns)

# Build the head-to-head table from the funding rows and preview its last rows.
df_pairwise = pairwise_simulation(df)
df_pairwise.tail(5)

Unnamed: 0,funder,quarter,project_a,project_b,weight_a,weight_b
25214,optimism,2024-10-01 00:00:00+00:00,https://github.com/vyperlang/vyper,https://github.com/bluealloy/revm,0.478621,0.521379
25215,optimism,2024-10-01 00:00:00+00:00,https://github.com/vyperlang/vyper,https://github.com/ethereum/solc-js,0.633212,0.366788
25216,optimism,2024-10-01 00:00:00+00:00,https://github.com/libp2p/go-libp2p,https://github.com/bluealloy/revm,0.744457,0.255543
25217,optimism,2024-10-01 00:00:00+00:00,https://github.com/libp2p/go-libp2p,https://github.com/ethereum/solc-js,0.845647,0.154353
25218,optimism,2024-10-01 00:00:00+00:00,https://github.com/bluealloy/revm,https://github.com/ethereum/solc-js,0.65285,0.34715


In [7]:
def expected_score(rating_a, rating_b):
    """Return the Elo-expected score of A against B, a value between 0 and 1.

    Equal ratings give 0.5; every 400 points by which B leads A multiplies
    the odds against A by ten.
    """
    rating_gap = rating_b - rating_a
    odds_against_a = 10 ** (rating_gap / 400)
    return 1 / (1 + odds_against_a)

def update_elo(rating_a, rating_b, score_a, k_factor):
    """Apply one Elo update for a single A-vs-B result.

    ``score_a`` is A's actual score (1 win, 0 loss, 0.5 draw); B's score is
    taken to be ``1 - score_a``. Both ratings move by ``k_factor`` times the
    gap between actual and expected score.

    Returns the pair ``(new_rating_a, new_rating_b)``.
    """
    expected_a = expected_score(rating_a, rating_b)

    # B's actual and expected scores are the complements of A's.
    score_b = 1 - score_a
    expected_b = 1 - expected_a

    delta_a = k_factor * (score_a - expected_a)
    delta_b = k_factor * (score_b - expected_b)
    return rating_a + delta_a, rating_b + delta_b
    
def elo_simulation(dataframe, initial_rating=1500, base_k=40, k_decay=5, error_scale=400):
    """Run a sequential Elo rating over a table of pairwise comparisons.

    Rows are processed in the frame's order, so the result depends on row
    order. For each row the side with ``weight_a`` above 0.5 is the winner,
    below 0.5 the loser; exactly 0.5 (or a NaN weight, which fails both
    comparisons) counts as a draw.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns ``project_a``, ``project_b`` and ``weight_a``.
        May be empty.
    initial_rating : number, default 1500
        Rating given to a project the first time it is seen.
    base_k : number, default 40
        K-factor of a project with no previous comparisons.
    k_decay : number, default 5
        A project's K-factor is ``base_k / (1 + appearances / k_decay)``;
        the K-factor used for a match is the mean of both sides' values.
    error_scale : number, default 400
        ``margin_of_error`` is ``error_scale / sqrt(appearances)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``project``, sorted by ``elo_rating`` descending, with the
        columns ``head-to-head_comparisons``, ``elo_rating`` and
        ``margin_of_error``. An empty input gives an empty frame with the
        same columns instead of raising.
    """
    elo_ratings = defaultdict(lambda: initial_rating)
    appearances = defaultdict(int)

    # itertuples avoids building a Series per row, unlike iterrows.
    for row in dataframe.itertuples(index=False):
        project_a = row.project_a
        project_b = row.project_b
        share_a = row.weight_a

        if share_a > 0.5:
            score_a = 1
        elif share_a < 0.5:
            score_a = 0
        else:
            score_a = 0.5

        # Projects with more history move less per match.
        k_a = base_k / (1 + appearances[project_a] / k_decay)
        k_b = base_k / (1 + appearances[project_b] / k_decay)
        k_factor = (k_a + k_b) / 2

        elo_ratings[project_a], elo_ratings[project_b] = update_elo(
            elo_ratings[project_a], elo_ratings[project_b], score_a, k_factor
        )
        appearances[project_a] += 1
        appearances[project_b] += 1

    # Every rated project has at least one appearance, so the sqrt is never zero.
    records = [
        {
            'project': project,
            'head-to-head_comparisons': appearances[project],
            'elo_rating': rating,
            'margin_of_error': error_scale / np.sqrt(appearances[project])
        }
        for project, rating in elo_ratings.items()
    ]

    # Naming the columns up front keeps sort_values/set_index valid when
    # there are no records.
    columns = ['project', 'head-to-head_comparisons', 'elo_rating', 'margin_of_error']
    return (
        pd.DataFrame(records, columns=columns)
        .sort_values(by='elo_rating', ascending=False)
        .set_index('project', drop=True)
    )

In [8]:
# Rate projects using every comparison whose funder is not Open Collective.
is_web3_round = df_pairwise['funder'].ne('opencollective')
web3 = elo_simulation(df_pairwise[is_web3_round])

# Remember which projects were rated here so the next run can exclude them.
web3_projects = web3.index.tolist()
web3.head()

Unnamed: 0_level_0,head-to-head_comparisons,elo_rating,margin_of_error
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://github.com/ethers-io/ethers.js,197,1699.417075,28.49882
https://github.com/prysmaticlabs/prysm,118,1690.587302,36.822985
https://github.com/sigp/lighthouse,207,1663.348353,27.801922
https://github.com/libp2p/go-libp2p,36,1609.626286,66.666667
https://github.com/wevm/viem,119,1573.109505,36.66794


In [9]:
# Keep only comparisons in which neither side was rated in the web3 run.
involves_web3 = (
    df_pairwise['project_a'].isin(web3_projects)
    | df_pairwise['project_b'].isin(web3_projects)
)
web2 = elo_simulation(df_pairwise[~involves_web3])
web2.head()

Unnamed: 0_level_0,head-to-head_comparisons,elo_rating,margin_of_error
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://github.com/webpack/webpack,1084,1860.673622,12.149135
https://github.com/babel/babel,1077,1854.420825,12.188553
https://github.com/vuejs/vue,1071,1802.685274,12.222647
https://github.com/eslint/eslint,1001,1783.548472,12.642791
https://github.com/mochajs/mocha,1088,1688.257094,12.126781


In [10]:
# Persist the pairwise table; the row index is written as the first column.
# NOTE(review): this path goes up three directory levels while the inputs
# above are read from two levels up - confirm the intended output location.
df_pairwise.to_csv('../../../pairwise_sim_results.csv', index=True)