In [1]:
from collections import defaultdict, Counter
import json
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

sys.path.append(os.path.abspath("../.."))
from utils.validate_weights import validate_weights
from utils.serialize_graph import convert_graph_to_serializable

# Load the unweighted graph

In [2]:
# Read the node-link JSON export and rebuild the NetworkX graph from it.
graph_path = '../../graph/unweighted_graph.json'
with open(graph_path, 'r') as graph_file:
    graph_data = json.load(graph_file)

G = nx.node_link_graph(graph_data)

# Node identifiers double as repository URLs; they are used further down to
# restrict the BigQuery event query to repos that appear in the graph.
repo_urls = list(G.nodes)

print("Nodes:", len(G.nodes))
print("Edges:", len(G.edges))

Nodes: 4303
Edges: 9896


In [3]:
# Baseline check before any weighting is applied. validate_weights is a project
# helper (utils/validate_weights); presumably it reports the summed outgoing
# edge weight per seed repository — TODO confirm against the helper.
validate_weights(G)

{'https://github.com/ethereum/solidity': 0.0,
 'https://github.com/vyperlang/vyper': 0.0,
 'https://github.com/ethereum/py-evm': 0.0,
 'https://github.com/ethereum/web3.py': 0.0,
 'https://github.com/paradigmxyz/reth': 0.0,
 'https://github.com/consensys/teku': 0.0,
 'https://github.com/prysmaticlabs/prysm': 0.0,
 'https://github.com/erigontech/erigon': 0.0,
 'https://github.com/web3/web3.js': 0.0,
 'https://github.com/grandinetech/grandine': 0.0,
 'https://github.com/chainsafe/lodestar': 0.0,
 'https://github.com/sigp/lighthouse': 0.0,
 'https://github.com/ethereum/go-ethereum': 0.0,
 'https://github.com/status-im/nimbus-eth2': 0.0,
 'https://github.com/ethereum/remix-project': 0.0,
 'https://github.com/safe-global/safe-smart-account': 0.0,
 'https://github.com/eth-infinitism/account-abstraction': 0.0,
 'https://github.com/ethereumjs/ethereumjs-monorepo': 0.0}

In [4]:
# Collect the 'relation' attribute of every edge that has one, then tally how
# many edges fall under each relation value.
relations = [
    edge_attrs['relation']
    for _source, _target, edge_attrs in G.edges(data=True)
    if 'relation' in edge_attrs
]
pkg_relation_counts = Counter(relations)
pkg_relation_counts

Counter({'NPM': 7814, 'RUST': 1349, 'GO': 598, 'PIP': 135})

# Grab GitHub event data from all relevant repos

- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Run the query below and export it to a Parquet file
- **Warning:** this query scans more than 100 GB of data

In [5]:
from google.cloud import bigquery

# replace with your path to credentials
GCP_CREDENTIALS_PATH = '../../oso_gcp_credentials.json'
# replace with your project name
GCP_PROJECT = 'opensource-observer'

# The Google client libraries read the service-account key location from this
# environment variable, so it has to be set before the client is created.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GCP_CREDENTIALS_PATH

client = bigquery.Client(project=GCP_PROJECT)

In [6]:
def stringify_array(arr):
    """Render strings as a comma-separated list of single-quoted SQL literals.

    Backslashes and single quotes inside each value are backslash-escaped so
    that a value such as ``it's`` cannot terminate the literal early and
    corrupt (or inject into) the surrounding query.

    Parameters
    ----------
    arr : iterable of str
        Values to quote.

    Returns
    -------
    str
        For example ``"'a','b'"``. An empty input yields ``"''"`` (a single
        empty literal), so ``IN (...)`` stays syntactically valid.
    """
    def _escape(value):
        # Escape the backslash first so the backslash added for quotes is not doubled.
        return value.replace("\\", "\\\\").replace("'", "\\'")

    return "'" + "','".join(_escape(value) for value in arr) + "'"

# Local cache of the query result. The BigQuery scan is expensive, so it only
# runs when this file is missing; delete the file to force a refresh.
events_path = '../../datasets/oso/github_events.parquet'

# The repository list is bound as the array parameter @repo_urls instead of
# being spliced into the SQL text, so URLs never need quoting or escaping.
#
# NOTE(review): despite its name, the `user_commit_counts` CTE counts events of
# every type (it has no event_type filter) — confirm that this is intended.
query = """
WITH user_commit_counts AS (
    SELECT
        users.artifact_source_id AS git_user_id,
        COUNT(DISTINCT EXTRACT(YEAR FROM events.time)) AS active_years,
        COUNT(*) AS total_commits
    FROM `oso_production.timeseries_events_by_artifact_v0` AS events
    JOIN `oso_production.artifacts_v1` AS users
      ON events.from_artifact_id = users.artifact_id
    WHERE users.artifact_name NOT LIKE '%[bot]%'
    GROUP BY users.artifact_source_id
    HAVING 
        active_years >= 2
        AND total_commits >= 10
)
SELECT
    EXTRACT(YEAR FROM events.time) AS year,
    events.event_type,
    users.artifact_source_id AS git_user_id,
    repos.artifact_url AS git_repo,
    SUM(events.amount) AS amount
FROM `oso_production.timeseries_events_by_artifact_v0` AS events
JOIN `oso_production.repositories_v0` AS repos
    ON events.to_artifact_id = repos.artifact_id
JOIN `oso_production.artifacts_v1` AS users
    ON events.from_artifact_id = users.artifact_id
JOIN user_commit_counts AS filtered_users
    ON users.artifact_source_id = filtered_users.git_user_id
WHERE
    repos.artifact_url IN UNNEST(@repo_urls)
GROUP BY 1, 2, 3, 4
"""

if not os.path.exists(events_path):
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter('repo_urls', 'STRING', repo_urls),
        ]
    )
    results = client.query(query, job_config=job_config)
    # Create the dataset folder on first run so the export cannot fail on a missing directory.
    os.makedirs(os.path.dirname(events_path), exist_ok=True)
    results.to_dataframe().to_parquet(events_path)

# Always load from the Parquet file so downstream cells see the same data
# whether or not the query ran in this session.
dataframe = pd.read_parquet(events_path)
dataframe.tail(1)

Unnamed: 0,year,event_type,git_user_id,git_repo,amount
666495,2024,ISSUE_COMMENT,382183,https://github.com/yarnpkg/berry,1.0


In [7]:
# Distinct users and distinct repositories present in the event table.
user_count = dataframe['git_user_id'].nunique()
indexed_repo_count = dataframe['git_repo'].nunique()

print(f"Users: {user_count}")
print(f"Indexed Repos: {indexed_repo_count}")

Users: 113236
Indexed Repos: 786


In [8]:
# Total `amount` per event type across the whole event table.
(
    dataframe
    .groupby('event_type')['amount']
    .sum()
)

event_type
COMMIT_CODE                     192680.0
FORKED                           99860.0
ISSUE_CLOSED                    152122.0
ISSUE_COMMENT                  1150326.0
ISSUE_OPENED                    124535.0
ISSUE_REOPENED                    7446.0
PULL_REQUEST_CLOSED             236648.0
PULL_REQUEST_MERGED             177137.0
PULL_REQUEST_OPENED             217284.0
PULL_REQUEST_REOPENED             5049.0
PULL_REQUEST_REVIEW_COMMENT     553608.0
RELEASE_PUBLISHED                10243.0
STARRED                         488372.0
Name: amount, dtype: float64

# Apply a basic weighting algorithm

In [9]:
# Constants
total_weight_cap = 0.8  # upper bound on the summed outgoing edge weight of one seed node
max_edge_weight = 0.2   # upper bound on any single edge weight

# Multipliers applied to the number of distinct users per event type
fork_weight = 1.0
star_weight = 0.5
other_weight = 0.25


def _raw_dependency_weights(events, contributors, dependencies):
    """Score each dependency by distinct-user activity from a set of contributors.

    Parameters
    ----------
    events : pd.DataFrame
        Event table with `git_user_id`, `git_repo` and `event_type` columns.
    contributors : array-like
        User ids whose events are kept.
    dependencies : iterable
        Repository URLs whose events are kept.

    Returns
    -------
    pd.Series
        Raw weight indexed by `git_repo`: distinct users with a FORKED event
        times `fork_weight`, plus distinct users with a STARRED event times
        `star_weight`, plus the per-type distinct-user counts of every other
        event type times `other_weight`. Empty when no event matches.
    """
    relevant = events[
        events['git_user_id'].isin(contributors)
        # Materialise the iterable: graph successor views are one-shot iterators.
        & events['git_repo'].isin(list(dependencies))
    ]
    if relevant.empty:
        # Nothing to score; skip the groupby/unstack on an empty frame.
        return pd.Series(dtype=float)

    # Rows: dependency repo. Columns: event type. Values: number of distinct users.
    user_counts = (
        relevant
        .groupby(['git_repo', 'event_type'])['git_user_id']
        .nunique()
        .unstack(fill_value=0)
    )

    forked = user_counts.get('FORKED', 0) * fork_weight
    starred = user_counts.get('STARRED', 0) * star_weight
    other = (
        user_counts
        .drop(columns=['FORKED', 'STARRED'], errors='ignore')
        .sum(axis=1)
        * other_weight
    )
    return forked + starred + other


def _cap_and_redistribute(raw_weights, total_cap, edge_cap):
    """Normalise raw weights to `total_cap`, cap each edge, and redistribute once.

    The weights are scaled so they sum to `total_cap`, each is clipped to
    `edge_cap`, and the weight removed by clipping is handed out in proportion
    to the clipped weights, measured against the total of the edges that were
    below the cap. The result is clipped to `edge_cap` again.

    NOTE: the redistribution is a single pass. If the extra share pushes an
    edge over `edge_cap`, the excess is dropped rather than redistributed
    again, so the final sum can end up below `total_cap`.

    Parameters
    ----------
    raw_weights : pd.Series
        Non-normalised weight per dependency.
    total_cap : float
        Target sum of the normalised weights.
    edge_cap : float
        Maximum weight of a single edge.

    Returns
    -------
    pd.Series
        Final weight per dependency, same index as `raw_weights`.
    """
    raw_total = raw_weights.sum()
    if raw_total > 0:
        normalized = (raw_weights / raw_total) * total_cap
    else:
        normalized = pd.Series(0.0, index=raw_weights.index)

    capped = normalized.clip(upper=edge_cap)
    remaining = total_cap - capped.sum()

    below_cap_total = capped[normalized < edge_cap].sum()
    if below_cap_total > 0:
        # Edges already at the cap also receive a share here, but the minimum
        # below brings them straight back to `edge_cap`.
        extra = (capped / below_cap_total) * remaining
        return np.minimum(capped + extra, edge_cap)
    return capped


# Step 1: Pre-calculate the distinct users with COMMIT_CODE events for every repo
contributions = (
    dataframe[dataframe['event_type'] == 'COMMIT_CODE']
    .groupby('git_repo')['git_user_id']
    .unique()
)

# Step 2: Weight the outgoing edges of every seed node (level 1)
for seed_node, node_data in G.nodes(data=True):

    if node_data.get('level') != 1:
        continue

    # Seeds without any COMMIT_CODE events fall back to an empty contributor set
    contributors = contributions.get(seed_node, np.array([]))

    # Steps 3-6: raw score, then normalise, cap and redistribute
    raw_weights = _raw_dependency_weights(dataframe, contributors, G.successors(seed_node))
    final_weights = _cap_and_redistribute(raw_weights, total_weight_cap, max_edge_weight)

    # Step 7: Assign weights back to edges. Edges whose dependency has no
    # matching events are left untouched.
    for dep, weight in final_weights.items():
        if G.has_edge(seed_node, dep):
            G[seed_node][dep]['weight'] = weight

In [10]:
# For every node that has weighted outgoing edges, print the share it keeps
# (1 minus the sum of its edge weights) and its ten heaviest edges.
for seed_node in G.nodes():
    weighted_edges = []
    for dep in G.successors(seed_node):
        edge_attrs = G[seed_node][dep]
        if 'weight' in edge_attrs:
            weighted_edges.append((dep, edge_attrs['weight']))

    if not weighted_edges:
        continue

    # Heaviest first; ties keep their original successor order.
    weighted_edges.sort(key=lambda pair: pair[1], reverse=True)

    total_weights = sum(weight for _, weight in weighted_edges)
    print(f"\nSeed Node: {seed_node} | Weight: {1 - total_weights:.4f}")
    for dep, weight in weighted_edges[:10]:
        print(f"- Dependent: {dep} | Weight: {weight:.4f}")


Seed Node: https://github.com/ethereum/solidity | Weight: 1.0000
- Dependent: https://github.com/taminomara/sphinx-a4doc | Weight: 0.0000
- Dependent: https://github.com/sphinx-doc/sphinx | Weight: 0.0000
- Dependent: https://github.com/readthedocs/sphinx_rtd_theme | Weight: 0.0000

Seed Node: https://github.com/vyperlang/vyper | Weight: 1.0000
- Dependent: https://github.com/pypa/wheel | Weight: 0.0000
- Dependent: https://github.com/pypa/packaging | Weight: 0.0000
- Dependent: https://github.com/gristlabs/asttokens | Weight: 0.0000
- Dependent: https://github.com/lepture/shibuya | Weight: 0.0000
- Dependent: https://github.com/agronholm/cbor2 | Weight: 0.0000
- Dependent: https://github.com/legrandin/pycryptodome | Weight: 0.0000
- Dependent: https://github.com/python/importlib_metadata | Weight: 0.0000
- Dependent: https://github.com/executablebooks/sphinx-copybutton | Weight: 0.0000
- Dependent: https://github.com/sphinx-doc/sphinx | Weight: 0.0000

Seed Node: https://github.com/e

# Validate the weights and export the graph to JSON

In [11]:
# Re-run the weight validation now that edges carry weights. The result is
# keyed by level-1 (source) node, per the label printed here.
print("\nLevel 1 Nodes (Sources) and Their Summed Edge Weights:")
source_weights = validate_weights(G)
# Wrap the mapping in a Series for a compact, aligned display.
pd.Series(source_weights)


Level 1 Nodes (Sources) and Their Summed Edge Weights:


https://github.com/ethereum/solidity                     0.000000
https://github.com/vyperlang/vyper                       0.000000
https://github.com/ethereum/py-evm                       0.200000
https://github.com/ethereum/web3.py                      0.800000
https://github.com/paradigmxyz/reth                      0.800000
https://github.com/consensys/teku                        0.200000
https://github.com/prysmaticlabs/prysm                   0.800000
https://github.com/erigontech/erigon                     0.800000
https://github.com/web3/web3.js                          0.800000
https://github.com/grandinetech/grandine                 0.200000
https://github.com/chainsafe/lodestar                    0.800000
https://github.com/sigp/lighthouse                       0.771429
https://github.com/ethereum/go-ethereum                  0.684211
https://github.com/status-im/nimbus-eth2                 0.400000
https://github.com/ethereum/remix-project                0.776471
https://gi

In [12]:
# convert_graph_to_serializable is a project helper (utils/serialize_graph);
# presumably it turns non-JSON attribute values (e.g. NumPy scalars) into
# plain Python types — TODO confirm against the helper.
G_serializable = convert_graph_to_serializable(G)
graph_json = nx.node_link_data(G_serializable)

output_path = "../../graph/weighting_examples/oso_p2p_contributions_weighting.json"
# Create the target folder on first run so the export does not fail with
# FileNotFoundError when the directory is missing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    json.dump(graph_json, f, indent=2)