In [1]:
from collections import Counter
import json
import networkx as nx
import numpy as np
import pandas as pd
import re
from oso import fetch_data

# Load the unweighted graph

In [2]:
def check_weight(g):
    """Return the sum of the 'weight' attribute over all edges of ``g``.

    ``g`` must expose ``edges(data=True)`` yielding ``(u, v, attrs)`` tuples,
    and every ``attrs`` dict must contain a 'weight' key (a missing key
    raises ``KeyError``). An edgeless graph yields 0.
    """
    total = 0
    for *_endpoints, attrs in g.edges(data=True):
        total += attrs['weight']
    return total

def stringify_array(arr):
    """Render an iterable as a comma-separated list of single-quoted SQL literals.

    The result is meant to be dropped inside ``in (...)`` in a query string,
    e.g. ``['a', 'b']`` becomes ``'a','b'``. Items are converted with ``str()``
    so non-string values (ints, numpy scalars) no longer make ``join`` raise.
    An empty iterable yields ``''`` (a single empty literal), matching the
    previous behavior.

    Backslashes and single quotes inside a value are backslash-escaped so a
    value cannot terminate the literal early.
    NOTE(review): backslash escaping matches BigQuery-style string literals,
    which is what the queries in this notebook appear to target - confirm if
    the helper is reused against another SQL dialect.
    """
    def quote(value):
        # Escape the backslash first so the escapes added for quotes are not doubled.
        text = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{text}'"

    literals = [quote(value) for value in arr]
    if not literals:
        return "''"
    return ",".join(literals)

In [3]:
# Read the serialized graph (node-link JSON) from disk.
with open('data/unweighted_graph.json', 'r') as graph_file:
    graph_data = json.load(graph_file)

# Rebuild the networkx graph; the trailing expression displays the total
# edge weight as the cell output.
G = nx.node_link_graph(graph_data)
check_weight(G)

0

In [4]:
# Number of edges in the loaded graph.
G.number_of_edges()

9873

In [5]:
# Collect the 'relation' attribute of every edge that has one, then tally
# how many edges fall under each relation value.
relations = [
    edge_attrs['relation']
    for *_endpoints, edge_attrs in G.edges(data=True)
    if 'relation' in edge_attrs
]
pkg_relation_counts = Counter(relations)
pkg_relation_counts

Counter({'NPM': 7825, 'RUST': 1351, 'GO': 561, 'PIP': 136})

# Grab some basic GitHub stats

In [6]:
# Owners listed here are left out of the owner set built below.
ignore_list = ['facebook', 'huggingface', 'webpack', 'babel', 'pandas-dev', 'pnpm', 'eslint', 'numpy']

# The first path segment after the GitHub host is the repository owner.
github_owner_pattern = re.compile(r"https://github\.com/([^/]+)")
owner_matches = (github_owner_pattern.match(node) for node in G.nodes)

# Keep one entry per owner, skipping non-GitHub nodes and ignored owners.
owners = {
    found.group(1)
    for found in owner_matches
    if found and found.group(1) not in ignore_list
}
len(owners)

1730

In [7]:
# `owners` is a set, and set iteration order for strings changes between
# kernels (string hashing is randomized). Sorting makes the generated SQL
# text identical on every run; the rows matched by `in (...)` are unaffected.
repo_metrics_query = f"""
    select distinct * except(project_id, artifact_source)
    from `oso.int_repo_metrics_by_project`
    where artifact_namespace in ({stringify_array(sorted(owners))})
"""

# NOTE(review): `fetch_data` is a project helper; the second argument looks
# like a local cache/archive path for the result - confirm against its definition.
df_repo_metrics = fetch_data(repo_metrics_query, 'data/archive/repo_metrics.csv', connect_to_oso=True)

# Build the repo URL column-wise instead of with a row-wise apply(axis=1):
# same strings, no Python-level loop, and it is well-defined on an empty frame.
# astype(str) keeps the f-string behavior of stringifying every value.
df_repo_metrics['github_url'] = (
    "https://github.com/"
    + df_repo_metrics['artifact_namespace'].astype(str)
    + "/"
    + df_repo_metrics['artifact_name'].astype(str)
)
df_repo_metrics.tail()

Unnamed: 0,artifact_id,artifact_namespace,artifact_name,is_fork,fork_count,star_count,watcher_count,language,license_spdx_id,created_at,updated_at,first_commit_time,last_commit_time,days_with_commits_count,contributors_to_repo_count,commit_count,github_url
4008,6fxr8I3LEWIJbAn4FMC4H9lC9FOy2_iIRVEexGio82I=,prysmaticlabs,prysm-testnet-site,False,21,4,4,TypeScript,,2019-02-15 17:13:18+00:00,2024-01-31 06:25:45+00:00,2019-03-18 15:04:07+00:00,2020-11-04 20:15:10+00:00,42,8,248.0,https://github.com/prysmaticlabs/prysm-testnet...
4009,IJoXe4rw355BlweIwdS4FCc_4wmBuZG3b8lEIfl8ngI=,walletconnect,verify-server,False,3,7,7,HCL,MIT,2023-01-23 20:27:03+00:00,2024-10-01 03:08:32+00:00,2023-01-23 20:42:48+00:00,2024-06-27 17:03:37+00:00,49,8,193.0,https://github.com/walletconnect/verify-server
4010,27atwT4IUa3Q5bHROXMU93w38KWR3gCuRTcrLFY5cBA=,walletconnect,notify-server,False,6,20,20,Rust,MIT,2023-07-26 14:01:09+00:00,2024-11-19 07:08:43+00:00,2023-07-26 17:05:19+00:00,2024-05-17 17:53:21+00:00,129,8,742.0,https://github.com/walletconnect/notify-server
4011,tmL0p2YwVc0I7JkkSuJkibr-iUQSkmHp2I2a1Rw6NhA=,walletconnect,actions,False,1,10,10,,,2022-06-22 22:07:15+00:00,2024-10-13 05:17:13+00:00,2022-06-22 22:12:48+00:00,2024-10-09 14:15:43+00:00,37,8,59.0,https://github.com/walletconnect/actions
4012,1Jus8lqY4Kushe1BiBrNqu_Kv6TmOE5wimRqTTQANss=,walletconnect,keys-server,False,8,15,15,HCL,MIT,2022-06-16 10:13:09+00:00,2024-10-13 05:38:46+00:00,2022-07-07 07:28:38+00:00,2024-10-09 10:46:01+00:00,69,8,216.0,https://github.com/walletconnect/keys-server


# Apply a basic (naive) weighting algorithm

In [8]:
# Lookup table: repository URL -> star count. If a URL appears on several
# rows, the last row wins.
repo_star_map = dict(
    zip(df_repo_metrics['github_url'], df_repo_metrics['star_count'])
)

def calculate_edge_weight(source_stars, target_stars, relation_count):
    """Weight an edge from the star counts of its two endpoints.

    The weight is the harmonic mean of ``log1p(source_stars)`` and
    ``log1p(target_stars)``, divided by ``relation_count`` (plus a small
    epsilon so the divisor is never exactly zero).

    Parameters
    ----------
    source_stars, target_stars : number
        Star counts of the two repositories.
    relation_count : number
        Number of edges sharing this edge's relation type.

    Returns
    -------
    float
        The edge weight. Returns 0.0 when the harmonic mean is undefined:
        both star counts are 0 (a 0/0 division) or a star count is NaN.
        Without this guard a single NaN weight would turn the graph's total
        weight, and therefore every normalized weight, into NaN.
    """
    EPSILON = 1e-6
    log_source = np.log1p(source_stars)
    log_target = np.log1p(target_stars)

    denominator = log_source + log_target
    if np.isnan(denominator) or denominator == 0:
        return 0.0

    harmonic_mean = (2 * log_source * log_target) / denominator
    return harmonic_mean / (relation_count + EPSILON)

# Assign a raw weight to every edge. Repos missing from the star map count as
# having 0 stars.
for source_repo, target_repo, data in G.edges(data=True):
    source_stars = repo_star_map.get(source_repo, 0)
    target_stars = repo_star_map.get(target_repo, 0)
    # data.get(): an edge without a 'relation' attribute falls back to a count
    # of 1 instead of raising KeyError. The default of 1 also avoids div by zero.
    relation_count = pkg_relation_counts.get(data.get('relation'), 1)
    data['weight'] = calculate_edge_weight(source_stars, target_stars, relation_count)

# Normalize so the edge weights sum to 1. Skip the division when the total is
# zero (no edges, or every weight is 0) or NaN, since dividing would only
# produce NaN/inf weights.
total_weight = check_weight(G)
if total_weight > 0:
    for u, v, data in G.edges(data=True):
        data['weight'] /= total_weight

In [9]:
# Weighted degree of a node = sum of the weights on the edges that
# G.edges(node) reports for it.
weighted_degree = {
    node: sum(data['weight'] for _, _, data in G.edges(node, data=True))
    for node in G.nodes
}

# Nodes without a 'level' attribute are assigned level 2.
node_metrics = pd.DataFrame({
    'Node': list(weighted_degree.keys()),
    'Weighted Degree': list(weighted_degree.values()),
    'Level': [G.nodes[node].get('level', 2) for node in G.nodes],
})

# Top 20 nodes per level, heaviest first. Sorting once and taking
# groupby(...).head(20) replaces groupby.apply on a frame that still contains
# the grouping column: that pattern is deprecated in pandas 2.2+ and will stop
# passing 'Level' through to the result. The original row index is preserved.
top_nodes_by_level = (
    node_metrics
    .sort_values(by=['Level', 'Weighted Degree'], ascending=[True, False])
    .groupby('Level', sort=False)
    .head(20)
)

top_nodes_by_level

Unnamed: 0,Node,Weighted Degree,Level
2,https://github.com/erigontech/erigon,0.265547,1
0,https://github.com/prysmaticlabs/prysm,0.258717,1
3,https://github.com/ethereum/web3.py,0.164212,1
11,https://github.com/paradigmxyz/reth,0.067886,1
12,https://github.com/sigp/lighthouse,0.058792,1
1,https://github.com/ethereum/go-ethereum,0.057506,1
7,https://github.com/ethereum/py-evm,0.032635,1
13,https://github.com/ethereum/remix-project,0.022731,1
10,https://github.com/grandinetech/grandine,0.022188,1
9,https://github.com/web3/web3.js,0.014102,1


In [10]:
# Serialize the weighted graph back to node-link JSON on disk.
output_path = "data/example_weighted_graph.json"
graph_json = nx.node_link_data(G)
with open(output_path, "w") as out_file:
    json.dump(graph_json, out_file, indent=2)

# Demo of how to grab OSO raw event data

In [11]:
# Distinct artifact ids present in the repo metrics table.
artifact_id_column = df_repo_metrics['artifact_id']
artifact_ids = artifact_id_column.unique()
len(artifact_ids)

4013

In [12]:
# get all GitHub activity to the repos we care about

# The query sums `amount` per (month, user, org, repo, event type) over the
# events whose target artifact is one of `artifact_ids`, keeping only events
# dated 2020-01-01 or later.
# NOTE(review): the id list is interpolated directly into the SQL text, so the
# statement grows with len(artifact_ids) - confirm the backend accepts a query
# of that size.
event_query = f"""
    select
      date_trunc(time, MONTH) as event_month,
      from_artifact_name as git_user,
      to_artifact_namespace as git_org,
      to_artifact_name as git_repo,
      event_type,
      sum(amount) as amount
    from `oso.int_events__github`
    where
        to_artifact_id in ({stringify_array(artifact_ids)})
        and time >= '2020-01-01'
    group by 1,2,3,4,5
"""

# NOTE(review): `fetch_data` is a project helper; the second argument looks
# like a local cache/archive path for the result - confirm against its definition.
df_events = fetch_data(event_query, 'data/archive/events.parquet', connect_to_oso=True)
df_events.tail()

Unnamed: 0,event_month,git_user,git_org,git_repo,event_type,amount
1242275,2022-07-01 00:00:00+00:00,julianosilva94,nomicfoundation,hardhat,STARRED,1.0
1242276,2022-07-01 00:00:00+00:00,danielattilasimon,nomicfoundation,hardhat-vscode,ISSUE_COMMENT,2.0
1242277,2024-12-01 00:00:00+00:00,galargh,nomicfoundation,hardhat,PULL_REQUEST_CLOSED,1.0
1242278,2022-12-01 00:00:00+00:00,0xfreeman,nomicfoundation,hardhat,STARRED,1.0
1242279,2022-05-01 00:00:00+00:00,pdyraga,nomicfoundation,hardhat,ISSUE_COMMENT,2.0


In [13]:
# Headline counts: repos queried and distinct GitHub users seen in the events.
repo_count = len(artifact_ids)
git_user_count = df_events['git_user'].nunique()
print("Repos:", repo_count)
print("Git Users:", git_user_count)

# Total event amount per event type (displayed as the cell output).
amount_by_event_type = df_events.groupby('event_type')['amount'].sum()
amount_by_event_type

Repos: 4013
Git Users: 288610


event_type
COMMIT_CODE                    540432.0
FORKED                         201579.0
ISSUE_CLOSED                   128122.0
ISSUE_COMMENT                  878610.0
ISSUE_OPENED                   220186.0
ISSUE_REOPENED                   4537.0
PULL_REQUEST_CLOSED            386065.0
PULL_REQUEST_MERGED            317562.0
PULL_REQUEST_OPENED            391817.0
PULL_REQUEST_REOPENED            3363.0
PULL_REQUEST_REVIEW_COMMENT    533785.0
RELEASE_PUBLISHED               30204.0
STARRED                        454491.0
Name: amount, dtype: float64