In [1]:
from google.cloud import bigquery
import json
import os
import pandas as pd
from urllib.parse import urlparse

In [2]:
# Point Google's client libraries at a credentials JSON file (the path is relative to
# the notebook's working directory), then create the BigQuery client used for queries below.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../gcp_credentials.json'
client = bigquery.Client()

In [3]:
# Load the RetroPGF1 application records. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open("data/FIL_RetroPGF1_applications.json", "r", encoding="utf-8") as f:
    projects = json.load(f)

In [4]:
# The commented-out query below reads the same kind of table from BigQuery
# (presumably the source the CSV was exported from — TODO confirm).
# query = """
# select * from `opensource-observer.oso.int_repo_metrics_by_project`
# """
# result = client.query(query)
# REPOS = result.to_dataframe()

# NOTE(review): absolute, machine-specific path — the notebook cannot run on another
# machine until this points at a shared or relative location.
REPOS = pd.read_csv("/Users/cerv1-air/Downloads/bquxjob_30727923_18fbbe086f4.csv")

# "owner/repo" key for each row, built with a vectorised column concatenation
# instead of a row-by-row apply.
REPOS['repo_name_with_owner'] = REPOS['artifact_namespace'] + '/' + REPOS['artifact_name']
REPOS.head(1)

Unnamed: 0,project_id,artifact_id,artifact_namespace,artifact_name,is_fork,fork_count,star_count,watcher_count,language,license_spdx_id,first_commit_time,last_commit_time,days_with_commits_count,contributors_to_repo_count,repo_name_with_owner
0,JGeDyNHH6QCFwyTjHD4pnu6ZY-fiEhk31FoqmEzCZXc=,rlL-L08is_caHSQVQS_J6RrfH4gan1vBehjB0UEkin4=,wanchain,explore-wanchain,False,8,6,6,CSS,Apache-2.0,2020-11-17 04:54:47.000000 UTC,2023-06-07 05:20:29.000000 UTC,33.0,4.0,wanchain/explore-wanchain


In [5]:
# Lookup tables keyed by lower-cased GitHub identifiers.
# When the same key appears on several rows, the last row's value is kept.

# "owner/repo" -> artifact_id
REPOS_TO_ARTIFACT_IDS = {
    repo_key: artifact_id
    for repo_key, artifact_id in zip(REPOS['repo_name_with_owner'].str.lower(), REPOS['artifact_id'])
}

# "owner/repo" -> project_id
REPOS_TO_IDS = {
    repo_key: project_id
    for repo_key, project_id in zip(REPOS['repo_name_with_owner'].str.lower(), REPOS['project_id'])
}

# Distinct (namespace, project) pairs, then "owner" -> project_id
OWNERS = REPOS.loc[:, ['artifact_namespace', 'project_id']].drop_duplicates()
OWNERS_TO_IDS = {
    owner_key: project_id
    for owner_key, project_id in zip(OWNERS['artifact_namespace'].str.lower(), OWNERS['project_id'])
}

In [19]:
def process_github(url):
    """
    Normalise a GitHub URL to a lower-cased ``owner`` or ``owner/repo`` slug.

    Args:
        url: A GitHub URL, with or without a scheme, e.g.
            ``https://github.com/owner/repo/tree/main``,
            ``github.com/owner`` or ``https://github.com/orgs/owner/repositories``.

    Returns:
        ``"owner/repo"`` when the path names a repository, ``"owner"`` when it
        names only a user/organisation (including ``/orgs/<owner>/...`` URLs),
        or ``None`` when no owner can be extracted or ``url`` is not a string.
    """
    if not isinstance(url, str):
        return None

    parsed_url = urlparse(url.strip())
    # Drop empty segments so leading, trailing and doubled slashes are ignored.
    path_parts = [part for part in parsed_url.path.split('/') if part]

    # Without a scheme, urlparse leaves the host inside the path; skip it.
    if (
        not parsed_url.netloc
        and path_parts
        and path_parts[0].lower() in ('github.com', 'www.github.com')
    ):
        path_parts = path_parts[1:]

    if not path_parts:
        return None

    # Only a leading "orgs" segment marks an organisation URL; a repository or
    # sub-path that happens to be called "orgs" must not be treated as one.
    if path_parts[0].lower() == 'orgs':
        return path_parts[1].lower() if len(path_parts) >= 2 else None

    if len(path_parts) == 1:
        return path_parts[0].lower()

    owner, repo = path_parts[0], path_parts[1]
    # Clone URLs end in ".git", which is not part of the repository name.
    if repo.lower().endswith('.git'):
        repo = repo[:-4]
    if not repo:
        return owner.lower()
    return f"{owner}/{repo}".lower()

def map_to_oso(github):
    """
    Look up the project id for a GitHub slug.

    A slug containing ``/`` is looked up in ``REPOS_TO_IDS``; any other string
    is looked up in ``OWNERS_TO_IDS``. Returns ``None`` for non-string input
    or when the slug is not present in the chosen table.
    """
    if isinstance(github, str):
        lookup = REPOS_TO_IDS if '/' in github else OWNERS_TO_IDS
        return lookup.get(github)
    return None

In [28]:
# Minimum star count a repo needs to be kept when an applicant matches more than one repo.
MIN_STAR_COUNT = 10

github_links = []   # every normalised GitHub slug found in the applications
data = []           # one summary record per applicant
sankey_data = []    # one (applicant, org, repo, artifact) record per matched repo
artifact_ids = []   # artifact ids of all matched repos

# process_github returns lower-cased slugs and the lookup dicts are keyed by
# lower-cased names, so the REPOS columns are compared in lower case as well.
repo_names_lower = REPOS['repo_name_with_owner'].str.lower()
namespaces_lower = REPOS['artifact_namespace'].str.lower()

for p in projects:
    app = p['app']
    links = app['contributionLinks']

    # Unique, non-empty slugs in a stable order (a bare "github.com" link yields no slug).
    githubs = sorted({
        slug
        for slug in (process_github(x['url']) for x in links if 'github.com' in x['url'].lower())
        if slug
    })
    github_links.extend(githubs)

    # A repo matches either by its full "owner/repo" name or by its owning namespace.
    r = REPOS[repo_names_lower.isin(githubs) | namespaces_lower.isin(githubs)]
    # The star threshold is applied only when several repos matched; a single match is kept as is.
    if len(r) > 1:
        r = r[r['star_count'] >= MIN_STAR_COUNT]
    github_repos_oso = sorted(r['repo_name_with_owner'].unique())
    artifact_ids.extend(r['artifact_id'])

    data.append({
        'id': p['id'],
        'name': p['name'],
        'bio': app['bio'],
        'contribution': app['contributionDescription'],
        'impact': app['impactDescription'],
        'category': app['impactCategory'],
        'github_links_app': githubs,
        'github_repos_oso': github_repos_oso,
        'github_repos_count': len(github_repos_oso),
        'fork_count': r['fork_count'].sum(),
        'star_count': r['star_count'].sum(),
        'first_commit': r['first_commit_time'].dropna().min(),
        'last_commit': r['last_commit_time'].dropna().max()
    })

    for repo in github_repos_oso:
        sankey_data.append({
            'project_applicant': p['name'],
            'github_org': repo.split('/')[0],
            'github_repo': repo,
            # REPOS_TO_ARTIFACT_IDS is keyed by lower-cased repo names.
            'artifact_id': REPOS_TO_ARTIFACT_IDS.get(repo.lower())
        })

# De-duplicate: the same repo can be matched by more than one applicant.
artifact_ids = list(set(artifact_ids))

len(data)

106

In [29]:
# Save the REPOS rows whose artifact_id was matched to an applicant as a snapshot CSV.
REPOS[REPOS['artifact_id'].isin(artifact_ids)].to_csv("data/FIL_RetroPGF1_repo_snapshot.csv")

In [30]:
# This len() is not the cell's last expression, so its value is not displayed.
len(artifact_ids)
# Comma-separated list of single-quoted ids: 'id1','id2',...
artifacts_ids_str = "'" + "','".join(artifact_ids) + "'"

In [31]:
# Daily commit events since 2021-01-01 for the matched repos.
# The artifact ids are passed as a BigQuery array parameter rather than being
# spliced into the SQL text, so the query does not depend on hand-built quoting.
query = """
select *
from `opensource-observer.oso.int_events_daily_to_project`
where
    to_artifact_id in unnest(@artifact_ids)
    and event_type = 'COMMIT_CODE'
    and bucket_day >= '2021-01-01'
"""
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("artifact_ids", "STRING", artifact_ids),
    ]
)
result = client.query(query, job_config=job_config)
artifact_data = result.to_dataframe()

In [32]:
# Save the query result to CSV (the DataFrame index is written as the first column).
artifact_data.to_csv('data/FIL_RetroPGF1_commits.csv')

In [33]:
# Show the last row as a quick check of the returned columns.
artifact_data.tail(1)

Unnamed: 0,project_id,from_artifact_id,to_artifact_id,event_source,event_type,bucket_day,amount
20146,FRm_4WZ2s9f4JOO6BqAEiYa2Qe9KU6LWBnC2uPSMUpI=,lstvFeR07thQG6lQUxlylRFR4kDHE7wAYYXNdhxAzAU=,yInel_bC6f4BeZ6QUrltz1puJvQxKmI9qUGSohQeCf0=,GITHUB,COMMIT_CODE,2024-04-23 00:00:00+00:00,1.0


In [34]:
# Work on a copy of the commit events so the added columns do not touch artifact_data.
a = artifact_data.loc[artifact_data['event_type'] == 'COMMIT_CODE'].copy()

# Calendar year and month of each daily bucket.
a['year'] = a['bucket_day'].map(lambda ts: ts.year)
a['month'] = a['bucket_day'].map(lambda ts: ts.month)

# Distinct commit days per (from_artifact_id, year, month, to_artifact_id);
# keep only the combinations with at least 3 such days.
mads = (
    a
    .groupby(['from_artifact_id', 'year', 'month', 'to_artifact_id'])['bucket_day']
    .nunique()
    .reset_index()
)
mads = mads.loc[mads['bucket_day'] >= 3]

# Number of qualifying rows per to_artifact_id. Each row is one
# (from_artifact_id, year, month), so a from_artifact_id that qualifies in
# several months is counted once per month.
devs = mads.groupby('to_artifact_id')['from_artifact_id'].count()
devs

to_artifact_id
-8UKPG8lT9DEAf6rDGHGoHNhN7DQTEXMZQwY64wxgAE=      8
-d4ItIxU4u72nE7Fllx6fev2gVvVNAmJA8FxAOhImJ8=      5
0LObUd0agU0_I4eCZ5PjaAa5uiuKHBgKQlW4oKjHJGU=      6
16V8WB9Rs_WGb0qcBTLWlqHT6WcLLRrn917SnFJj0PM=     10
1gvafhm6GqCqEidh467LU-FOB7CfQWRtfOm9s5tM0rM=      5
                                               ... 
y6RPxCQQczIHHXH4DWB1jMbgNXzVv3fEX1JCvtV3q_c=      7
y7E_pBqUub6t4UDmk2MmeO_4UD8hAJLieYhD3ls5sbk=     19
yInel_bC6f4BeZ6QUrltz1puJvQxKmI9qUGSohQeCf0=    129
z04uiGvTjGFCIVj-bCO6HLQdL-M9s2vIKfpUd41hbhQ=      1
z5RIyGXBvqG1EyvsvTNo67ohG-dCdfMuD0c3rYssCNo=      8
Name: from_artifact_id, Length: 173, dtype: int64

In [39]:
# NOTE(review): `project_repos` and `project_orgs` are not defined in the visible
# cells, so this cell appears to rely on hidden kernel state — confirm where they
# are created before relying on Restart & Run All.
# Rows matched by full "owner/repo" name ...
df1 = REPOS[REPOS['repo_name_with_owner'].isin(project_repos)]
# ... and rows matched by owning namespace.
df2 = REPOS[REPOS['artifact_namespace'].isin(project_orgs)]
# Stack both selections and drop rows that were picked up by both filters.
df = pd.concat([df1, df2], axis=0, ignore_index=True).drop_duplicates()
df

Unnamed: 0,project_id,artifact_id,artifact_namespace,artifact_name,is_fork,fork_count,star_count,watcher_count,language,license_spdx_id,first_commit_time,last_commit_time,days_with_commits_count,contributors_to_repo_count,repo_name_with_owner
0,FRm_4WZ2s9f4JOO6BqAEiYa2Qe9KU6LWBnC2uPSMUpI=,fyWb_5WlHrUyHO9qNKeF0o9fo5SPMwJFqc5ysCe4z5U=,filecoin-project,community,False,164,472,472,,NOASSERTION,2019-02-14 20:16:56.000000 UTC,2022-10-31 11:25:20.000000 UTC,66.0,16.0,filecoin-project/community
1,ogxNhIfrVaknEjgkLw_x3MDkMWhFxIOxPE-dTugDrJo=,kvlbm0sQK2mMpHQNdj4Kq5ChDsYUucPMA0SZfQwukKU=,filswan,go-swan-client,False,17,18,18,Go,,2021-10-19 16:01:28.000000 UTC,2023-03-08 08:47:04.000000 UTC,48.0,6.0,filswan/go-swan-client
2,BaZZmxNfgooKGgv-pndm7EIOfdf7AZoY00MGQ26duvI=,QufHNcdAbZOAq69c_G-kpuN3nb5eWB2UMXOqwWYGAT0=,consensus-shipyard,ipc,False,25,28,28,Rust,Apache-2.0,2023-12-06 14:38:52.000000 UTC,2024-05-22 11:34:04.000000 UTC,86.0,10.0,consensus-shipyard/ipc
3,gRpsb_h_DILsv5I2bZ80GgfDOabbU8lWY9yjiyMSe0A=,c2kEx5wGDFOfZFtWdrjYNoMrvYSrberhN0uxgeQSL0U=,drand,drand,False,107,707,707,Go,NOASSERTION,2017-09-10 16:28:14.000000 UTC,2024-04-24 13:12:49.000000 UTC,310.0,15.0,drand/drand
4,IpWllC86rHt2tqK6f384F8mvBUde0Y0cjd_cV2l96Io=,ND0mLlGJh7vG-G5YLG-Iq2k0ggsHJKIju-xIsZrLaUU=,filecoin-station,spark,False,1,4,4,JavaScript,NOASSERTION,2023-05-16 08:34:20.000000 UTC,2024-05-16 10:10:54.000000 UTC,29.0,2.0,filecoin-station/spark
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,_-S_4OPQhIH4TID_hmmAA5OXTOHKI4iN4pvDgt3elQk=,bNLwAm_O0UZW5a2k_IjZWTPqiq203ztf5UihkmjhWo8=,ipfs,go-ds-sql,False,19,31,31,Go,MIT,2016-03-12 00:39:29.000000 UTC,2024-03-22 07:18:41.000000 UTC,20.0,10.0,ipfs/go-ds-sql
1014,DIFwzDKrLDsrfbdzXMd0V-K852SAS7sR0loKq0bIlco=,RJQ_6PuVTTbbEMJagAbw0ViNA9ZfDm8TWW3aSS87h6Y=,libp2p,go-libp2p-asn-util,False,7,3,3,Go,MIT,2020-09-18 17:59:34.000000 UTC,2024-03-22 12:52:56.000000 UTC,20.0,8.0,libp2p/go-libp2p-asn-util
1015,_-S_4OPQhIH4TID_hmmAA5OXTOHKI4iN4pvDgt3elQk=,SzKKn5EzwMbmPdV8qbGv_kC0IuUhv0KWi-BfwyfTKW4=,ipfs,go-bitfield,False,2,6,6,Go,NOASSERTION,2018-03-30 01:14:35.000000 UTC,2024-03-22 07:18:55.000000 UTC,20.0,6.0,ipfs/go-bitfield
1016,DIFwzDKrLDsrfbdzXMd0V-K852SAS7sR0loKq0bIlco=,j7Fxo1mfesfH9b32-NzgMy9tXnXMrD_BbjbQPUo6nfY=,libp2p,zeroconf,True,14,16,16,Go,NOASSERTION,2021-08-11 20:11:50.000000 UTC,2024-03-22 07:17:16.000000 UTC,20.0,4.0,libp2p/zeroconf


In [42]:
# Attach each project's list of vote amounts and write the result to CSV.
# NOTE(review): `votes` is not defined in the visible cells, and the `df` displayed
# by In [39] has no 'name' column — presumably `df` was rebuilt in a cell that is
# not shown; confirm before re-running.
(
    df
    .set_index('name')
    .join(
        # One list of 'amount' values per 'project', joined on df's 'name' index.
        votes
        .groupby('project')['amount']
        .apply(list)
    )
    .to_csv("data/FIL_RetroPGF1_projects_with_votes.csv")
)