In [1]:
from uuid import uuid4

from rs_graph.data import load_repo_contributors_dataset, load_author_contributions_dataset
from rs_graph.modeling import load_ready_to_use_author_dev_em_clf
import pandas as pd
from tqdm import tqdm

# Pull in the repo-contributor (developer) and author-contribution datasets
devs = load_repo_contributors_dataset()
authors = load_author_contributions_dataset()

# Sanity check: report the shape of each loaded dataset, devs first
for loaded_frame in (devs, authors):
    print(loaded_frame.shape)

KeyboardInterrupt: 

In [None]:
# Take a reproducible sample of authors.
# - A fixed seed keeps the displayed outputs stable across "Restart & Run All".
# - Capping the size at len(authors) avoids the ValueError that `sample` raises
#   when asked for more rows than exist (replace=False is the default).
N_AUTHOR_SAMPLE = 100
AUTHOR_SAMPLE_SEED = 12
authors = authors.sample(
    n=min(N_AUTHOR_SAMPLE, len(authors)),
    random_state=AUTHOR_SAMPLE_SEED,
)

# Create dataframe of only authors and a list of all co_authors
processed_authors = []
for _, author in authors.iterrows():
    # Flatten this author's contributions: one repo per contribution and the
    # concatenation of every contribution's co-author list (duplicates kept).
    repos = [contribution["repo"] for contribution in author.contributions]
    co_authors = [
        co_author
        for contribution in author.contributions
        for co_author in contribution["co_authors"]
    ]

    # Add to processed authors
    processed_authors.append({
        # Fall back to a random UUID when the author has no (truthy) id
        "author_id": author.author_id if author.author_id else str(uuid4()),
        # Use item access: `author.name` would return the row label instead
        "name": author["name"],
        "repos": repos,
        "co_authors": co_authors,
    })

processed_authors = pd.DataFrame(processed_authors)
processed_authors

Unnamed: 0,author_id,name,repos,co_authors
0,2097140834,José Jiménez,[https://github.com/hawk31/pyGPGO],[J. Ginebra]
1,2106322298,Davide Montero,[https://github.com/davemlz/eemont],[]
2,2097109889,Yves Mocquard,[https://github.com/ymocquar/HOODESolver.jl],"[P. Navaro, N. Crouseilles]"
3,91723951,Stepan Fomichev,[https://github.com/matthewcarbone/GGCE],"[Matthew R. Carbone, A. Millis, M. Berciu, D. ..."
4,2070539595,Apoorva Pandey,[https://github.com/ropensci/rdataretriever],"[Henry Senyondo, D. McGlinn, Pranita Sharma, D..."
...,...,...,...,...
95,16099092,Yida Chen,[https://github.com/KALMUS-Color-Toolkit/KALMUS],"[Eric Faden, Nathan C. Ryan]"
96,50169183,L. Kelly,[https://github.com/kellylab/Fireworks],[Saad Khan]
97,29857471,J. Rozo,[https://github.com/sunpy/sunpy],"[S. Mumford, N. Freij, S. Christe, J. Ireland,..."
98,1401907635,Jaime Rodríguez-Guerra,[https://github.com/volkamerlab/opencadd],"[Dominique Sydow, Andrea Volkamer]"


In [None]:
# Collapse the contributor rows into one record per username holding the
# username, name, email, repos, and co_contributors
processed_devs = []
for username, dev_rows in devs.groupby("username"):
    # Name and email are read from the first row of the group only
    first_row = dev_rows.iloc[0]

    processed_devs.append({
        "username": username,
        "name": first_row["name"],
        "email": first_row["email"],
        # One repo entry per row of the group, in row order
        "repos": list(dev_rows["repo"]),
        # Concatenate every row's co_contributors list (duplicates kept)
        "co_contributors": [
            contributor
            for row_contributors in dev_rows["co_contributors"]
            for contributor in row_contributors
        ],
    })

processed_devs = pd.DataFrame(processed_devs)
processed_devs

Unnamed: 0,username,name,email,repos,co_contributors
0,0-tree,,,[https://github.com/PyWavelets/pywt],"[grlee77, rgommers, kwohlfahrt, nigma, aaren, ..."
1,0003088,,,[https://github.com/ElektraInitiative/libelektra],"[sanssecours, mpranj, kodebach, markus2330, at..."
2,001ben,,,[https://github.com/ropensci/rtweet],"[llrs, mkearney, hadley, hrbrmstr, thomas-kell..."
3,00krishna,Krishna Bhogaonker,,[https://github.com/SciML/GlobalSensitivity.jl],"[Vaibhavdixit02, ChrisRackauckas, ArnoStrouwen..."
4,01gaunte,Timmy,,[https://github.com/WMD-group/SMACT/],"[AntObi, a-ws-m, keeeto, dandavies99, dependab..."
...,...,...,...,...,...
11751,zygoloid,Richard Smith,,[https://github.com/ajohnson-uoregon/llvm-proj...,"[lattner, topperc, RKSimon, espindola, tkremen..."
11752,zyi103,Zijun yi,zyi103@syr.edu,[https://github.com/titipata/pubmed_parser],"[titipata, raypereda-gr, daniel-acuna, bluenex..."
11753,zzawadz,Zygmunt Zawadzki,zygmunt@zstat.pl,[https://github.com/mlr-org/mlr3],"[mllg, pat-s, be-marc, sebffischer, web-flow, ..."
11754,zzilch,Zeyu Huang,zeyuhuang97@gmail.com,[https://github.com/DLR-RM/BlenderProc],"[themasterlink, cornerfarmer, MartinSmeyer, id..."


In [None]:
# Create lookup tables for repos to authors and repos to devs.
# Each table maps a repo URL to the list of row Series that mention it.
repo_to_author_lut = {}
for _, author in processed_authors.iterrows():
    for repo in author.repos:
        repo_to_author_lut.setdefault(repo, []).append(author)

repo_to_dev_lut = {}
for _, dev in processed_devs.iterrows():
    for repo in dev.repos:
        repo_to_dev_lut.setdefault(repo, []).append(dev)

# For each author, find devs with the same repo.
# NOTE: the loop variables are deliberately NOT named `authors` / `devs`;
# reusing those names would overwrite the loaded datasets with plain lists
# and break any earlier cell that is re-run afterwards.
data = []
for repo, repo_authors in repo_to_author_lut.items():
    # Repos with no known devs contribute no candidate pairs
    repo_devs = repo_to_dev_lut.get(repo, [])
    for author in repo_authors:
        for dev in repo_devs:
            data.append({
                "repo": repo,
                "author_details": author,
                "dev_details": dev,
            })

# Pin the columns so the frame keeps its schema even when no pairs were found
data = pd.DataFrame(data, columns=["repo", "author_details", "dev_details"])
data

Unnamed: 0,repo,author_details,dev_details
0,https://github.com/hawk31/pyGPGO,author_id 20971408...,username ...
1,https://github.com/hawk31/pyGPGO,author_id 20971408...,username ...
2,https://github.com/hawk31/pyGPGO,author_id 20971408...,username josejime...
3,https://github.com/davemlz/eemont,author_id 2106322...,username ...
4,https://github.com/davemlz/eemont,author_id 2106322...,username ...
...,...,...,...
845,https://github.com/volkamerlab/opencadd,author_id 1...,username ...
846,https://github.com/volkamerlab/opencadd,author_id 1...,username ...
847,https://github.com/volkamerlab/opencadd,author_id 1...,username ...
848,https://github.com/volkamerlab/opencadd,author_id 1...,username ...


In [None]:
# Inspect the author side of the first candidate pair
data["author_details"].iloc[0]

author_id                             2097140834
name                                José Jiménez
repos         [https://github.com/hawk31/pyGPGO]
co_authors                          [J. Ginebra]
Name: 0, dtype: object

In [None]:
# Inspect the developer side of the first candidate pair
data["dev_details"].iloc[0]

username                                       Saizor
name                                           Anders
email                                            None
repos              [https://github.com/hawk31/pyGPGO]
co_contributors                [josejimenezluna, dfm]
Name: 1979, dtype: object

In [8]:
# Load the author/dev entity-matching classifier
clf = load_ready_to_use_author_dev_em_clf()

# Score every candidate (author, dev) pair, showing a progress bar
pair_records = []
for _, pair in tqdm(data.iterrows(), total=len(data)):
    pair_records.append({
        "repo": pair.repo,
        "author_details": pair.author_details,
        "dev_details": pair.dev_details,
        # Classifier output for this pair, stored as returned
        "match": clf(pair.author_details, pair.dev_details),
    })

# One row per candidate pair, with the classifier verdict alongside
matched_authors_to_devs = pd.DataFrame(pair_records)
matched_authors_to_devs

100%|██████████| 850/850 [01:10<00:00, 12.03it/s]


Unnamed: 0,repo,author_details,dev_details,match
0,https://github.com/hawk31/pyGPGO,author_id 20971408...,username ...,match
1,https://github.com/hawk31/pyGPGO,author_id 20971408...,username ...,match
2,https://github.com/hawk31/pyGPGO,author_id 20971408...,username josejime...,match
3,https://github.com/davemlz/eemont,author_id 2106322...,username ...,match
4,https://github.com/davemlz/eemont,author_id 2106322...,username ...,no-match
...,...,...,...,...
845,https://github.com/volkamerlab/opencadd,author_id 1...,username ...,match
846,https://github.com/volkamerlab/opencadd,author_id 1...,username ...,match
847,https://github.com/volkamerlab/opencadd,author_id 1...,username ...,match
848,https://github.com/volkamerlab/opencadd,author_id 1...,username ...,match


In [10]:
# Print a random handful of classified pairs for manual inspection
separator = "-" * 80
for _, pair in matched_authors_to_devs.sample(10).iterrows():
    # Repo, author record, and dev record, each followed by a blank line
    for section in (pair.repo, pair.author_details, pair.dev_details):
        print(section)
        print()
    print("MATCH:", pair.match)
    # Two blank lines before the separator, one after
    print(end="\n\n")
    print(separator)
    print()

https://github.com/sunpy/sunpy

author_id                                            2105821916
name                                                  R. Mishra
repos                          [https://github.com/sunpy/sunpy]
co_authors    [S. Mumford, N. Freij, S. Christe, J. Ireland,...
Name: 28, dtype: object

username                                                   Punyaslok
name                                              Punyaslok Pattnaik
email                                                           None
repos                               [https://github.com/sunpy/sunpy]
co_contributors    [Cadair, nabobalis, dstansby, ehsteve, ayshih,...
Name: 1819, dtype: object

MATCH: match


--------------------------------------------------------------------------------

https://github.com/lucydot/effmass

author_id                               2136644040
name                               Lucy D. Whalley
repos         [https://github.com/lucydot/effmass]
co_authors                   