In [1]:
from skops import io as sk_io
from sentence_transformers import SentenceTransformer
from rs_graph.modeling import DEFAULT_AUTHOR_DEV_EMBEDDING_MODEL_NAME, AUTHOR_DEV_EM_MODEL_PATH

# Hand-written example records used to sanity-check the classifier below.
# Note that "email" is the literal string "None", not the None object.
eva_github = dict(
    username="evamaxfield",
    name="Eva Maxfield Brown",
    email="None",
)
eva_author = dict(name="Eva Maxfield Brown")
nic_github = dict(
    username="nniiicc",
    name="Nic Weber",
    email="None",
)

# Load the sentence-embedding model; `embedder.encode(...)` is what
# get_joint_interaction_embedding uses to encode each field value.
embedder = SentenceTransformer(DEFAULT_AUTHOR_DEV_EMBEDDING_MODEL_NAME)

# get embeddings
def get_joint_interaction_embedding(dev, author, encoder=None):
    """
    Build the flat interaction feature vector for one developer/author pair.

    Every value of ``dev`` is paired with every value of ``author``
    (dev-major, both in dict iteration order). For each pair the two
    encoded vectors are multiplied element-wise and the products are
    concatenated into a single flat list.

    Values are passed through ``str()`` before encoding, so a ``None``
    value is encoded as the text ``"None"``.

    Parameters
    ----------
    dev : dict
        Developer fields; only the values are used.
    author : dict
        Author fields; only the values are used.
    encoder : object, optional
        Any object exposing ``encode(str)`` whose results support ``*`` and
        ``.tolist()``. Defaults to the notebook-level ``embedder``.

    Returns
    -------
    list
        Concatenated element-wise products, one chunk per
        (dev value, author value) pair.
    """
    if encoder is None:
        encoder = embedder

    # Encode each author value once up front instead of once per dev value.
    author_vectors = [encoder.encode(str(value)) for value in author.values()]

    joint_interaction_embeddings = []
    for dev_value in dev.values():
        # Likewise encode each dev value once, not once per author value.
        dev_vector = encoder.encode(str(dev_value))
        for author_vector in author_vectors:
            joint_interaction_embeddings.extend(
                (dev_vector * author_vector).tolist()
            )

    return joint_interaction_embeddings

# Sanity check: compare the Eva author record against the Eva and the Nic
# GitHub records, then classify both feature vectors.
eva_eva_embedding, nic_eva_embedding = (
    get_joint_interaction_embedding(github_record, eva_author)
    for github_record in (eva_github, nic_github)
)

# Load the trained classifier from disk and predict on the two examples
clf = sk_io.load(AUTHOR_DEV_EM_MODEL_PATH)
clf.predict([eva_eva_embedding, nic_eva_embedding])



array(['match', 'no-match'], dtype=object)

In [2]:
from rs_graph.data import load_repo_contributors_dataset, load_author_contributions_dataset
import pandas as pd
from tqdm import tqdm

# Load the two datasets
devs = load_repo_contributors_dataset()

# Only a small sample of authors is used. The sample is seeded so that
# "Restart & Run All" selects the same rows every time; without a seed
# every run (and every table below) is computed on different authors.
AUTHOR_SAMPLE_SIZE = 100
AUTHOR_SAMPLE_SEED = 42
authors = load_author_contributions_dataset().sample(
    n=AUTHOR_SAMPLE_SIZE,
    random_state=AUTHOR_SAMPLE_SEED,
)

print(devs.shape)
print(authors.shape)

(15731, 8)
(100, 4)


In [3]:
# Build one record per sampled author (name + author_id) and, alongside it,
# a lookup from repository to the ids of the authors that list it among
# their contributions.
author_records = []
repo_to_author_id_lut = {}
for _, author in authors.iterrows():
    author_id = author.author_id

    for contribution in author.contributions:
        repo_to_author_id_lut.setdefault(contribution["repo"], []).append(author_id)

    # Bracket access on purpose: `author.name` would be the row Series' own
    # name attribute rather than the "name" column.
    author_records.append({"name": author["name"], "author_id": author_id})

processed_authors = pd.DataFrame(author_records)
processed_authors.sample(3)

Unnamed: 0,name,author_id
90,Bardhyl Miftari,2118314761
56,S. Tam,123091049
72,Rebecca Bilbro,52361036


In [4]:
# Build one record per GitHub username (username, name, email) and a lookup
# from repository to the usernames found for it in `devs`.
dev_records = []
repo_to_dev_lut = {}
for username, dev_rows in devs.groupby("username"):
    for repo in dev_rows["repo"].tolist():
        repo_to_dev_lut.setdefault(repo, []).append(username)

    # Name and email are taken from the first row of the username's group
    first_row = dev_rows.iloc[0]
    dev_records.append({
        "username": username,
        "name": first_row["name"],
        "email": first_row["email"],
    })

processed_devs = pd.DataFrame(dev_records)
processed_devs.sample(3)

Unnamed: 0,username,name,email
11221,vincentme,,
8066,mchestnut91,Michael Chestnut,
10712,teddykoker,Teddy Koker,teddy.koker@gmail.com


In [5]:
# Create comparison dataframe of devs and authors in the same repo.
#
# Both tables are indexed once up front so the nested loop below performs
# dictionary lookups instead of a full boolean scan of each DataFrame for
# every (dev, author) pair. `drop_duplicates` keeps the first occurrence,
# matching the previous `.iloc[0]` behaviour.
dev_details_by_username = {
    dev_row["username"]: dev_row.to_dict()
    for _, dev_row in processed_devs.drop_duplicates(subset="username").iterrows()
}
author_name_by_id = (
    processed_authors
    .drop_duplicates(subset="author_id")
    .set_index("author_id")["name"]
    .to_dict()
)

comparison_records = []
for repo, dev_usernames in tqdm(repo_to_dev_lut.items()):
    # Skip repos that none of the sampled authors contributed to
    if repo not in repo_to_author_id_lut:
        continue

    author_ids = repo_to_author_id_lut[repo]

    for dev_username in dev_usernames:
        # The dev lookup does not depend on the author, so do it once per dev
        dev_details = dev_details_by_username[dev_username]

        for author_id in author_ids:
            comparison_records.append({
                "repo": repo,
                # Fresh copy per row so rows never share one mutable dict
                "dev_details": dict(dev_details),
                # Only keep name
                "author_details": {"name": author_name_by_id[author_id]},
            })

repo_and_dev_author_comparisons = pd.DataFrame(comparison_records)
repo_and_dev_author_comparisons.sample(3)

100%|██████████| 2610/2610 [00:01<00:00, 1865.77it/s]


Unnamed: 0,repo,dev_details,author_details
259,https://github.com/aliaksei135/seedpod_ground_...,"{'username': 'Zach10a', 'name': 'Zach Tait', '...",{'name': 'Aliaksei Pilko'}
361,https://github.com/skypyproject/skypy,"{'username': 'spxiwh', 'name': 'Ian Harry', 'e...",{'name': 'S. Tam'}
750,https://github.com/spacetx/starfish,"{'username': 'xchang1', 'name': 'Xian Chang', ...",{'name': 'Justin Kiggins'}


In [6]:
# Compute the joint interaction embedding of every comparison row and store
# the resulting vectors in a new "embeddings" column.
dev_author_pairs = zip(
    repo_and_dev_author_comparisons["dev_details"],
    repo_and_dev_author_comparisons["author_details"],
)
embeddings = [
    get_joint_interaction_embedding(dev_details, author_details)
    for dev_details, author_details in tqdm(
        dev_author_pairs,
        total=len(repo_and_dev_author_comparisons),
    )
]

repo_and_dev_author_comparisons["embeddings"] = embeddings
repo_and_dev_author_comparisons.sample(3)

100%|██████████| 850/850 [00:57<00:00, 14.87it/s]


Unnamed: 0,repo,dev_details,author_details,embeddings
46,https://github.com/mdolab/pygeo,"{'username': 'anilyil', 'name': 'Anil Yildirim...",{'name': 'Benjamin J. Brelje'},"[0.0010458321776241064, 0.001172031625173986, ..."
51,https://github.com/mdolab/pygeo,"{'username': 'denera', 'name': 'Alp Dener', 'e...",{'name': 'Benjamin J. Brelje'},"[0.001371104153804481, 0.0009955003624781966, ..."
224,https://github.com/Jammy2211/PyAutoLens,"{'username': 'linan7788626', 'name': 'Nan Li',...",{'name': 'Xiaoyue Cao'},"[0.013032550923526287, -0.0002859474625438452,..."


In [7]:
# Run the classifier over all rows at once: the "embeddings" column is turned
# into a list of feature vectors and the predicted labels are stored in a
# new "prediction" column.
feature_vectors = repo_and_dev_author_comparisons["embeddings"].tolist()
repo_and_dev_author_comparisons["prediction"] = clf.predict(feature_vectors)
repo_and_dev_author_comparisons.sample(10)



Unnamed: 0,repo,dev_details,author_details,embeddings,prediction
835,https://github.com/julie-forman-kay-lab/SPyCi-PDB,"{'username': 'menoliu', 'name': 'Zi Hao (Nemo)...",{'name': 'João M. C. Teixeira'},"[-0.001166783389635384, 0.004040050320327282, ...",no-match
820,https://github.com/TRIP-Lab/itinerum-mobile-api,"{'username': 'kafitz', 'name': 'Kyle', 'email'...",{'name': 'Z. Patterson'},"[0.0024591111578047276, -0.0001711060758680105...",no-match
10,https://github.com/DLR-RM/BlenderProc,"{'username': '5trobl', 'name': 'Klaus Strobl',...",{'name': 'Maximilian Denninger'},"[-0.0007630615727975965, -0.000696146918926388...",no-match
574,https://github.com/whimian/pyGeoPressure,"{'username': 'whimian', 'name': 'Yu Hao', 'ema...",{'name': 'Yu Hao'},"[-0.00040632972377352417, 0.000987027189694345...",no-match
455,https://github.com/wright-group/WrightTools,"{'username': 'kameyer226', 'name': None, 'emai...",{'name': 'Kyle F Sunden'},"[0.0017073467606678605, -0.0003124477807432413...",no-match
159,https://github.com/OSC/ondemand,"{'username': 'Micket', 'name': 'Mikael Öhman',...",{'name': 'Douglas Johnson'},"[0.0010567855788394809, -0.0002820800291374326...",no-match
748,https://github.com/spacetx/starfish,"{'username': 'sofroniewn', 'name': 'Nicholas S...",{'name': 'Justin Kiggins'},"[0.00018926772463601083, 0.0004172940098214894...",no-match
194,https://github.com/elabftw/elabftw,"{'username': 'NicolasCARPi', 'name': 'Nicolas ...",{'name': 'A. Minges'},"[0.0013839086750522256, 0.002384957391768694, ...",no-match
359,https://github.com/skypyproject/skypy,"{'username': 'sibirrer', 'name': 'Simon Birrer...",{'name': 'S. Tam'},"[-0.0005976097891107202, 0.0009286386775784194...",no-match
169,https://github.com/OSC/ondemand,"{'username': 'dpavlos', 'name': 'Pavlos Daoglo...",{'name': 'Douglas Johnson'},"[0.00682052131742239, -0.0005334896850399673, ...",no-match


In [8]:
# Tally how many comparison rows received each predicted label
repo_and_dev_author_comparisons["prediction"].value_counts()

prediction
no-match    803
match        47
Name: count, dtype: int64

In [9]:
# Show only the comparison rows whose predicted label is "match"
is_match = repo_and_dev_author_comparisons["prediction"] == "match"
repo_and_dev_author_comparisons.loc[is_match]

Unnamed: 0,repo,dev_details,author_details,embeddings,prediction
35,https://github.com/DLR-RM/BlenderProc,"{'username': 'themasterlink', 'name': 'Maximil...",{'name': 'Maximilian Denninger'},"[0.004624851979315281, -0.00017695077985990793...",match
47,https://github.com/mdolab/pygeo,"{'username': 'bbrelje', 'name': 'Ben Brelje', ...",{'name': 'Benjamin J. Brelje'},"[0.0007520371000282466, 0.0031003979966044426,...",match
100,https://github.com/ArneTillmann/AuDoLab,"{'username': 'AFThielmann', 'name': 'Anton Thi...",{'name': 'Arne Tillmann'},"[-0.0009755536448210478, 0.0001271639775950461...",match
141,https://github.com/n8thangreen/BCEA/,"{'username': 'annaheath', 'name': 'Anna Heath'...",{'name': 'Anna Heath'},"[0.000702054938301444, 0.0017483840929344296, ...",match
247,https://github.com/vivo-project/VIVO,"{'username': 'lawlesst', 'name': 'Ted Lawless'...",{'name': 'Ted Lawless'},"[0.0015835268422961235, 0.003231290727853775, ...",match
260,https://github.com/aliaksei135/seedpod_ground_...,"{'username': 'aliaksei135', 'name': 'Aliaksei ...",{'name': 'Aliaksei Pilko'},"[0.002625127090141177, 0.0030059432610869408, ...",match
263,https://github.com/gibbonCode/GIBBON,"{'username': 'Kevin-Mattheus-Moerman', 'name':...",{'name': 'K. Moerman'},"[0.007855313830077648, 0.0008876232313923538, ...",match
285,https://github.com/MRN-Code/coinstac,"{'username': 'pixelsaurus', 'name': 'Javier Ro...",{'name': 'Javier Romero'},"[0.006404371000826359, -0.0009087863145396113,...",match
286,https://github.com/MRN-Code/coinstac,"{'username': 'rochaeb', 'name': 'Eduardo Rocha...",{'name': 'Javier Romero'},"[0.00269471132196486, -0.0006351429619826376, ...",match
307,https://github.com/TRI-AMDD/beep,"{'username': 'patrickherring-TRI', 'name': 'Pa...",{'name': 'Patrick K. Herring'},"[0.0026571021880954504, 0.0038378718309104443,...",match
