In [1]:
import polars as pl
from sci_soft_models.dev_author_em import (
    DeveloperDetails,
    load_dev_author_em_model,
    match_devs_and_authors,
)
from sklearn.metrics import classification_report, cohen_kappa_score
from tqdm import tqdm

annotations = pl.read_csv("author-developer-annotated-pairs-resolved.csv")

# Drop unnamed columns if they exist
annotations = annotations.drop("")

annotations

dataset_source_name,document_repository_link_id,document_id,document_url,repository_id,repository_url,researcher_id,developer_account_id,developer_url,author_name,developer_username,developer_name,developer_email,eva_label,nic_label,eva_notes,nic_notes,final_label
str,i64,i64,str,i64,str,i64,i64,str,str,str,str,str,str,str,str,str,str
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31944,23049,"""https://github.com/kmkaur""","""Katrina M. Kaur""","""kmkaur""","""Katrina Kaur""",,"""match""","""match""",,,"""match"""
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31945,23049,"""https://github.com/kmkaur""","""Pierre‐Jean G. Malé""","""kmkaur""","""Katrina Kaur""",,"""no-match""","""no-match""",,,"""no-match"""
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31946,23049,"""https://github.com/kmkaur""","""E. J. Spence""","""kmkaur""","""Katrina Kaur""",,"""no-match""","""no-match""",,,"""no-match"""
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31947,23049,"""https://github.com/kmkaur""","""Crisanto Gómez""","""kmkaur""","""Katrina Kaur""",,"""no-match""","""no-match""",,,"""no-match"""
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31948,23049,"""https://github.com/kmkaur""","""Megan E. Frederickson""","""kmkaur""","""Katrina Kaur""",,"""no-match""","""no-match""",,,"""no-match"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""softwarex""",163221,157240,"""https://doi.org/10.1016/j.soft…",153520,"""https://github.com/usccacs/rxm…",236687,168459,"""https://github.com/manaschai""","""Pankaj Rajak""","""manaschai""",,,"""no-match""","""no-match""",,,"""no-match"""
"""softwarex""",163227,157246,"""https://doi.org/10.1016/J.SOFT…",153526,"""https://github.com/mihiranpath…",225204,168468,"""https://github.com/mihiranpath…","""Faisal Khan""","""mihiranpathmika""",,,"""no-match""","""no-match""",,,"""no-match"""
"""softwarex""",163227,157246,"""https://doi.org/10.1016/J.SOFT…",153526,"""https://github.com/mihiranpath…",318176,168468,"""https://github.com/mihiranpath…","""Mihiran Galagedarage Don""","""mihiranpathmika""",,,"""match""","""match""",,,"""match"""
"""softwarex""",163269,157288,"""https://doi.org/10.1016/J.SOFT…",153566,"""https://github.com/konjkov/qmc…",318280,168560,"""https://github.com/Konjkov""","""Vladimir Konkov""","""Konjkov""","""Vladimir""",,"""match""","""match""",,,"""match"""


In [2]:
# Show rows where eva_label is not equal to nic_label
annotations.filter(pl.col("eva_label") != pl.col("nic_label"))

dataset_source_name,document_repository_link_id,document_id,document_url,repository_id,repository_url,researcher_id,developer_account_id,developer_url,author_name,developer_username,developer_name,developer_email,eva_label,nic_label,eva_notes,nic_notes,final_label
str,i64,i64,str,i64,str,i64,i64,str,str,str,str,str,str,str,str,str,str
"""plos""",9379,9364,"""https://doi.org/10.1371/journa…",9065,"""https://github.com/jkimlab/ima…",36144,24394,"""https://github.com/jkimlab""","""Jaebum Kim""","""jkimlab""",,,"""unsure""","""no-match""","""from text information alone, n…",,"""no-match"""
"""plos""",9906,9891,"""https://doi.org/10.1371/journa…",9586,"""https://github.com/evanslabora…",38886,25088,"""https://github.com/jezreel-me""","""Jezreel Pantaleón García""","""jezreel-me""",,,"""unsure""","""no-match""","""likely the correct match but d…",,"""no-match"""
"""plos""",9906,9891,"""https://doi.org/10.1371/journa…",9586,"""https://github.com/evanslabora…",38898,25087,"""https://github.com/EvansLabora…","""Scott E. Evans""","""EvansLaboratory""",,,"""unsure""","""no-match""","""likely the correct account but…",,"""no-match"""
"""plos""",44324,43895,"""https://doi.org/10.1371/journa…",41946,"""https://github.com/ohi-science…",38000,24894,"""https://github.com/Melsteroni""","""Melanie Frazier""","""Melsteroni""",,,"""no-match""","""match""","""out of all of them this one mi…",,"""no-match"""
"""pwc""",155536,150095,"""https://doi.org/10.48550/arxiv…",146357,"""https://github.com/sandeep8294…",308335,162932,"""https://github.com/MohitSahu-M…","""M.P. SAHU""","""MohitSahu-MS""",,,"""match""","""unsure""",,"""burner""","""match"""
"""pwc""",155536,150095,"""https://doi.org/10.48550/arxiv…",146357,"""https://github.com/sandeep8294…",308335,162933,"""https://github.com/Mohit-coder…","""M.P. SAHU""","""Mohit-coder-droid""",,,"""no-match""","""match""",,,"""match"""


In [3]:
# Get cohen kappa score of eva_label and nic_label
cohen_kappa_score(
    annotations["eva_label"],
    annotations["nic_label"],
)

0.9481262423037766

In [4]:
# Load the author developer EM model
dev_author_em_model = load_dev_author_em_model()

# Create predictions for each doc repo pair
predicted_rows = []
for _, group in tqdm(
    annotations.drop(["eva_notes", "nic_notes"]).group_by(
        "document_repository_link_id"
    ),
    total=annotations["document_repository_link_id"].n_unique(),
):
    # Iter rows in group
    for row in group.iter_rows(named=True):
        developer_details = DeveloperDetails(
            username=row["developer_username"],
            name=row["developer_name"],
            email=row["developer_email"],
        )
        this_row_predictions = match_devs_and_authors(
            devs=[developer_details],
            authors=[row["author_name"]],
            loaded_dev_author_em_model=dev_author_em_model,
        )
        predicted_rows.append(
            {
                **row,
                "predicted_label": (
                    "match" if len(this_row_predictions) > 0 else "no-match"
                ),
            }
        )

# Convert predicted_rows to a polars dataframe
predicted_df = pl.DataFrame(predicted_rows)

# Drop rows where final label is null
predicted_df = predicted_df.filter(pl.col("final_label").is_not_null())

# Show rows where final_label is not equal to predicted_label
predicted_df.filter(pl.col("final_label") != pl.col("predicted_label"))

Device set to use mps
100%|██████████| 56/56 [00:19<00:00,  2.80it/s]


dataset_source_name,document_repository_link_id,document_id,document_url,repository_id,repository_url,researcher_id,developer_account_id,developer_url,author_name,developer_username,developer_name,developer_email,eva_label,nic_label,final_label,predicted_label
str,i64,i64,str,i64,str,i64,i64,str,str,str,str,str,str,str,str,str
"""pwc""",155536,150095,"""https://doi.org/10.48550/arxiv…",146357,"""https://github.com/sandeep8294…",308335,162933,"""https://github.com/Mohit-coder…","""M.P. SAHU""","""Mohit-coder-droid""",,,"""no-match""","""match""","""match""","""no-match"""
"""softwarex""",162951,156975,"""https://doi.org/10.1016/j.soft…",153259,"""https://github.com/maxspe-dtu/…",317334,167818,"""https://github.com/maxspe-dtu""","""Max Spencer""","""maxspe-dtu""",,,"""match""","""match""","""match""","""no-match"""
"""pwc""",60657,59871,"""https://doi.org/10.48550/arxiv…",57257,"""https://github.com/palvj/saddl…",170737,88762,"""https://github.com/palVJ""","""Pål Vegard Johnsen""","""palVJ""",,,"""match""","""match""","""match""","""no-match"""
"""softwarex""",163227,157246,"""https://doi.org/10.1016/J.SOFT…",153526,"""https://github.com/mihiranpath…",318176,168468,"""https://github.com/mihiranpath…","""Mihiran Galagedarage Don""","""mihiranpathmika""",,,"""match""","""match""","""match""","""no-match"""
"""softwarex""",163141,157160,"""https://doi.org/10.1016/j.soft…",153441,"""https://github.com/phadjido/di…",317910,168276,"""https://github.com/nkarathan""","""Nikolaos Karathanasopoulos""","""nkarathan""",,,"""match""","""match""","""match""","""no-match"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""plos""",9379,9364,"""https://doi.org/10.1371/journa…",9065,"""https://github.com/jkimlab/ima…",36137,24394,"""https://github.com/jkimlab""","""Juyeon Kim""","""jkimlab""",,,"""no-match""","""no-match""","""no-match""","""match"""
"""plos""",9379,9364,"""https://doi.org/10.1371/journa…",9065,"""https://github.com/jkimlab/ima…",36144,24394,"""https://github.com/jkimlab""","""Jaebum Kim""","""jkimlab""",,,"""unsure""","""no-match""","""no-match""","""match"""
"""softwarex""",162960,156984,"""https://doi.org/10.1016/j.soft…",153268,"""https://github.com/mrpt/mvsim""",317354,167855,"""https://github.com/Julopvel""","""Ángel López-Gázquez""","""Julopvel""","""Julián López Velásquez""",,"""no-match""","""no-match""","""no-match""","""match"""
"""plos""",9906,9891,"""https://doi.org/10.1371/journa…",9586,"""https://github.com/evanslabora…",38898,25087,"""https://github.com/EvansLabora…","""Scott E. Evans""","""EvansLaboratory""",,,"""unsure""","""no-match""","""no-match""","""match"""


In [5]:
# Print basic details of each row
for row in predicted_df.filter(
    pl.col("final_label") != pl.col("predicted_label")
).iter_rows(named=True):
    print("Author:", row["author_name"])
    print(
        "Developer:",
        row["developer_username"],
        row["developer_name"],
        row["developer_email"],
    )
    print("Final label:", row["final_label"])
    print("Predicted label:", row["predicted_label"])
    print()

Author: M.P. SAHU
Developer: Mohit-coder-droid None None
Final label: match
Predicted label: no-match

Author: Max Spencer
Developer: maxspe-dtu None None
Final label: match
Predicted label: no-match

Author: Pål Vegard Johnsen
Developer: palVJ None None
Final label: match
Predicted label: no-match

Author: Mihiran Galagedarage Don
Developer: mihiranpathmika None None
Final label: match
Predicted label: no-match

Author: Nikolaos Karathanasopoulos
Developer: nkarathan None None
Final label: match
Predicted label: no-match

Author: Chloe Brimicombe
Developer: chlobrim Chloe None
Final label: match
Predicted label: no-match

Author: Kemal Kurniawan
Developer: kemal1056949 None None
Final label: match
Predicted label: no-match

Author: Katie Major-Smith
Developer: djsmith-90 None None
Final label: no-match
Predicted label: match

Author: Juyeon Kim
Developer: jkimlab None None
Final label: no-match
Predicted label: match

Author: Jaebum Kim
Developer: jkimlab None None
Final label: no-mat

In [6]:
# Get the classification report for final_label and predicted_label
print(
    classification_report(
        predicted_df["final_label"],
        predicted_df["predicted_label"],
    ),
)

              precision    recall  f1-score   support

       match       0.92      0.87      0.89        63
    no-match       0.98      0.99      0.99       472

    accuracy                           0.98       535
   macro avg       0.95      0.93      0.94       535
weighted avg       0.98      0.98      0.98       535



In [7]:
# Iter over dataset_source_name and get classification report for each
for dataset_source_name, group in predicted_df.group_by("dataset_source_name"):
    print(f"Classification report for {dataset_source_name}")
    print(
        classification_report(
            group["final_label"],
            group["predicted_label"],
        ),
    )
    print()

Classification report for ('plos',)
              precision    recall  f1-score   support

       match       0.79      1.00      0.88        15
    no-match       1.00      0.98      0.99       188

    accuracy                           0.98       203
   macro avg       0.89      0.99      0.94       203
weighted avg       0.98      0.98      0.98       203


Classification report for ('softwarex',)
              precision    recall  f1-score   support

       match       0.96      0.85      0.90        27
    no-match       0.98      0.99      0.99       187

    accuracy                           0.98       214
   macro avg       0.97      0.92      0.94       214
weighted avg       0.98      0.98      0.98       214


Classification report for ('pwc',)
              precision    recall  f1-score   support

       match       1.00      0.81      0.89        21
    no-match       0.96      1.00      0.98        97

    accuracy                           0.97       118
   macro avg  