# QSS Revisions -- Author-Developer Pairs Annotation for Generalization

Ask from a reviewer to evaluate the author-developer matching model using author-developer pairs from non-JOSS datasets.

In [None]:
import polars as pl

from rs_graph.db import constants as db_constants

RANDOM_SEED = 12
N_DOC_REPO_PAIRS_PER_DATASET_SOURCE = 20

###############################################################################

def _read_table(table: str) -> pl.DataFrame: 
    return pl.read_database_uri(
        f"SELECT * FROM {table}",
        f"sqlite:///{db_constants.PROD_DATABASE_FILEPATH}",
    )

def _generate_annotation_set(
    random_seed: int = RANDOM_SEED,
    n_pairs_per_dataset_source: int = N_DOC_REPO_PAIRS_PER_DATASET_SOURCE,
) -> pl.DataFrame:
    # Read all the tables we need
    docs = _read_table("document")
    repos = _read_table("repository")
    dataset_sources = _read_table("dataset_source")
    pairs = _read_table("document_repository_link")

    # Drop to unique doc and unique repo in pairs
    pairs = pairs.join(
        dataset_sources.select(
            pl.col("id").alias("dataset_source_id"),
            pl.col("name").alias("dataset_source_name"),
        ),
        on="dataset_source_id",
        how="left",
    ).filter(
        # Not JOSS
        pl.col("dataset_source_name").str.to_lowercase().eq("joss").not_()
    ).unique(
        subset="document_id",
        keep="none",
    ).unique(
        subset="repository_id",
        keep="none",
    )

    # Join the tables to get the positive examples
    hydrated_pairs = (
        pairs.select(
            pl.col("id").alias("document_repository_link_id"),
            "document_id",
            "repository_id",
            "dataset_source_name",
        )
        .join(
            docs.select(
                pl.col("id").alias("document_id"),
                (pl.lit("https://doi.org/") + pl.col("doi")).alias("document_url"),
            ),
            on="document_id",
            how="left",
        )
        .join(
            repos.select(
                pl.col("id").alias("repository_id"),
                (
                    pl.lit("https://github.com/")
                    + pl.col("owner")
                    + pl.lit("/")
                    + pl.col("name")
                ).alias("repository_url"),
            ),
            on="repository_id",
            how="left",
        )
    )

    # Now we need to get the authors for any document in this set
    document_contributors = _read_table("document_contributor")
    researchers = _read_table("researcher")
    hydrated_authors = document_contributors.filter(
        pl.col("document_id").is_in(hydrated_pairs["document_id"].to_list())
    ).join(
        researchers.select(
            pl.col("id").alias("researcher_id"),
            pl.col("name").alias("author_name"),
        ),
        on="researcher_id",
        how="left",
    ).select(
        pl.col("document_id"),
        pl.col("researcher_id"),
        pl.col("author_name"),
    )

    # Now we need to get the contributors for any repository in this set
    repository_contributors = _read_table("repository_contributor")
    developer_accounts = _read_table("developer_account")
    hydrated_developers = repository_contributors.filter(
        pl.col("repository_id").is_in(hydrated_pairs["repository_id"].to_list())
    ).join(
        developer_accounts.select(
            pl.col("id").alias("developer_account_id"),
            pl.col("username").alias("developer_username"),
            pl.col("name").alias("developer_name"),
            pl.col("email").alias("developer_email"),
        ),
        on="developer_account_id",
        how="left",
    ).select(
        pl.col("repository_id"),
        pl.col("developer_account_id"),
        (
            pl.lit("https://github.com/") + pl.col("developer_username")
        ).alias("developer_url"),
        pl.col("developer_username"),
        pl.col("developer_name"),
        pl.col("developer_email"),
    )

    # Take a sample of document-repository pairs from each dataset source
    sampled_author_developer_pairs_for_annotation = []
    for dataset_source_name in pairs["dataset_source_name"].unique():
        dataset_pairs = hydrated_pairs.filter(
            pl.col("dataset_source_name") == dataset_source_name
        ).sample(
            n=n_pairs_per_dataset_source,
            seed=random_seed,
        )

        # Iter over each row in the sampled pairs
        for row in dataset_pairs.iter_rows(named=True):
            # Get the authors for the document
            authors = hydrated_authors.filter(
                pl.col("document_id") == row["document_id"]
            ).select(
                pl.col("researcher_id"),
                pl.col("author_name"),
            )

            # Get the developers for the repository
            developers = hydrated_developers.filter(
                pl.col("repository_id") == row["repository_id"]
            ).select(
                pl.col("developer_account_id"),
                pl.col("developer_url"),
                pl.col("developer_username"),
                pl.col("developer_name"),
                pl.col("developer_email"),
            )

            # Create all combinations of authors and developers to add for annotation
            for author in authors.iter_rows(named=True):
                for developer in developers.iter_rows(named=True):
                    sampled_author_developer_pairs_for_annotation.append(
                        {
                            "dataset_source_name": dataset_source_name,
                            "document_repository_link_id": row["document_repository_link_id"],
                            "document_id": row["document_id"],
                            "document_url": row["document_url"],
                            "repository_id": row["repository_id"],
                            "repository_url": row["repository_url"],
                            "researcher_id": author["researcher_id"],
                            "developer_account_id": developer["developer_account_id"],
                            "developer_url": developer["developer_url"],
                            "author_name": author["author_name"],
                            "developer_username": developer["developer_username"],
                            "developer_name": developer["developer_name"],
                            "developer_email": developer["developer_email"],
                        }
                    )

    # Convert the list of dictionaries to a Polars DataFrame
    sampled_author_developer_pairs_for_annotation_df = pl.DataFrame(
        sampled_author_developer_pairs_for_annotation
    ).sort(
        by=[
            pl.col("dataset_source_name"),
            pl.col("document_id"),
            pl.col("repository_id"),
            pl.col("researcher_id"),
            pl.col("developer_account_id"),
        ]
    )

    # Add a column for "annotated_label" initialized to ""
    # Add a column for "annotation_notes" initialized to ""
    sampled_author_developer_pairs_for_annotation_df = sampled_author_developer_pairs_for_annotation_df.with_columns(
        pl.lit("").alias("annotated_label"),
        pl.lit("").alias("annotation_notes"),
    )

    # Save the DataFrame to a CSV file
    # Include the random seed and number of pairs per dataset source in the filename
    output_filename = (
        f"author_developer_pairs_for_annotation_seed_{random_seed}_"
        f"n_pairs_{n_pairs_per_dataset_source}.csv"
    )
    sampled_author_developer_pairs_for_annotation_df.write_csv(
        output_filename,
    )

    # Get simple stats about the dataset
    # mean +- std number of rows per dataset source
    # median number of rows per dataset source
    # mean +- std number of rows per document_repository_link
    # median number of rows per document_repository_link
    pairs_per_dataset_source = (
        sampled_author_developer_pairs_for_annotation_df
        .group_by("dataset_source_name")
        .agg(pl.len().alias("n_rows"))
    )
    mean_n_rows_per_dataset_source = pairs_per_dataset_source["n_rows"].mean()
    std_n_rows_per_dataset_source = pairs_per_dataset_source["n_rows"].std()
    median_n_rows_per_dataset_source = pairs_per_dataset_source["n_rows"].median()

    pairs_per_document_repository_link = (
        sampled_author_developer_pairs_for_annotation_df
        .group_by("document_repository_link_id")
        .agg(pl.len().alias("n_rows"))
    )
    mean_n_rows_per_document_repository_link = (
        pairs_per_document_repository_link["n_rows"].mean()
    )
    std_n_rows_per_document_repository_link = (
        pairs_per_document_repository_link["n_rows"].std()
    )
    median_n_rows_per_document_repository_link = (
        pairs_per_document_repository_link["n_rows"].median()
    )

    # Print the stats
    print(
        f"Total number of author-developer pairs for annotation: "
        f"{len(sampled_author_developer_pairs_for_annotation_df)}\n"
        f"Dataset Source Stats:\n"
        f"Mean: {mean_n_rows_per_dataset_source:.2f} "
        f"(± {std_n_rows_per_dataset_source:.2f}), "
        f"Median: {median_n_rows_per_dataset_source:.2f}\n"
        f"Document Repository Link Stats:\n"
        f"Mean: {mean_n_rows_per_document_repository_link:.2f} "
        f"(± {std_n_rows_per_document_repository_link:.2f}), "
        f"Median: {median_n_rows_per_document_repository_link:.2f}"
    )

    return sampled_author_developer_pairs_for_annotation_df

In [2]:
_generate_annotation_set(
    n_pairs_per_dataset_source=20,
)

Dataset Source Stats:
Mean: 178.33 (± 52.54), Median: 203.00
Document Repository Link Stats:
Mean: 9.55 (± 13.88), Median: 4.50


dataset_source_name,document_repository_link_id,document_id,document_url,repository_id,repository_url,researcher_id,developer_account_id,developer_url,author_name,developer_username,developer_name,developer_email,annotated_label,annotation_notes
str,i64,i64,str,i64,str,i64,i64,str,str,str,str,str,str,str
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31944,23049,"""https://github.com/kmkaur""","""Katrina M. Kaur""","""kmkaur""","""Katrina Kaur""",,"""""",""""""
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31945,23049,"""https://github.com/kmkaur""","""Pierre‐Jean G. Malé""","""kmkaur""","""Katrina Kaur""",,"""""",""""""
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31946,23049,"""https://github.com/kmkaur""","""E. J. Spence""","""kmkaur""","""Katrina Kaur""",,"""""",""""""
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31947,23049,"""https://github.com/kmkaur""","""Crisanto Gómez""","""kmkaur""","""Katrina Kaur""",,"""""",""""""
"""plos""",8545,8530,"""https://doi.org/10.1371/journa…",8243,"""https://github.com/kmkaur/coop…",31948,23049,"""https://github.com/kmkaur""","""Megan E. Frederickson""","""kmkaur""","""Katrina Kaur""",,"""""",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""softwarex""",163221,157240,"""https://doi.org/10.1016/j.soft…",153520,"""https://github.com/usccacs/rxm…",236687,168459,"""https://github.com/manaschai""","""Pankaj Rajak""","""manaschai""",,,"""""",""""""
"""softwarex""",163227,157246,"""https://doi.org/10.1016/J.SOFT…",153526,"""https://github.com/mihiranpath…",225204,168468,"""https://github.com/mihiranpath…","""Faisal Khan""","""mihiranpathmika""",,,"""""",""""""
"""softwarex""",163227,157246,"""https://doi.org/10.1016/J.SOFT…",153526,"""https://github.com/mihiranpath…",318176,168468,"""https://github.com/mihiranpath…","""Mihiran Galagedarage Don""","""mihiranpathmika""",,,"""""",""""""
"""softwarex""",163269,157288,"""https://doi.org/10.1016/J.SOFT…",153566,"""https://github.com/konjkov/qmc…",318280,168560,"""https://github.com/Konjkov""","""Vladimir Konkov""","""Konjkov""","""Vladimir""",,"""""",""""""


In [3]:
_generate_annotation_set(
    n_pairs_per_dataset_source=10,
)

Dataset Source Stats:
Mean: 110.33 (± 74.57), Median: 75.00
Document Repository Link Stats:
Mean: 11.82 (± 22.41), Median: 5.00


dataset_source_name,document_repository_link_id,document_id,document_url,repository_id,repository_url,researcher_id,developer_account_id,developer_url,author_name,developer_username,developer_name,developer_email,annotated_label,annotation_notes
str,i64,i64,str,i64,str,i64,i64,str,str,str,str,str,str,str
"""plos""",8104,8089,"""https://doi.org/10.1371/journa…",7806,"""https://github.com/jlbunifor/d…",29577,22388,"""https://github.com/jlbunifor""","""A. E. O. Ferreira""","""jlbunifor""",,,"""""",""""""
"""plos""",8104,8089,"""https://doi.org/10.1371/journa…",7806,"""https://github.com/jlbunifor/d…",29578,22388,"""https://github.com/jlbunifor""","""Jorge L. B. Araújo""","""jlbunifor""",,,"""""",""""""
"""plos""",8104,8089,"""https://doi.org/10.1371/journa…",7806,"""https://github.com/jlbunifor/d…",29579,22388,"""https://github.com/jlbunifor""","""W. P. Ferreira""","""jlbunifor""",,,"""""",""""""
"""plos""",8104,8089,"""https://doi.org/10.1371/journa…",7806,"""https://github.com/jlbunifor/d…",29580,22388,"""https://github.com/jlbunifor""","""J. S. de Sousa""","""jlbunifor""",,,"""""",""""""
"""plos""",8104,8089,"""https://doi.org/10.1371/journa…",7806,"""https://github.com/jlbunifor/d…",29581,22388,"""https://github.com/jlbunifor""","""C. L. N. Oliveira""","""jlbunifor""",,,"""""",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""softwarex""",163220,157239,"""https://doi.org/10.1016/J.SOFT…",153519,"""https://github.com/sseslab/ele…",26073,168456,"""https://github.com/SSESLab""","""Amanda D. Smith""","""SSESLab""","""Site-Specific Energy Systems L…",,"""""",""""""
"""softwarex""",163220,157239,"""https://doi.org/10.1016/J.SOFT…",153519,"""https://github.com/sseslab/ele…",26073,168457,"""https://github.com/amandadsmit…","""Amanda D. Smith""","""amandadsmith""","""Amanda D. Smith""","""amanda.d.smith@gmail.com""","""""",""""""
"""softwarex""",163220,157239,"""https://doi.org/10.1016/J.SOFT…",153519,"""https://github.com/sseslab/ele…",318162,168455,"""https://github.com/kadenP""","""Kaden Plewe""","""kadenP""",,,"""""",""""""
"""softwarex""",163220,157239,"""https://doi.org/10.1016/J.SOFT…",153519,"""https://github.com/sseslab/ele…",318162,168456,"""https://github.com/SSESLab""","""Kaden Plewe""","""SSESLab""","""Site-Specific Energy Systems L…",,"""""",""""""
