In [1]:
from datetime import datetime

import polars as pl
from rs_graph.db.constants import PROD_DATABASE_FILEPATH

def _read_table(table: str) -> pl.DataFrame:
    return pl.read_database_uri(
        f"SELECT * FROM {table}", f"sqlite:///{PROD_DATABASE_FILEPATH}"
    )


def load_collapsed_hydrated_rs_graph_core() -> pl.DataFrame:
    """Load the positive examples for the binary article-repository dataset."""
    # Read all the tables we need
    docs = _read_table("document")
    repos = _read_table("repository")
    pairs = _read_table("document_repository_link")
    authors = _read_table("document_contributor")
    doc_abstracts = _read_table("document_abstract")
    repo_readmes = _read_table("repository_readme")
    topics = _read_table("topic")
    doc_topics = _read_table("document_topic")
    dataset_sources = _read_table("dataset_source")
    repo_contributors = _read_table("repository_contributor")

    # Drop to unique doc and unique repo in pairs
    pairs = pairs.unique(
        subset="document_id",
        keep="none",
    ).unique(
        subset="repository_id",
        keep="none",
    )

    # Hydrate topics
    hydrated_doc_topics = (
        doc_topics.select(
            "document_id",
            "topic_id",
            "score",
        )
        .join(
            topics.select(
                pl.col("id").alias("topic_id"),
                pl.col("name").alias("topic_name"),
                pl.col("subfield_name").alias("topic_subfield_name"),
                pl.col("field_name").alias("topic_field_name"),
                pl.col("domain_name").alias("topic_domain_name"),
            ),
            on="topic_id",
            how="left",
        )
        .filter(pl.col("document_id").is_in(pairs["document_id"].to_list()))
        .select(
            "document_id",
            "topic_name",
            "topic_subfield_name",
            "topic_field_name",
            "topic_domain_name",
            pl.col("score").alias("topic_score"),
        )
        .sort(
            "topic_score",
            descending=True,
        )
    )

    # Create table of count of authors per document
    author_counts = authors.group_by("document_id").len()
    contributor_counts = repo_contributors.group_by("repository_id").len()

    # Hydrate documents
    hydrated_docs = (
        docs.select(
            pl.col("id").alias("document_id"),
            "doi",
            "title",
            "publication_date",
            "cited_by_count",
            "fwci",
            "document_type",
            "is_open_access",
        )
        .join(
            doc_abstracts.select(
                "document_id",
                pl.col("content").alias("document_abstract_content"),
            ),
            on="document_id",
            how="left",
        )
        .join(
            author_counts.select(
                "document_id",
                pl.col("len").alias("document_author_count"),
            ),
            on="document_id",
            how="left",
        )
        .select(
            "document_id",
            pl.col("doi").alias("document_doi"),
            pl.col("title").alias("document_title"),
            pl.col("publication_date").alias("document_publication_date"),
            "document_author_count",
            pl.col("cited_by_count").alias("document_cited_by_count"),
            pl.col("fwci").alias("document_fwci"),
            "document_type",
            pl.col("is_open_access").alias("document_is_open_access"),
            "document_abstract_content",
        )
    )

    # Hydrate repos
    hydrated_repos = (
        repos.select(
            pl.col("id").alias("repository_id"),
            pl.col("owner").alias("repository_owner"),
            pl.col("name").alias("repository_name"),
            pl.col("description").alias("repository_description"),
            pl.col("is_fork").alias("repository_is_fork"),
            pl.col("forks_count").alias("repository_forks_count"),
            pl.col("stargazers_count").alias("repository_stargazers_count"),
            pl.col("open_issues_count").alias("repository_open_issues_count"),
            pl.col("size_kb").alias("repository_size_kb"),
            pl.col("creation_datetime").alias("repository_creation_datetime"),
            pl.col("last_pushed_datetime").alias("repository_last_pushed_datetime"),
        )
        .join(
            repo_readmes.select(
                "repository_id",
                pl.col("content").alias("repository_readme_content"),
            ),
            on="repository_id",
            how="left",
        )
        .join(
            contributor_counts.select(
                "repository_id",
                pl.col("len").alias("repository_contributor_count"),
            ),
            on="repository_id",
            how="left",
        )
        .select(
            "repository_id",
            "repository_owner",
            "repository_name",
            "repository_description",
            "repository_readme_content",
            "repository_contributor_count",
            "repository_is_fork",
            "repository_forks_count",
            "repository_stargazers_count",
            "repository_open_issues_count",
            "repository_size_kb",
            pl.col("repository_creation_datetime").dt.cast_time_unit("us"),
            pl.col("repository_last_pushed_datetime").dt.cast_time_unit("us"),
        )
    )

    # Join docs and repos to pairs
    hydrated_pairs = (
        pairs.select(
            "document_id",
            "repository_id",
            "dataset_source_id",
        )
        .join(
            hydrated_docs,
            on="document_id",
            how="left",
        )
        .join(
            hydrated_repos,
            on="repository_id",
            how="left",
        )
        .join(
            dataset_sources.select(
                pl.col("id").alias("dataset_source_id"),
                pl.col("name").alias("dataset_source_name"),
            ),
            on="dataset_source_id",
            how="left",
        )
        .drop("dataset_source_id")
    )

    # Take top domain, field, subfield, and topic for each document
    hydrated_pairs = hydrated_pairs.join(
        hydrated_doc_topics.group_by("document_id", maintain_order=True).agg(
            pl.col("topic_domain_name").first().alias("document_primary_domain"),
            pl.col("topic_field_name").first().alias("document_primary_field"),
            pl.col("topic_subfield_name").first().alias("document_primary_subfield"),
            pl.col("topic_name").first().alias("document_primary_topic"),
            pl.col("topic_score").first().alias("document_primary_topic_score"),
        ),
        on="document_id",
        how="left",
    )

    # Add in url columns
    hydrated_pairs = hydrated_pairs.with_columns(
        (
            pl.lit("https://doi.org/") + pl.col("document_doi")
        ).alias("document_url"),
        (
            pl.lit("https://github.com/")
            + pl.col("repository_owner")
            + pl.lit("/")
            + pl.col("repository_name")
        ).alias("repository_url"),
    )

    # Add in and move around some columns
    primary_columns = [
        "document_id",
        "document_doi",
        "document_url",
        "repository_id",
        "repository_owner",
        "repository_name",
        "repository_url",
    ]
    secondary_columns = [
        *[
            col for col in hydrated_pairs.columns
            if col not in primary_columns and "document" in col
        ],
        *[
            col for col in hydrated_pairs.columns
            if col not in primary_columns and "repository" in col
        ],
    ]
    
    hydrated_pairs = hydrated_pairs.select(
        *primary_columns,
        *secondary_columns,
    )

    return hydrated_pairs

In [2]:
rs_graph_df = load_collapsed_hydrated_rs_graph_core()
rs_graph_df

document_id,document_doi,document_url,repository_id,repository_owner,repository_name,repository_url,document_title,document_publication_date,document_author_count,document_cited_by_count,document_fwci,document_type,document_is_open_access,document_abstract_content,document_primary_domain,document_primary_field,document_primary_subfield,document_primary_topic,document_primary_topic_score,repository_description,repository_readme_content,repository_contributor_count,repository_is_fork,repository_forks_count,repository_stargazers_count,repository_open_issues_count,repository_size_kb,repository_creation_datetime,repository_last_pushed_datetime
i64,str,str,i64,str,str,str,str,date,u32,i64,f64,str,bool,str,str,str,str,str,f64,str,str,u32,bool,i64,i64,i64,i64,datetime[μs],datetime[μs]
1,"""10.1111/biom.13547""","""https://doi.org/10.1111/biom.1…",1,"""simonbussy""","""binacox""","""https://github.com/simonbussy/…","""Binacox: automatic cut‐point d…",2021-08-18,4,0,0.0,"""article""",true,"""We introduce binacox, a progno…","""Health Sciences""","""Medicine""","""Pathology and Forensic Medicin…","""Molecular Characterization of …",0.9964,"""Automatic cut-points detection…","""# Binacox _binacox_ is a high-…",1,false,2,4,14,15136,2018-01-11 00:42:18,2022-12-08 11:34:52
2,"""10.48550/arxiv.2111.14683""","""https://doi.org/10.48550/arxiv…",2,"""arcelikacikkaynak""","""federated_learning""","""https://github.com/arcelikacik…","""Anomaly Localization in Model …",2021-01-01,1,1,,"""preprint""",true,"""Inserting a backdoor into the …","""Physical Sciences""","""Computer Science""","""Artificial Intelligence""","""Adversarial Robustness in Deep…",0.9953,"""Anomaly Localization in Model …","""# Federated_Learning This rep…",1,false,0,0,0,22,2021-11-29 15:22:38,2021-11-30 05:09:57
3,"""10.18653/v1/2020.findings-emnl…","""https://doi.org/10.18653/v1/20…",3,"""tzshi""","""squall""","""https://github.com/tzshi/squal…","""On the Potential of Lexico-log…",2020-01-01,5,34,2.795,"""article""",true,"""Large-scale semantic parsing d…","""Physical Sciences""","""Computer Science""","""Artificial Intelligence""","""Statistical Machine Translatio…",0.9997,"""Data and Code Release for ""On …","""# Data and Code Release for ""[…",1,false,71,52,0,15496,2020-10-20 14:48:55,2020-11-09 20:59:12
7,"""10.1609/aaai.v33i01.33018666""","""https://doi.org/10.1609/aaai.v…",7,"""ihpdep""","""ldes""","""https://github.com/ihpdep/ldes""","""Robust Estimation of Similarit…",2019-07-17,6,73,4.176,"""article""",true,"""Most of existing correlation f…","""Physical Sciences""","""Computer Science""","""Computer Vision and Pattern Re…","""Visual Object Tracking and Per…",0.9998,"""[AAAI19] Robust Estimation of …","""# Robust Estimation of Similar…",1,false,24,108,10,6694,2018-11-05 18:41:32,2020-04-13 04:01:08
8,"""10.48550/arxiv.2101.08393""","""https://doi.org/10.48550/arxiv…",8,"""google""","""pwlfit""","""https://github.com/google/pwlf…","""Distilling Interpretable Model…",2021-01-01,8,1,,"""preprint""",true,"""The goal of model distillation…","""Physical Sciences""","""Computer Science""","""Artificial Intelligence""","""Explainable Artificial Intelli…",0.988,,"""# About PWLFit is a small libr…",7,false,8,33,0,1738,2020-01-31 02:25:37,2024-08-02 19:42:36
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
157303,"""10.1016/J.SOFTX.2018.07.007""","""https://doi.org/10.1016/J.SOFT…",153581,"""eclipse""","""xacc""","""https://github.com/eclipse/xac…","""A language and hardware indepe…",2018-01-01,6,40,3.477,"""article""",true,"""Heterogeneous high-performance…","""Physical Sciences""","""Computer Science""","""Artificial Intelligence""","""Quantum Computing and Simulati…",1.0,"""XACC - eXtreme-scale Accelerat…","""![XACC](docs/assets/xacc_full_…",21,false,86,167,77,50935,2017-09-19 15:56:59,2024-08-13 13:47:44
157304,"""10.1016/J.SOFTX.2018.10.001""","""https://doi.org/10.1016/J.SOFT…",153582,"""edithcowan""","""sleepmonitor""","""https://github.com/edithcowan/…","""Sleep monitor: A tool for moni…",2018-01-01,5,12,0.32,"""article""",true,"""We present a software package …","""Physical Sciences""","""Engineering""","""Biomedical Engineering""","""Non-contact Physiological Moni…",0.9975,"""System for logging and analysi…","""# sleepMonitor System for logg…",1,false,1,0,0,1450202,2018-04-16 05:12:34,2018-09-21 06:40:23
157305,"""10.1016/J.SOFTX.2018.05.003""","""https://doi.org/10.1016/J.SOFT…",153584,"""dkazanc""","""tomophantom""","""https://github.com/dkazanc/tom…","""TomoPhantom, a software packag…",2018-01-01,5,40,3.102,"""article""",true,"""In the field of computerized t…","""Health Sciences""","""Medicine""","""Radiology, Nuclear Medicine an…","""Positron Emission Tomography I…",1.0,"""Software to generate 2D/3D/4D …","""<table>  <tr>  <td> …",6,false,53,116,7,17897,2017-07-01 22:43:09,2024-08-12 16:22:25
157306,"""10.1016/J.SOFTX.2018.07.003""","""https://doi.org/10.1016/J.SOFT…",153585,"""elseviersoftwarex""","""softx_2018_53""","""https://github.com/elseviersof…","""WaveMaker: The three-dimension…",2018-01-01,3,12,0.762,"""article""",true,"""<h2>Abstract</h2> <i>WaveMaker…","""Physical Sciences""","""Engineering""","""Computational Mechanics""","""Dynamics and Stability of Thin…",1.0,"""WaveMaker: The three-dimension…","""Tested Matlab versions: Matla…",1,false,2,3,0,159,2018-07-27 07:56:31,2018-07-27 08:08:50


In [3]:
# Save this out as the "between sample"
rs_graph_df.write_parquet("rs-graph-v1-all-one-to-one-hydrated-pairs.parquet")

In [4]:
# Create the "within sample
# This is the subset of projects which have
# repositories created before 2022 and a last pushed date of 2023 or later
ai_spanning_rs_graph = rs_graph_df.filter(
    pl.col("repository_creation_datetime") < datetime(2022, 1, 1),
    pl.col("repository_last_pushed_datetime") >= datetime(2023, 1, 1),
)
ai_spanning_rs_graph

document_id,document_doi,document_url,repository_id,repository_owner,repository_name,repository_url,document_title,document_publication_date,document_author_count,document_cited_by_count,document_fwci,document_type,document_is_open_access,document_abstract_content,document_primary_domain,document_primary_field,document_primary_subfield,document_primary_topic,document_primary_topic_score,repository_description,repository_readme_content,repository_contributor_count,repository_is_fork,repository_forks_count,repository_stargazers_count,repository_open_issues_count,repository_size_kb,repository_creation_datetime,repository_last_pushed_datetime
i64,str,str,i64,str,str,str,str,date,u32,i64,f64,str,bool,str,str,str,str,str,f64,str,str,u32,bool,i64,i64,i64,i64,datetime[μs],datetime[μs]
8,"""10.48550/arxiv.2101.08393""","""https://doi.org/10.48550/arxiv…",8,"""google""","""pwlfit""","""https://github.com/google/pwlf…","""Distilling Interpretable Model…",2021-01-01,8,1,,"""preprint""",true,"""The goal of model distillation…","""Physical Sciences""","""Computer Science""","""Artificial Intelligence""","""Explainable Artificial Intelli…",0.988,,"""# About PWLFit is a small libr…",7,false,8,33,0,1738,2020-01-31 02:25:37,2024-08-02 19:42:36
9,"""10.48550/arxiv.2106.13703""","""https://doi.org/10.48550/arxiv…",9,"""irom-lab""","""task_relevant_ood_detection""","""https://github.com/irom-lab/ta…","""Task-Driven Detection of Distr…",2021-01-01,4,1,,"""preprint""",true,"""Our goal is to perform out-of-…","""Physical Sciences""","""Computer Science""","""Artificial Intelligence""","""Active Learning in Machine Lea…",0.9866,"""Code for paper titled: ""Task-D…","""# Task-Driven Out-of-Distribut…",2,false,1,1,0,11522,2021-06-21 17:46:35,2023-06-17 04:29:49
17,"""10.1109/ICCV.2019.00768""","""https://doi.org/10.1109/ICCV.2…",17,"""thunguyenphuoc""","""hologan""","""https://github.com/thunguyenph…","""HoloGAN: Unsupervised Learning…",2019-10-01,5,245,,"""preprint""",true,"""We propose a novel generative …","""Physical Sciences""","""Engineering""","""Computational Mechanics""","""Analysis of Three-Dimensional …",0.9983,"""HoloGAN""","""# HoloGAN Code release for Hol…",1,false,64,331,13,32125,2019-07-31 11:34:43,2024-06-27 05:38:14
23,"""10.48550/arxiv.1806.08756""","""https://doi.org/10.48550/arxiv…",23,"""robotlocomotion""","""pytorch-dense-correspondence""","""https://github.com/robotlocomo…","""Dense Object Nets: Learning De…",2018-01-01,3,107,,"""preprint""",true,"""What is the right object repre…","""Physical Sciences""","""Engineering""","""Control and Systems Engineerin…","""Robotic Grasping and Learning …",0.9999,"""Code for ""Dense Object Nets: …","""### Updates - September 4, 2…",2,false,133,557,11,76751,2018-02-13 16:41:18,2023-05-09 09:22:39
26,"""10.3847/1538-4365/ab620b""","""https://doi.org/10.3847/1538-4…",26,"""guo-jian-wang""","""refann""","""https://github.com/guo-jian-wa…","""Reconstructing Functions and E…",2020-01-01,4,51,3.763,"""article""",true,"""Abstract In this work, we prop…","""Physical Sciences""","""Physics and Astronomy""","""Astronomy and Astrophysics""","""Gamma-Ray Bursts and Supernova…",0.9998,"""A nonlinear interpolating tool…","""ReFANN ====== **ReFANN (Recon…",1,false,8,17,1,303,2020-04-18 01:14:31,2024-07-25 10:59:23
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
157293,"""10.1016/J.SOFTX.2018.09.007""","""https://doi.org/10.1016/J.SOFT…",153571,"""djsilva99""","""heatrapy""","""https://github.com/djsilva99/h…","""Heatrapy: A flexible Python fr…",2018-01-01,3,18,0.683,"""article""",true,"""Although the number of computa…","""Physical Sciences""","""Materials Science""","""Materials Chemistry""","""Nanoscale Thermal Transport in…",0.9965,"""Python library for simulating …","""# heatrapy [![pypi](https://b…",3,false,14,72,6,7123,2017-07-03 11:00:33,2024-09-24 22:56:17
157294,"""10.1016/J.SOFTX.2018.10.002""","""https://doi.org/10.1016/J.SOFT…",153572,"""mattiamontanari""","""opengjk""","""https://github.com/mattiamonta…","""OpenGJK for C, C# and Matlab: …",2018-01-01,2,19,0.213,"""article""",true,"""Implementing a reliable algori…","""Physical Sciences""","""Computer Science""","""Computer Vision and Pattern Re…","""Sampling-Based Motion Planning…",0.9995,"""Fast and reliable implementati…","""<!-- __…",8,false,37,138,2,439,2018-09-01 07:56:17,2023-12-28 12:51:32
157296,"""10.1016/J.SOFTX.2018.09.005""","""https://doi.org/10.1016/J.SOFT…",153574,"""eitcom""","""pyeit""","""https://github.com/eitcom/pyei…","""pyEIT: A python based framewor…",2018-01-01,10,55,1.093,"""article""",true,"""We present a Python-based, ope…","""Physical Sciences""","""Engineering""","""Electrical and Electronic Engi…","""Electrical Tomography Techniqu…",0.9999,"""Python based toolkit for Elect…","""# ![pyeit](https://raw.githubu…",10,false,100,179,15,1042,2016-01-26 04:53:22,2024-03-29 21:32:25
157303,"""10.1016/J.SOFTX.2018.07.007""","""https://doi.org/10.1016/J.SOFT…",153581,"""eclipse""","""xacc""","""https://github.com/eclipse/xac…","""A language and hardware indepe…",2018-01-01,6,40,3.477,"""article""",true,"""Heterogeneous high-performance…","""Physical Sciences""","""Computer Science""","""Artificial Intelligence""","""Quantum Computing and Simulati…",1.0,"""XACC - eXtreme-scale Accelerat…","""![XACC](docs/assets/xacc_full_…",21,false,86,167,77,50935,2017-09-19 15:56:59,2024-08-13 13:47:44


In [5]:
ai_spanning_rs_graph.write_parquet("rs-graph-v1-all-ai-spanning-one-to-one-hydrated-pairs.parquet")