In [1]:

import pandas as pd
from pathlib import Path
import sqlite3

# Dataset of a Study of Computational reproducibility of Jupyter notebooks from biomedical publications
# https://zenodo.org/records/8226725
with sqlite3.connect(
    Path(
        "~/Downloads/computational-reproducibility-pmc/computational-reproducibility-pmc/analyses/db.sqlite"
    ).expanduser()
) as conn:
    df = pd.read_sql("SELECT * FROM ARTICLE", conn)

df.iloc[0]

id                                                                     1
journal_id                                                             1
name                   ElasticBLAST: accelerating sequence search via...
pmid                                                                 NaN
pmc                                                             10040096
publisher_id                                                        5245
doi                                           10.1186/s12859-023-05245-9
subject                                                         Software
published_date                                                2023-03-26
received_date                                                 2023-01-04
accepted_date                                                 2023-03-21
license_type                                                        None
copyright_statement    © This is a U.S. Government work and not under...
keywords               BLAST;Cloud computing;Alignm

In [2]:
# Create a pairing of the doi and the repo
repo_doi_pairs = []

df = df.dropna(subset=["doi", "repositories"])
print(len(df))

for _, row in df.iterrows():
    # Convert repos to list
    repos = row["repositories"].split(";")
    
    # Add each repo to the list
    for repo in repos:
        repo_doi_pairs.append({
            "doi": row["doi"],
            "repo": repo,
        })

repo_doi_pairs = pd.DataFrame(repo_doi_pairs)
repo_doi_pairs.sample(3)

3437


Unnamed: 0,doi,repo
5386,10.3389/fnbot.2019.00065,https://github.com/poppy-project/pypot
6600,10.1099/mgen.0.000071,https://github.com/dmnfarrell/gordon-group
1204,10.1016/j.csbj.2022.08.035,


In [3]:
sample_of_pairs = repo_doi_pairs.sample(50)

from rs_graph.data.enrichment.semantic_scholar import get_extended_paper_details

# Get extended paper details
sample_of_pairs["doi"] = sample_of_pairs.doi.apply(lambda x: f"doi:{x}")
sample_of_pairs_details = get_extended_paper_details(
    sample_of_pairs.doi.tolist(),
    filter_out_nones=False,
)

# Attach paper details to truede as columns
biosciences_notebook_repro = []
for repo, paper_detail in zip(sample_of_pairs.repo, sample_of_pairs_details, strict=True):
    if paper_detail is not None:
        biosciences_notebook_repro.append(
            {
                "corpus_id": paper_detail.corpus_id,
                "doi": paper_detail.doi,
                "title": paper_detail.title,
                "abstract": paper_detail.abstract,
                "repo": repo,
                "tldr": paper_detail.tldr,
                "primary_topic": paper_detail.primary_topic,
                "secondary_topic": paper_detail.secondary_topic,
            }
        )

biosciences_notebook_repro = pd.DataFrame(biosciences_notebook_repro)
biosciences_notebook_repro.sample(3)


  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,corpus_id,doi,title,abstract,repo,tldr,primary_topic,secondary_topic
44,212736837,doi:10.7717/peerj.10316,A Bayesian brain model of adaptive behavior: a...,Adaptive behavior emerges through a dynamic in...,https://github.com/stefanradev93/BayesFlow,This work proposes and validate a new computat...,Biology,Medicine
38,237149049,doi:10.1128/mSystems.00811-21,Separation of Donor and Recipient Microbial Di...,We assumed that the enrichment of successful g...,https://github.com/ivartb/RECAST,The recipient intestine colonization analysis ...,Medicine,Biology
8,236504044,doi:10.3389/fpls.2021.715309,An Affordable Image-Analysis Platform to Accel...,Recent technical advances in the computer-visi...,https://github.com/tensorflow/models,A platform that allows real-time stomata detec...,Medicine,Computer Science


In [4]:
# Create a column for "singular_topic" which is preferred to be "secondary_topic"
# but if that is None then use "primary_topic"
biosciences_notebook_repro["singular_topic"] = (
    biosciences_notebook_repro.secondary_topic.fillna(
        biosciences_notebook_repro.primary_topic,
    )
)

# Drop papers without a singular topic
biosciences_notebook_repro = biosciences_notebook_repro.dropna(subset=["singular_topic"])
biosciences_notebook_repro.singular_topic.value_counts()

Computer Science         16
Biology                  14
Medicine                 11
Chemistry                 2
Environmental Science     2
Geography                 1
Physics                   1
Engineering               1
Name: singular_topic, dtype: int64

In [5]:
# for each repo, get the reademe (if it exists) and the description.
from dotenv import load_dotenv
from ghapi.all import GhApi
from rs_graph.data.enrichment.github import get_repo_parts_from_url
import base64

load_dotenv()

api = GhApi()

def get_repo_readme(repo: str) -> str | None:
    repo_parts = get_repo_parts_from_url(repo)
    try:
        b64_encoded_readme = api.repos.get_readme(
                owner=repo_parts.owner,
                repo=repo_parts.repo,
            )["content"]
        
        return base64.b64decode(b64_encoded_readme).decode("utf-8")
    except:
        return None
    
# Get the READMEs and then then embeddings for each README
from tqdm import tqdm
import time

readmes = []
for repo in tqdm(biosciences_notebook_repro["repo"], total=len(biosciences_notebook_repro)):
    readmes.append(get_repo_readme(repo))
    time.sleep(0.85)

# Attach to papers
biosciences_notebook_repro["readme"] = readmes

# Drop nulls
biosciences_notebook_repro = biosciences_notebook_repro.dropna(subset=["readme"])
biosciences_notebook_repro.shape

100%|██████████| 48/48 [00:53<00:00,  1.11s/it]


(40, 10)

In [6]:
# Get the embeddings for each README
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

readme_embeddings = model.encode(biosciences_notebook_repro["readme"].tolist(), show_progress_bar=True)
readme_embeddings.shape

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

(40, 768)

In [7]:
import numpy as np
# Attach back to dataframe
biosciences_notebook_repro["readme_embedding"] = [np.array(vec) for vec in readme_embeddings.tolist()]
biosciences_notebook_repro.sample(3)

Unnamed: 0,corpus_id,doi,title,abstract,repo,tldr,primary_topic,secondary_topic,singular_topic,readme,readme_embedding
19,46862333,doi:10.1016/j.cels.2017.11.014,Enhancing Evolutionary Couplings with Deep Con...,,https://github.com/soedinglab/CCMpred,"DeepContact is introduced, a convolutional neu...",Medicine,Computer Science,Computer Science,# CCMpred\n\n[![Travis](https://img.shields.io...,"[-0.00813202653080225, -0.02442438155412674, -..."
33,231149379,doi:10.3389/fbioe.2020.612832,Multi-Omics Driven Metabolic Network Reconstru...,An oleaginous yeast Rhodosporidium toruloides ...,https://github.com/stcoradetti/RBseq,An oleaginous yeast Rhodosporidium toruloides ...,Medicine,Environmental Science,Environmental Science,# RBseq\nTrack fitness of deletion mutants wit...,"[0.0118949543684721, 0.007754592224955559, 0.0..."
6,235718678,doi:10.1186/s13321-021-00525-z,Nonadditivity in public and inhouse data: impl...,,https://github.com/KramerChristian/Nonadditivi...,Nonadditivity (NA) analysis should be applied ...,Computer Science,Medicine,Medicine,# Nonadditivity analysis\n\n[![Code style: bla...,"[0.02291882038116455, 0.0009123127674683928, -..."


In [8]:
# Get the embeddings for each paper abstract
# or if the abstract is None, use the title
paper_texts = (
    biosciences_notebook_repro["abstract"].fillna(
        biosciences_notebook_repro["tldr"]
    ).fillna(biosciences_notebook_repro["title"]).tolist()
)

# Get the embeddings for each paper abstract
paper_embeddings = model.encode(paper_texts, show_progress_bar=True)

# Attach back to dataframe
biosciences_notebook_repro["paper_embedding"] = [np.array(vec) for vec in paper_embeddings.tolist()]
biosciences_notebook_repro.sample(3)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,corpus_id,doi,title,abstract,repo,tldr,primary_topic,secondary_topic,singular_topic,readme,readme_embedding,paper_embedding
21,245916661,doi:10.1016/j.isci.2022.104769,Practical and thermodynamic constraints on ele...,,https://github.com/barstowlab/electroco2,It is demonstrated that electromicrobial produ...,Medicine,Biology,Biology,# electroCO2\n\nBuz Barstow\n2021-12-26\n\nCod...,"[0.04969074949622154, -0.002880058716982603, -...","[0.05940726771950722, 0.021711746230721474, -0..."
6,235718678,doi:10.1186/s13321-021-00525-z,Nonadditivity in public and inhouse data: impl...,,https://github.com/KramerChristian/Nonadditivi...,Nonadditivity (NA) analysis should be applied ...,Computer Science,Medicine,Medicine,# Nonadditivity analysis\n\n[![Code style: bla...,"[0.02291882038116455, 0.0009123127674683928, -...","[-0.04822631925344467, 0.057587385177612305, -..."
9,248442073,doi:10.3390/life12050648,The R Language: An Engine for Bioinformatics a...,The R programming language is approaching its ...,https://github.com/jhallen/joes-sandbox,An historical chronicle of how R became what i...,Medicine,Computer Science,Computer Science,## Joe Allen's Open Source Projects\n\n### Pro...,"[0.018987294286489487, 0.018373312428593636, -...","[-0.029452510178089142, -0.0029465905390679836..."


In [9]:
from skops import io as skio

# Load the model
model = skio.load("repo-paper-matching-model.skops", trusted=True)

# Create interaction embeddings and train logistic regression CV
print("Creating interaction embeddings")
biosciences_notebook_repro["interaction_embedding"] = biosciences_notebook_repro.apply(
    lambda row: row.paper_embedding * row.readme_embedding,
    axis=1,
)

print("Predicting")
biosciences_notebook_repro["prediction"] = model.predict(
    biosciences_notebook_repro["interaction_embedding"].tolist()
)

biosciences_notebook_repro.sample(3)

Creating interaction embeddings
Predicting


Unnamed: 0,corpus_id,doi,title,abstract,repo,tldr,primary_topic,secondary_topic,singular_topic,readme,readme_embedding,paper_embedding,interaction_embedding,prediction
45,220305302,doi:10.1016/j.devcel.2022.01.008,A single cell Arabidopsis root atlas reveals d...,,https://github.com/ohlerlab/COPILOT,The results suggest the existence of an altern...,Biology,Medicine,Medicine,# COPILOT (Cell preprOcessing PIpeline kaLlist...,"[-0.06168226897716522, -0.04459858685731888, -...","[0.011394570581614971, -0.04304368793964386, 0...","[-0.0007028429674944686, 0.0019196876552355358...",match
30,235308894,doi:10.1038/s42003-021-02153-7,Transcription factor enrichment analysis (TFEA...,,https://github.com/Dowell-Lab/mumerge,Transcription factor enrichment analysis (TFEA...,Medicine,Computer Science,Computer Science,﻿# muMerge\r\n\r\nmuMerge is a tool for combin...,"[-0.03200733661651611, -0.020369168370962143, ...","[-0.014159103855490685, -0.0010391349205747247...","[0.00045319520329090146, 2.116631415733294e-05...",no_match
26,208567113,doi:10.1093/bioinformatics/btaa624,Style transfer with variational autoencoders i...,The transcriptomic data is being frequently us...,https://github.com/jdrudolph/goenrich,The proposed solution is based on Variational ...,Medicine,Computer Science,Computer Science,goenrich\n========\n\n.. image:: https://badge...,"[-0.03388982266187668, 0.03242969140410423, -0...","[-0.03258673474192619, 0.132096529006958, -0.0...","[0.0011043586615334944, 0.0042838496712489516,...",no_match


In [10]:
# Print dataframe of matches
for _, row in biosciences_notebook_repro[biosciences_notebook_repro.prediction == "match"].sample(5).iterrows():
    print(row.repo)
    print(row.title)
    print(row.doi)
    print()

https://github.com/zxing/zxing
Performant barcode decoding for herbarium specimen images using vector‐assisted region proposals (VARP)
doi:10.1002/aps3.11436

https://github.com/stcoradetti/RBseq
Multi-Omics Driven Metabolic Network Reconstruction and Analysis of Lignocellulosic Carbon Utilization in Rhodosporidium toruloides
doi:10.3389/fbioe.2020.612832

https://github.com/soedinglab/CCMpred
Enhancing Evolutionary Couplings with Deep Convolutional Neural Networks
doi:10.1016/j.cels.2017.11.014

https://github.com/r3fang/SnapATAC
Single nucleus multi-omics identifies human cortical cell regulatory genome diversity
doi:10.1016/j.xgen.2022.100107

https://github.com/collaborativebioinformatics/cov2db
The third international hackathon for applying insights into large-scale genomic composition to use cases in a wide range of organisms
doi:10.12688/f1000research.110194.1



In [11]:
# Print dataframe of matches
for _, row in biosciences_notebook_repro[biosciences_notebook_repro.prediction == "no_match"].sample(5).iterrows():
    print(row.repo)
    print(row.title)
    print(row.doi)
    print()

https://github.com/owlcs/owlapi
Semantic similarity and machine learning with ontologies
doi:10.1093/bib/bbaa199

https://github.com/BlueBrain/BlueBrainGraph
A Machine-Generated View of the Role of Blood Glucose Levels in the Severity of COVID-19
doi:10.3389/fpubh.2021.695139

https://github.com/information-artifact-ontology/IAO
Community standards for open cell migration data
doi:10.1093/gigascience/giaa041

https://github.com/duderstadt-lab/Born-to-slide
Mobile origin-licensing factors confer resistance to conflicts with RNA polymerase
doi:10.1016/j.celrep.2022.110531

https://github.com/Dowell-Lab/mumerge
Transcription factor enrichment analysis (TFEA) quantifies the activity of multiple transcription factors from a single experiment
doi:10.1038/s42003-021-02153-7

