# Following the Preprint to Published Path

The goal of this notebook is to map preprint DOIs to published DOIs, and published DOIs to PubMed Central articles.

In [1]:
import json
import re

import numpy as np
import pandas as pd
from ratelimit import limits, sleep_and_retry
import requests
import tqdm
from urllib.error import HTTPError

In [2]:
# Read the tab-separated bioRxiv article metadata table and preview it.
preprints_df = pd.read_csv(
    filepath_or_buffer="../exploratory_data_analysis/output/biorxiv_article_metadata.tsv",
    sep="\t",
)
preprints_df.head()

Unnamed: 0,author_type,heading,category,document,doi
0,regular article,new results,genetics,440735_v1.xml,10.1101/440735
1,regular article,new results,systems biology,775270_v1.xml,10.1101/775270
2,regular article,new results,genetics,242404_v1.xml,10.1101/242404
3,regular article,new results,neuroscience,872994_v1.xml,10.1101/2019.12.11.872994
4,regular article,new results,developmental biology,080853_v2.xml,10.1101/080853


In [3]:
# Distinct preprint DOIs; the same DOI can appear on several rows of the metadata table.
dois = preprints_df["doi"].unique()
print(len(dois))

71118


In [4]:
FIVE_MINUTES = 300

@sleep_and_retry
@limits(calls=100, period=FIVE_MINUTES)
def call_biorxiv(doi_ids):
    """Query the bioRxiv ``details`` endpoint once per DOI.

    The rate-limit decorators apply to calls of this function (i.e. to
    batches), not to the individual HTTP requests issued inside it.

    Parameters
    ----------
    doi_ids : iterable of str
        Preprint DOIs; each one is appended to the endpoint URL.

    Returns
    -------
    list of dict
        One entry per input DOI, in input order. A successful lookup yields
        the decoded JSON payload. A failed lookup yields a placeholder with an
        empty ``"collection"`` list (so consumers iterating over
        ``result["collection"]`` simply see no records) plus the original
        ``"message"`` entry that records which DOI failed.
    """
    url = "https://api.biorxiv.org/details/biorxiv/"
    responses = []
    for doi in doi_ids:
        try:
            response = requests.get(url+doi).json()
            responses.append(response)
        # Best-effort per DOI: network errors, non-JSON bodies and non-string
        # DOIs must not abort the batch. ``Exception`` (rather than a bare
        # ``except``) still lets KeyboardInterrupt/SystemExit stop a long run.
        except Exception:
            responses.append({
                "collection": [],
                "message":{
                    "relation":{"none":"none"}, 
                    "DOI":doi
                }
            })
        
    return responses

In [5]:
FIVE_MINUTES = 300

@sleep_and_retry
@limits(calls=300, period=FIVE_MINUTES)
def call_pmc(doi_ids, tool_name, email):
    """Send one batch of IDs to the PMC ID converter service.

    Parameters
    ----------
    doi_ids : iterable of str
        Identifiers to convert; they are comma-joined into the ``ids`` field.
    tool_name : str
        Value sent as the ``tool`` query parameter.
    email : str
        Value sent as the ``email`` query parameter.

    Returns
    -------
    requests.Response
        The raw response (JSON format is requested); decoding is left to the
        caller.
    """
    # Let requests build the query string so that reserved characters inside
    # a DOI (e.g. '#', '&', '+', '<', '>') are percent-encoded instead of
    # truncating or splitting the URL.
    return requests.get(
        "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/",
        params={
            "ids": ",".join(doi_ids),
            "tool": tool_name,
            "email": email,
            "format": "json",
        },
    )

# Map preprint DOIs to Published DOIs

In [6]:
# Number of DOIs handed to call_biorxiv per (rate-limited) call.
batch_limit = 100
# One record per (preprint DOI, version) returned by the bioRxiv API.
doi_mapper_records = []

for batch in tqdm.tqdm(range(0, len(dois), batch_limit)):
    response = call_biorxiv(dois[batch:batch+batch_limit])
    doi_mapper_records += [
        {
            "preprint_doi": collection['doi'],
            "posted_date": collection['date'],
            "published_doi": collection['published'],
            "version": collection['version']
        }
        for result in response
        # A failed lookup yields a payload without a "collection" key; treat
        # it as "no records" rather than raising KeyError mid-run.
        for collection in result.get('collection', [])
    ]


100%|██████████| 712/712 [1:01:58<00:00,  5.22s/it]


In [7]:
# Checkpoint the API results as a tab-separated file without the index column.
pd.DataFrame.from_records(doi_mapper_records).to_csv(
    "output/mapped_published_doi_part1.tsv",
    index=False,
    sep="\t",
)

# Map Published DOIs to Preprint Metadata

In [6]:
# Reload the checkpointed preprint -> published DOI table and show its size.
published_doi_df = pd.read_csv("output/mapped_published_doi_part1.tsv", sep="\t")
print(published_doi_df.shape)
published_doi_df.head()

(102494, 4)


Unnamed: 0,preprint_doi,posted_date,published_doi,version
0,10.1101/440735,2018-10-11,,1
1,10.1101/440735,2019-07-02,,2
2,10.1101/440735,2019-11-04,,3
3,10.1101/775270,2019-09-23,10.1016/j.vaccine.2020.06.032,1
4,10.1101/242404,2018-01-04,,1


In [9]:
def _document_version(document):
    """Return the integer version encoded in a file name such as ``440735_v1.xml``."""
    # Capture every digit after "_v" so that versions >= 10 are not reduced
    # to their last digit.
    match = re.search(r"_v(\d+)\.[^.]+$", document)
    if match is None:
        raise ValueError(f"cannot parse a version from document name {document!r}")
    return int(match.group(1))


def _strip_doi_resolver(doi):
    """Remove a leading ``http(s)://doi.org/`` prefix; non-string values (NaN) pass through."""
    if isinstance(doi, str):
        return re.sub(r"^https?://doi\.org/", "", doi)
    return doi


# Join the document-level metadata with the API records on DOI and version.
# This is an inner join, so documents without a matching API record are dropped.
mapped_preprints_df = (
    preprints_df
    .assign(version=lambda x: x.document.apply(_document_version))
    .rename(columns={"doi": "preprint_doi"})
    .merge(
        published_doi_df.assign(
            published_doi=lambda x: x.published_doi.apply(_strip_doi_resolver)
        ),
        on=["preprint_doi", "version"]
    )
)
print(mapped_preprints_df.shape)
mapped_preprints_df.head()

(98035, 8)


Unnamed: 0,author_type,heading,category,document,preprint_doi,version,posted_date,published_doi
0,regular article,new results,genetics,440735_v1.xml,10.1101/440735,1,2018-10-11,
1,regular article,new results,systems biology,775270_v1.xml,10.1101/775270,1,2019-09-23,10.1016/j.vaccine.2020.06.032
2,regular article,new results,genetics,242404_v1.xml,10.1101/242404,1,2018-01-04,
3,regular article,new results,neuroscience,872994_v1.xml,10.1101/2019.12.11.872994,1,2019-12-12,10.7554/eLife.54347
4,regular article,new results,developmental biology,080853_v2.xml,10.1101/080853,2,2017-04-24,10.1242/dev.154971


In [11]:
# Checkpoint the merged preprint/published table (tab-separated, no index column).
mapped_preprints_df.to_csv(
    "output/mapped_published_doi_part2.tsv",
    index=False,
    sep="\t",
)

# Map Published Articles to PMC

In [6]:
# Reload the merged preprint/published table written by the previous section.
preprint_df = pd.read_csv(
    "output/mapped_published_doi_part2.tsv",
    sep="\t",
)
print(preprint_df.shape)
preprint_df.head()

(98035, 8)


Unnamed: 0,author_type,heading,category,document,preprint_doi,version,posted_date,published_doi
0,regular article,new results,genetics,440735_v1.xml,10.1101/440735,1,2018-10-11,
1,regular article,new results,systems biology,775270_v1.xml,10.1101/775270,1,2019-09-23,10.1016/j.vaccine.2020.06.032
2,regular article,new results,genetics,242404_v1.xml,10.1101/242404,1,2018-01-04,
3,regular article,new results,neuroscience,872994_v1.xml,10.1101/2019.12.11.872994,1,2019-12-12,10.7554/eLife.54347
4,regular article,new results,developmental biology,080853_v2.xml,10.1101/080853,2,2017-04-24,10.1242/dev.154971


In [7]:
# Load the xz-compressed, tab-separated PubMed Central journal/paper map.
pmc_df = pd.read_csv(
    filepath_or_buffer="../../pmc/exploratory_data_analysis/output/pubmed_central_journal_paper_map.tsv.xz",
    sep="\t",
)
pmc_df.head()

Unnamed: 0,journal,article_type,doi,pmcid
0,Environ_Health,research-article,10.1186/1476-069X-5-22,PMC1552054
1,Environ_Health,research-article,10.1186/1476-069X-4-12,PMC1226148
2,Environ_Health,correction,10.1186/s12940-018-0415-9,PMC6124016
3,Environ_Health,research-article,10.1186/s12940-017-0316-3,PMC5635510
4,Environ_Health,research-article,10.1186/1476-069X-10-46,PMC3125232


In [8]:
# Lower-case the DOIs on both sides so the join ignores case, then left-join
# so every preprint row is kept whether or not a PMCID is found.
final_df = pd.merge(
    preprint_df.assign(published_doi=lambda x: x.published_doi.str.lower()),
    (
        pmc_df
        .loc[:, ["doi", "pmcid"]]
        .assign(doi=lambda x: x.doi.str.lower())
        .dropna()
        .rename(index=str, columns={"doi": "published_doi"})
    ),
    on="published_doi",
    how="left",
)
print(final_df.shape)
final_df.head()

(98035, 9)


Unnamed: 0,author_type,heading,category,document,preprint_doi,version,posted_date,published_doi,pmcid
0,regular article,new results,genetics,440735_v1.xml,10.1101/440735,1,2018-10-11,,
1,regular article,new results,systems biology,775270_v1.xml,10.1101/775270,1,2019-09-23,10.1016/j.vaccine.2020.06.032,
2,regular article,new results,genetics,242404_v1.xml,10.1101/242404,1,2018-01-04,,
3,regular article,new results,neuroscience,872994_v1.xml,10.1101/2019.12.11.872994,1,2019-12-12,10.7554/elife.54347,
4,regular article,new results,developmental biology,080853_v2.xml,10.1101/080853,2,2017-04-24,10.1242/dev.154971,


In [9]:
# Fill in missing links: published DOIs that did not receive a PMCID from the
# local PMC table.
missing_ids = (
    final_df
    .loc[lambda x: x.published_doi.notnull() & x.pmcid.isnull(), "published_doi"]
    .unique()
)
print(len(missing_ids))

22345


In [10]:
# Ask the PMC ID converter about the unmatched DOIs, `chunksize` at a time,
# keeping only the records that come back with a PMCID.
chunksize = 100
data = []
for start in tqdm.tqdm(range(0, len(missing_ids), chunksize)):
    records = call_pmc(
        missing_ids[start:start + chunksize],
        'model_name',
        'email@server.com',
    ).json()['records']

    data.extend(
        {"pmcid": record["pmcid"], "published_doi": record['doi']}
        for record in records
        if "pmcid" in record
    )

100%|██████████| 224/224 [01:08<00:00,  3.27it/s]


In [11]:
# Tabulate the (pmcid, published_doi) pairs recovered from the ID converter.
missing_pmcids = pd.DataFrame(data)
missing_pmcids.head(5)

Unnamed: 0,pmcid,published_doi
0,PMC7494356,10.7554/eLife.54347
1,PMC5665486,10.1242/dev.154971
2,PMC7293348,10.1038/s41467-020-16846-w
3,PMC4536314,10.1261/rna.051557.115
4,PMC6328047,10.1016/j.celrep.2018.11.046


In [28]:
# Combine the PMCIDs found in the local PMC table (pmcid_x) with those
# recovered from the ID converter (pmcid_y) and write the final mapping.
(
    final_df
    .merge(
        missing_pmcids.assign(published_doi=lambda x:x.published_doi.str.lower()),
        on="published_doi", how="left"
    )
    .assign(
        # Coalesce rather than concatenate strings: prefer the local match and
        # fall back to the converter result, leaving NaN when neither exists.
        final_pmcid=lambda x: x.pmcid_x.fillna(x.pmcid_y),
        # Computed on the merged frame so the flag stays row-aligned even if
        # the merge changes the number of rows. NaN is removed from the
        # lookup values so a missing PMCID is never flagged as present.
        pmcoa=lambda x: x.pmcid_x.isin(pmc_df.pmcid.dropna()),
    )
    .drop(["pmcid_x", "pmcid_y"], axis=1)
    .rename(columns={"final_pmcid":"pmcid"})
    .replace('', np.nan)
    .to_csv(
        "output/mapped_published_doi.tsv",
        sep="\t", index=False
    )
)