# Grab the bioRxiv subset from Polka et al. (2021)

In [1]:
from pathlib import Path
import tarfile

import pandas as pd
import requests
import tqdm
import urllib.request as request

In [3]:
# Read the all_pairs CSV export into a DataFrame; pandas accepts a Path directly.
manual_papers_df = pd.read_csv(Path("output") / "all_pairs_2021-02-11.csv")
# Show the first rows transposed so each column is displayed as a row.
manual_papers_df.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,1,2,3,4,5
line_break_and_label,(((((( Preprint DOI:,(((((( Preprint DOI:,(((((( Preprint DOI:,(((((( Preprint DOI:,(((((( Preprint DOI:
source,medrxiv,medrxiv,medrxiv,medrxiv,medrxiv
doi,10.1101/19005710,10.1101/19006031,10.1101/19006171,10.1101/19006478,10.1101/19006502
abstract_source,crossref,copied,copied,copied,crossref
line_break_version,((((((,((((((,((((((,((((((,((((((
n_versions,1,2,2,2,1
line_break_and_fiducial,(((((( We have manually inserted this sentence...,(((((( We have manually inserted this sentence...,(((((( We have manually inserted this sentence...,(((((( We have manually inserted this sentence...,(((((( We have manually inserted this sentence...
posted_date,########,########,########,########,########
covid_preprint,False,False,False,False,False


In [4]:
# Inspect the distinct labels present in the `exclude` column.
manual_papers_df["exclude"].unique()

array(['keep', 'exclude',
       'exclude (after annotating, we realized this published paper had no abstract and the abstract was copied from the first paragraph of the full text)',
       'keep (note that we redid the annotations on this one as PubMed had a truncated version of the published abstract)',
       nan], dtype=object)

In [5]:
# Keep rows whose `exclude` label contains "keep" and whose source is bioRxiv.
# Missing values are replaced with "" first so the substring test never sees NaN.
papers_to_download = manual_papers_df.fillna("").loc[
    lambda filled: filled["exclude"].str.contains("keep")
    & (filled["source"] == "biorxiv")
]
print(papers_to_download.shape)
papers_to_download.head()

(60, 46)


Unnamed: 0.1,Unnamed: 0,line_break_and_label,source,doi,abstract_source,line_break_version,n_versions,line_break_and_fiducial,posted_date,covid_preprint,...,Highest_change,Raw scores,Sum of minus scores,Sum of plus scores,Diff pos-neg scores,Contains a 2 (binary),Contains a 2,"Either 0, 1 or 2",Conclusions (raw),Results (raw)
53,54,(((((( Preprint DOI:,biorxiv,10.1101/2019.12.13.876110,crossref,((((((,1,(((((( We have manually inserted this sentence...,########,False,...,0,,0,0,0,0,0,0,[],[]
54,55,(((((( Preprint DOI:,biorxiv,10.1101/2019.12.15.877035,crossref,((((((,1,(((((( We have manually inserted this sentence...,########,False,...,0,,0,0,0,0,0,0,[],[]
57,58,(((((( Preprint DOI:,biorxiv,10.1101/2019.12.17.880302,crossref,((((((,1,(((((( We have manually inserted this sentence...,########,False,...,0,1 1 1 1 1+ 1 1 1 1 1+ 1+ 1+ 1 1+ 1+ 1 1 1+ 1 1+,0,20,20,0,0,1,"[['added', '1'], ['nounchange', '1'], ['nounch...","[['statinfo', '1+'], ['added', '1'], ['added',..."
59,60,(((((( Preprint DOI:,biorxiv,10.1101/2019.12.18.881391,crossref,((((((,1,(((((( We have manually inserted this sentence...,########,False,...,1,,0,0,0,0,0,0,[],[]
60,61,(((((( Preprint DOI:,biorxiv,10.1101/2019.12.19.882274,crossref,((((((,1,(((((( We have manually inserted this sentence...,########,False,...,1,1 1 1 1- 1 1 1 1 1+ 1+ 1 1,1,11,10,0,0,1,"[['removed', '1'], ['removed', '1']]","[['nounchange', '1'], ['nounchange', '1'], ['a..."


In [6]:
# Collect the identifier part (the text before the first "_") of every XML file
# already present under output/biorxiv_xml_files, searching recursively.
parsed_files = [
    xml_path.stem.split("_")[0]
    for xml_path in Path("output/biorxiv_xml_files").rglob("*xml")
]

In [7]:
# Download the latest bioRxiv source XML for every selected preprint that is not
# on disk yet, and record the published DOI reported by the bioRxiv API.
# NOTE(review): preprints whose XML already exists are skipped before the API
# call, so they are NOT added to `published_doi_map` on a re-run.
biorxiv_xml_dir = Path("output/biorxiv_xml_files")
# Create the target folder up front so the write below works on a fresh checkout.
biorxiv_xml_dir.mkdir(parents=True, exist_ok=True)
request_timeout = 60  # seconds; avoids hanging forever on a stalled connection

published_doi_map = []
for _, paper in tqdm.tqdm(
    papers_to_download.iterrows(), total=len(papers_to_download)
):
    user_doi = paper["doi"]
    file_name = user_doi.split("/")[-1]

    if file_name in parsed_files:
        continue

    api_url = f"https://api.biorxiv.org/details/biorxiv/{user_doi}"
    api_response = requests.get(api_url, timeout=request_timeout)
    # Fail loudly on HTTP errors instead of parsing an error body.
    api_response.raise_for_status()
    versions = api_response.json().get("collection", [])
    if not versions:
        # Nothing to download for this DOI; report it and move on.
        tqdm.tqdm.write(f"No bioRxiv records returned for {user_doi}; skipping.")
        continue

    # The last entry of the collection is used as the most recent version.
    latest_paper = versions[-1]
    version_count = len(versions)
    published_doi_map.append(
        {"biorxiv_doi": user_doi, "published_doi": latest_paper["published"]}
    )

    doc_url = "http://biorxiv.org/content"
    file_url = f"{doc_url}/early/{latest_paper['date'].replace('-', '/')}/{file_name}.source.xml"

    xml_response = requests.get(file_url, timeout=request_timeout)
    # Without this check an HTML error page would be saved as "<doi>_vN.xml" and
    # later treated as an already-downloaded paper.
    xml_response.raise_for_status()

    with open(biorxiv_xml_dir / f"{file_name}_v{version_count}.xml", "wb") as outfile:
        outfile.write(xml_response.content)

60it [00:57,  1.03it/s]


In [8]:
# Reuse the cached preprint -> published DOI table when it exists; otherwise
# build it from the records gathered during the download loop and cache it.
if Path("output/polka_et_al_mapped_subset.tsv").exists():
    mapped_papers_df = pd.read_csv("output/polka_et_al_mapped_subset.tsv", sep="\t")
else:
    mapped_papers_df = pd.DataFrame.from_records(published_doi_map)
    mapped_papers_df.to_csv(
        "output/polka_et_al_mapped_subset.tsv",
        sep="\t",
        index=False,
    )
mapped_papers_df.head()

Unnamed: 0,biorxiv_doi,published_doi
0,10.1101/2019.12.13.876110,10.1016/j.celrep.2020.02.018
1,10.1101/2019.12.15.877035,10.1016/j.celrep.2020.02.114
2,10.1101/2019.12.17.880302,10.1021/acsabm.9b01171
3,10.1101/2019.12.18.881391,10.1128/JVI.00426-20
4,10.1101/2019.12.19.882274,10.3389/fpls.2020.00355


# Perform DOI to PM(C)ID Conversion

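Copy the list of published DOIs printed by the next cell and paste it into the text box of NCBI's ID converter: https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/. Download the results as a CSV file, save it as `output/mapped_doi_to_pmc.csv`, and then continue with the cells below, which parse that file.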

In [9]:
# Print one published DOI per line so the list can be copied in one go.
for published_doi in mapped_papers_df["published_doi"].tolist():
    print(published_doi)

10.1016/j.celrep.2020.02.018
10.1016/j.celrep.2020.02.114
10.1021/acsabm.9b01171
10.1128/JVI.00426-20
10.3389/fpls.2020.00355
10.1007/s00300-020-02670-x
10.1182/blood.2019002867
10.1128/AAC.00086-20
10.1038/s41586-020-2012-7
10.1038/s41564-020-0688-y
10.1016/j.ijid.2020.01.050
10.1002/jmv.25688
10.3390/jcm9020498
10.1016/j.meegid.2020.104212
10.1080/22221751.2020.1729069
10.1093/bioinformatics/btaa145
10.1002/jmv.25700
10.3390/v12030254
10.1126/science.abb2507
10.1038/s41586-020-2169-0
10.1016/j.bbrc.2020.03.044
10.3390/v12040428
10.1016/j.cell.2020.02.058
10.1038/s41586-020-2223-y
10.1371/journal.pone.0230295
10.3934/mbe.2020159
10.1002/pro.3873
10.1016/j.apsb.2020.04.009
10.1016/j.jmoldx.2020.03.006
10.1038/s41422-020-0305-x
10.1128/AAC.00440-20
10.1126/science.abb7269
10.1093/ve/veaa027
10.3390/jcm9040982
10.1111/eva.12980
10.7554/eLife.57003
10.15252/embj.2019103365
10.1186/s12864-019-6294-9
10.1016/j.ijpara.2020.01.002
10.1091/mbc.E19-09-0550
10.1093/bioinformatics/btz899
10.1371/

In [11]:
# Load the DOI -> PMID/PMCID conversion table that was saved as a CSV file.
mapped_doi_pmcids = pd.read_csv(Path("output") / "mapped_doi_to_pmc.csv")
mapped_doi_pmcids.head()

Unnamed: 0,PMID,PMCID,DOI,Version,MID,IsCurrent,IsLive,ReleaseDate,Msg
0,32295925.0,PMC7307142,10.1128/JVI.00426-20,,,,1,,
1,32373138.0,PMC7176908,10.3389/fpls.2020.00355,,,,1,,
2,32128578.0,PMC7243144,10.1182/blood.2019002867,,,,1,,
3,32284379.0,PMC7269492,10.1128/AAC.00086-20,,,,1,,
4,32015507.0,PMC7095418,10.1038/s41586-020-2012-7,,,,1,,


In [12]:
# Reuse the cached DOI/PMCID table when present; otherwise join the published
# DOIs to the converter output, keep only rows that have a PMCID, and cache it.
if Path("output/polka_et_al_pmc_mapped_subset.tsv").exists():
    pmcid_mapped_papers_df = pd.read_csv(
        "output/polka_et_al_pmc_mapped_subset.tsv", sep="\t"
    )
else:
    merged_df = mapped_papers_df.merge(
        mapped_doi_pmcids, left_on="published_doi", right_on="DOI"
    )
    # `DOI` duplicates `published_doi` after the join, so it is dropped.
    pmcid_mapped_papers_df = merged_df.dropna(subset=["PMCID"]).drop(columns="DOI")
    pmcid_mapped_papers_df.to_csv(
        "output/polka_et_al_pmc_mapped_subset.tsv", sep="\t", index=False
    )

print(pmcid_mapped_papers_df.shape)
pmcid_mapped_papers_df.head()

(47, 10)


Unnamed: 0,biorxiv_doi,published_doi,PMID,PMCID,Version,MID,IsCurrent,IsLive,ReleaseDate,Msg
0,10.1101/2019.12.18.881391,10.1128/JVI.00426-20,32295925.0,PMC7307142,,,,1,,
1,10.1101/2019.12.19.882274,10.3389/fpls.2020.00355,32373138.0,PMC7176908,,,,1,,
2,10.1101/2020.01.13.905190,10.1182/blood.2019002867,32128578.0,PMC7243144,,,,1,,
3,10.1101/2020.01.21.914929,10.1128/AAC.00086-20,32284379.0,PMC7269492,,,,1,,
4,10.1101/2020.01.22.914952,10.1038/s41586-020-2012-7,32015507.0,PMC7095418,,,,1,,


# Download Files from PMCOA's FTP server

In [13]:
# List the PMC Open Access bulk directory and collect the PMCIDs to look for.
pmc_open_access_url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/"
# Use a context manager so the FTP connection is closed once the listing is read.
with request.urlopen(pmc_open_access_url) as listing_response:
    files = listing_response.read().decode("utf-8").splitlines()
# The last space-separated token of each listing line is taken as the file name.
tar_files = [f.split(" ")[-1] for f in files]
pmcid_list = pmcid_mapped_papers_df.PMCID.tolist()

In [15]:
# Stream every XML tarball listed on the server and save only the papers whose
# file stem matches one of the wanted PMCIDs. The whole step is skipped when the
# output folder already contains something.
pmcoa_xml_dir = Path("output/pmcoa_xml_files")
# Create the folder first: iterdir() raises FileNotFoundError on a missing path.
pmcoa_xml_dir.mkdir(parents=True, exist_ok=True)
# A set gives constant-time membership tests for the per-member check below.
wanted_pmcids = set(pmcid_list)

if not any(pmcoa_xml_dir.iterdir()):
    # Cycle through each tar file on the server
    for tar_file in tqdm.tqdm(tar_files):

        # If not xml files skip
        if ".xml" not in Path(tar_file).suffixes:
            continue

        # If temp file skip
        if Path(tar_file).suffix == ".tmp":
            continue

        # Grab the file from the tarfile
        print(f"Requesting {pmc_open_access_url}{tar_file}....")
        # Both handles are context-managed: closing the TarFile does not close a
        # file object that was passed in through `fileobj`.
        with request.urlopen(
            f"{pmc_open_access_url}{tar_file}"
        ) as requested_file_stream, tarfile.open(
            fileobj=requested_file_stream, mode="r:gz"
        ) as open_stream:
            for pmc_paper in open_stream:
                # Only regular files can be extracted; extractfile() returns
                # None for directories and other special members.
                if not pmc_paper.isfile():
                    continue

                paper_pathlib = pmcoa_xml_dir / pmc_paper.name
                if paper_pathlib.stem not in wanted_pmcids:
                    continue

                new_paper = open_stream.extractfile(pmc_paper)
                # The member name may include sub-folders; create them as needed.
                paper_pathlib.parent.mkdir(parents=True, exist_ok=True)

                with open(paper_pathlib, "wb") as outfile:
                    outfile.write(new_paper.read())

  0%|          | 0/16 [00:00<?, ?it/s]

Requesting ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/comm_use.A-B.xml.tar.gz....


 12%|█▎        | 2/16 [07:14<50:38, 217.07s/it]

Requesting ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/comm_use.C-H.xml.tar.gz....


 25%|██▌       | 4/16 [13:21<41:24, 207.01s/it]

Requesting ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/comm_use.I-N.xml.tar.gz....


 38%|███▊      | 6/16 [25:10<41:53, 251.34s/it]

Requesting ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/comm_use.O-Z.xml.tar.gz....


 50%|█████     | 8/16 [40:42<42:05, 315.66s/it]

Requesting ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz....


 62%|██████▎   | 10/16 [42:27<23:40, 236.78s/it]

Requesting ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.C-H.xml.tar.gz....


 75%|███████▌  | 12/16 [44:55<12:31, 187.92s/it]

Requesting ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.I-N.xml.tar.gz....


 88%|████████▊ | 14/16 [49:23<05:43, 171.68s/it]

Requesting ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.O-Z.xml.tar.gz....


100%|██████████| 16/16 [51:34<00:00, 193.39s/it]
