# Convert biorxiv and medrxiv into Pubtator's bioC format

Convert biorxiv and medrxiv preprints into Pubtator's BioCXML format, so Pubtator's taggers can work.
This notebook only uses the most current version of a preprint.

In [1]:
from pathlib import Path

import lxml.etree as ET
import pandas as pd
import plydata as ply
import plydata.tidy as ply_tdy
import tqdm

from biovectors_modules.preprint_converter import convert_to_bioc

In [2]:
biorxiv_folder = Path("output/biorxiv_medrxiv_dump/biorxiv")
medrxiv_folder = Path("output/biorxiv_medrxiv_dump/medrxiv")

## Get the time stamps of all medrxiv and biorxiv preprints

In [3]:
if not Path("output/all_medrxiv_biorxiv_timestamps.tsv").exists():
    data_rows = []
    xml_parser = ET.XMLParser(encoding="UTF-8", recover=True)
    all_biomed_preprints = list(biorxiv_folder.rglob("*xml")) + list(
        medrxiv_folder.rglob("*xml")
    )
    for xml_file in tqdm.tqdm(all_biomed_preprints):
        doc_obj = ET.parse(str(xml_file), parser=xml_parser).getroot()
        date_row = doc_obj.xpath("//date[@date-type='accepted']/*/text()")
        attribute_label = "accepted"
        if len(date_row) == 0:
            date_row = doc_obj.xpath("//date[@date-type='received']/*/text()")
            attribute_label = "received"
        # In case there is no date found
        if len(date_row) == 0:
            doc_date = "NA"
        else:
            doc_date = "-".join([date_row[1], date_row[0], date_row[2]])
        data_rows.append(
            {
                "date": doc_date,
                "repository": xml_file.parents[0].name,
                "doc_id": xml_file.stem,
                "attribute": attribute_label,
            }
        )

In [4]:
if not Path("output/all_medrxiv_biorxiv_timestamps.tsv").exists():
    preprint_dates_df = (
        pd.DataFrame.from_records(data_rows)
        >> ply_tdy.separate("date", into=["month", "day", "year"])
        >> ply_tdy.separate("doc_id", into=["doc_id", "version"])
    )
    preprint_dates_df >> ply.call(
        ".to_csv", "output/all_medrxiv_biorxiv_timestamps.tsv", sep="\t", index=False
    )
else:
    preprint_dates_df = pd.read_csv(
        "output/all_medrxiv_biorxiv_timestamps.tsv", sep="\t", keep_default_na=False
    )
preprint_dates_df >> ply.slice_rows(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,month,day,year,repository,doc_id,version,attribute
0,12,13,2021,biorxiv,472104,v1,accepted
1,4,4,2021,biorxiv,438107,v1,accepted
2,7,5,2021,biorxiv,435952,v1,accepted
3,4,22,2021,biorxiv,440735,v1,accepted
4,9,19,2019,biorxiv,775270,v1,accepted
5,8,10,2020,biorxiv,242404,v1,accepted
6,7,11,2020,biorxiv,872994,v1,accepted
7,4,24,2017,biorxiv,80853,v2,accepted
8,10,21,2021,biorxiv,418780,v4,accepted
9,12,16,2021,biorxiv,472819,v1,accepted


## Filter each preprint to the latest version

In [5]:
preprint_dates_df.sort_values("year").year.value_counts()

2020    69737
2021    67394
2019    36984
2018    13029
2022    10204
2017     7673
2016     3248
2015     1392
2014      647
2013       73
Name: year, dtype: int64

In [6]:
latest_version_df = (
    (
        preprint_dates_df
        >> ply.arrange("doc_id", "version")
        >> ply.call(".groupby", "doc_id")
    ).agg(
        {
            "version": "last",
            "month": "last",
            "day": "last",
            "year": "last",
            "repository": "last",
        }
    )
    >> ply.call(".reset_index")
    >> ply.define(
        doc_id=lambda x: x.doc_id.apply(
            lambda y: "%06d" % (int(y)) if type(y) == int else y
        )
    )
    >> ply_tdy.unite("doc_id_version", "doc_id", "version", sep="_")
    >> ply.rename(doc_id="doc_id_version")
)
latest_version_df >> ply.slice_rows(10)

Unnamed: 0,doc_id,month,day,year,repository
0,000042_v2,11,7,2013,biorxiv
1,000067_v1,10,19,2013,biorxiv
2,000109_v1,7,2,2020,biorxiv
3,000133_v1,8,11,2020,biorxiv
4,000141_v1,3,22,2020,biorxiv
5,000174_v1,3,20,2020,biorxiv
6,000240_v2,9,1,2020,biorxiv
7,000257_v4,10,16,2014,biorxiv
8,000265_v1,3,20,2020,biorxiv
9,000299_v1,3,20,2020,biorxiv


## Perform the actual conversion to BioCXML format

In [7]:
filter_tag_list = [
    "sc",
    "italic",
    "sub",
    "inline-formula",
    "disp-formula",
    "bold",
    "tr",
    "td",
]

In [8]:
for doc, year, repository in tqdm.tqdm(
    latest_version_df >> ply.pull(["doc_id", "year", "repository"])
):

    doc_folder = medrxiv_folder if repository == "medrxiv" else biorxiv_folder

    parser = ET.XMLParser(encoding="UTF-8", recover=True, remove_blank_text=True)
    tree = ET.parse(open(f"{doc_folder}/{doc}.xml", "rb"), parser=parser)
    ET.strip_tags(tree, *filter_tag_list)

    converted_tree = convert_to_bioc(tree, repository=repository)
    output_folder = Path(f"output/converted_docs/{year}")
    output_folder.mkdir(exist_ok=True, parents=True)
    ET.ElementTree(converted_tree).write(
        f"{output_folder}/{doc}.{doc_folder.stem}.bioc.xml",
        pretty_print=True,
        method="c14n",
    )

100%|██████████| 168038/168038 [11:59<00:00, 233.50it/s]
