# The Unconquerables of Open Access

## Merge of journals with PubMed references

Project for the EAHIL conference 2023 : https://eahil2023.org/
Authors : **Floriane Muller & Pablo Iriarte**, University of Geneva  
Last update : 03.05.2023

The purpose of this notebook is to import all the publication metadata from PubMed that is useful for the study.


### Sources

* Journal information obtained with notebooks 1-6
* PubMed references obtained from the NLM FTP, following the information available here: https://www.nlm.nih.gov/databases/download/pubmed_medline.html

The whole database is downloadable in the form of compressed XML files (pubmed23n[0001-1166].xml.gz). At the time of our study (03.05.2023) we obtained 1166 gz files (38 GB in total).

The elements of XML files are described here : https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html

## MEDLINE metadata extraction

* Metadata elements description: https://www.nlm.nih.gov/bsd/licensee/elements_alphabetical.html


### Choice of fields to extract

 * PMID: PubmedArticleSet/PubmedArticle/MedlineCitation/PMID
 * DOI: PubmedArticleSet/PubmedArticle/PubmedData/ArticleIdList/ArticleId@IdType="doi"
 * Publication date: PubmedArticleSet/PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/PubDate/Year (and PubDate/MedlineDate, which is exported alongside it)
 * Journal ID: PubmedArticleSet/PubmedArticle/MedlineCitation/MedlineJournalInfo/NlmUniqueID

### Extract content using XPATH and export in TSV format

In [1]:
import codecs
import glob
from lxml import etree
import pandas as pd
# Display the full content of rows (non truncated). None means "no limit";
# the former sentinel -1 was deprecated in pandas 1.0 and is no longer
# accepted by recent pandas releases.
pd.set_option('display.max_colwidth', None)
# display all the columns
pd.set_option('display.max_columns', None)

In [2]:
# Export PMID, Year, MedlineDate, DOI and NlmUniqueID of every PubmedArticle
# found in the PubMed baseline XML files to one TSV file per XML file.

# Folder holding the compressed baseline XML files (*.xml.gz)
myfolderin = 'E:/data_sources/nlm/extractions_xml/pubmed_xml_baseline_20230503/baseline/'

# Folder receiving the TSV files (one per XML file)
myfolderout = 'data/sources/pubmed/'

# Resume point: files whose 4-digit sequence number is <= this value are
# skipped. Keep 0 to process every file; raise it to restart after an error
# without redoing the files that were already exported.
skip_up_to = 0

# Header line of each TSV file; data rows follow the same column order
tsv_header = 'PMID\tYear\tMedlineDate\tDOI\tNlmUniqueID\n'


def node_text(node, path):
    """Return the text of the first element matching *path* below *node*.

    An empty string is returned when no element matches or when the matching
    element has no text, so the result can always be written to the TSV.
    """
    found = node.find(path)
    if found is None or found.text is None:
        return ''
    return found.text


def citation_row(citation):
    """Return (pmid, year, medline_date, doi, journal_id) for one PubmedArticle.

    Every value is a string; a field that is absent from the record is ''.
    """
    pub_date = 'MedlineCitation/Article/Journal/JournalIssue/PubDate/'
    return (
        node_text(citation, 'MedlineCitation/PMID'),
        node_text(citation, pub_date + 'Year'),
        node_text(citation, pub_date + 'MedlineDate'),
        node_text(citation, 'PubmedData/ArticleIdList/ArticleId[@IdType="doi"]'),
        node_text(citation, 'MedlineCitation/MedlineJournalInfo/NlmUniqueID'),
    )


def export_citations(xml_path, tsv_path):
    """Parse *xml_path* and write one TSV line per PubmedArticle to *tsv_path*.

    The XML is parsed before the output file is created, so a file that fails
    to parse does not leave a header-only TSV behind.
    """
    # the .gz path is handed to etree.parse as-is
    tree = etree.parse(xml_path)

    # newline='\n' keeps the line endings exactly as written (no translation
    # to the platform default); the with-block closes the file even if
    # writing fails part-way.
    with open(tsv_path, 'w', encoding='utf-8', newline='\n') as out:
        out.write(tsv_header)
        for citation in tree.xpath('/PubmedArticleSet/PubmedArticle'):
            # NOTE(review): each data row ends with a trailing tab, so rows
            # carry one more (empty) field than the header. Kept unchanged so
            # existing readers of these files keep working - confirm whether
            # it can be dropped.
            out.write('\t'.join(citation_row(citation)) + '\t\n')


# select files to include: every baseline file, in file-name order
# (glob itself does not guarantee any order)
all_files = sorted(glob.glob(myfolderin + "pubmed23*.gz"))

# loop over the XML files
for file_ in all_files:
    # File names look like pubmed23n0001.xml.gz (20 characters):
    #   file_[-11:-7] -> 4-digit sequence number
    #   file_[-20:]   -> file name without the folder
    #   file_[-20:-7] -> file name without folder and '.xml.gz'
    if int(file_[-11:-7]) <= skip_up_to:
        continue
    print(file_[-20:])
    export_citations(file_, myfolderout + file_[-20:-7] + '.tsv')

pubmed23n0001.xml.gz
pubmed23n0002.xml.gz
pubmed23n0003.xml.gz
pubmed23n0004.xml.gz
pubmed23n0005.xml.gz
pubmed23n0006.xml.gz
pubmed23n0007.xml.gz
pubmed23n0008.xml.gz
pubmed23n0009.xml.gz
pubmed23n0010.xml.gz
pubmed23n0011.xml.gz
pubmed23n0012.xml.gz
pubmed23n0013.xml.gz
pubmed23n0014.xml.gz
pubmed23n0015.xml.gz
pubmed23n0016.xml.gz
pubmed23n0017.xml.gz
pubmed23n0018.xml.gz
pubmed23n0019.xml.gz
pubmed23n0020.xml.gz
pubmed23n0021.xml.gz
pubmed23n0022.xml.gz
pubmed23n0023.xml.gz
pubmed23n0024.xml.gz
pubmed23n0025.xml.gz
pubmed23n0026.xml.gz
pubmed23n0027.xml.gz
pubmed23n0028.xml.gz
pubmed23n0029.xml.gz
pubmed23n0030.xml.gz
pubmed23n0031.xml.gz
pubmed23n0032.xml.gz
pubmed23n0033.xml.gz
pubmed23n0034.xml.gz
pubmed23n0035.xml.gz
pubmed23n0036.xml.gz
pubmed23n0037.xml.gz
pubmed23n0038.xml.gz
pubmed23n0039.xml.gz
pubmed23n0040.xml.gz
pubmed23n0041.xml.gz
pubmed23n0042.xml.gz
pubmed23n0043.xml.gz
pubmed23n0044.xml.gz
pubmed23n0045.xml.gz
pubmed23n0046.xml.gz
pubmed23n0047.xml.gz
pubmed23n0048

pubmed23n0392.xml.gz
pubmed23n0393.xml.gz
pubmed23n0394.xml.gz
pubmed23n0395.xml.gz
pubmed23n0396.xml.gz
pubmed23n0397.xml.gz
pubmed23n0398.xml.gz
pubmed23n0399.xml.gz
pubmed23n0400.xml.gz
pubmed23n0401.xml.gz
pubmed23n0402.xml.gz
pubmed23n0403.xml.gz
pubmed23n0404.xml.gz
pubmed23n0405.xml.gz
pubmed23n0406.xml.gz
pubmed23n0407.xml.gz
pubmed23n0408.xml.gz
pubmed23n0409.xml.gz
pubmed23n0410.xml.gz
pubmed23n0411.xml.gz
pubmed23n0412.xml.gz
pubmed23n0413.xml.gz
pubmed23n0414.xml.gz
pubmed23n0415.xml.gz
pubmed23n0416.xml.gz
pubmed23n0417.xml.gz
pubmed23n0418.xml.gz
pubmed23n0419.xml.gz
pubmed23n0420.xml.gz
pubmed23n0421.xml.gz
pubmed23n0422.xml.gz
pubmed23n0423.xml.gz
pubmed23n0424.xml.gz
pubmed23n0425.xml.gz
pubmed23n0426.xml.gz
pubmed23n0427.xml.gz
pubmed23n0428.xml.gz
pubmed23n0429.xml.gz
pubmed23n0430.xml.gz
pubmed23n0431.xml.gz
pubmed23n0432.xml.gz
pubmed23n0433.xml.gz
pubmed23n0434.xml.gz
pubmed23n0435.xml.gz
pubmed23n0436.xml.gz
pubmed23n0437.xml.gz
pubmed23n0438.xml.gz
pubmed23n0439

pubmed23n0783.xml.gz
pubmed23n0784.xml.gz
pubmed23n0785.xml.gz
pubmed23n0786.xml.gz
pubmed23n0787.xml.gz
pubmed23n0788.xml.gz
pubmed23n0789.xml.gz
pubmed23n0790.xml.gz
pubmed23n0791.xml.gz
pubmed23n0792.xml.gz
pubmed23n0793.xml.gz
pubmed23n0794.xml.gz
pubmed23n0795.xml.gz
pubmed23n0796.xml.gz
pubmed23n0797.xml.gz
pubmed23n0798.xml.gz
pubmed23n0799.xml.gz
pubmed23n0800.xml.gz
pubmed23n0801.xml.gz
pubmed23n0802.xml.gz
pubmed23n0803.xml.gz
pubmed23n0804.xml.gz
pubmed23n0805.xml.gz
pubmed23n0806.xml.gz
pubmed23n0807.xml.gz
pubmed23n0808.xml.gz
pubmed23n0809.xml.gz
pubmed23n0810.xml.gz
pubmed23n0811.xml.gz
pubmed23n0812.xml.gz
pubmed23n0813.xml.gz
pubmed23n0814.xml.gz
pubmed23n0815.xml.gz
pubmed23n0816.xml.gz
pubmed23n0817.xml.gz
pubmed23n0818.xml.gz
pubmed23n0819.xml.gz
pubmed23n0820.xml.gz
pubmed23n0821.xml.gz
pubmed23n0822.xml.gz
pubmed23n0823.xml.gz
pubmed23n0824.xml.gz
pubmed23n0825.xml.gz
pubmed23n0826.xml.gz
pubmed23n0827.xml.gz
pubmed23n0828.xml.gz
pubmed23n0829.xml.gz
pubmed23n0830