In [1]:
import gzip
import os
import xml.etree
import xml.etree.ElementTree

import networkx
import pandas

## Information on MeSH XML Files

+ http://www.nlm.nih.gov/mesh/xmlmesh.html
+ http://www.nlm.nih.gov/mesh/xml_data_elements.html

In [2]:
# Download xml file
# --timestamping makes wget skip the transfer unless the remote file is newer
# than the local copy; --directory-prefix stores the file under download/.
# Assigning the `!` shell call to `capture` collects wget's output lines in a
# variable instead of printing them below the cell.
capture = !wget --timestamping --directory-prefix download/ ftp://nlmpubs.nlm.nih.gov/online/mesh/.xmlmesh/supp2015.gz

In [3]:
# Parse xml file as an ElementTree
# NOTE: `xml.etree.ElementTree` is a submodule and has to be imported
# explicitly (`import xml.etree.ElementTree`); `import xml.etree` alone does
# not load it, and relying on another package having imported it is fragile.
xml_path = os.path.join('download', 'supp2015.gz')
# Open the archive in binary mode (gzip's default, spelled out here) and let
# the XML parser read the decompressed stream directly.
with gzip.open(xml_path, 'rb') as xml_file:
    tree = xml.etree.ElementTree.parse(xml_file)
# `root` is the document element that the extraction cells iterate over.
root = tree.getroot()

In [4]:
# Extract records and save as tsv
# One row per SupplementalRecord element: its UI, its name string and its
# SCRClass attribute.
columns = ['SupplementalRecordUI', 'SupplementalRecordName', 'SCRClass']

record_dicts = list()
# Select records by tag (as the term extraction does) so that any other child
# of the root cannot produce an all-empty row.
for record in root.findall('SupplementalRecord'):
    record_dict = {
        'SCRClass': record.get('SCRClass'),
        'SupplementalRecordUI': record.findtext('SupplementalRecordUI'),
        'SupplementalRecordName': record.findtext('SupplementalRecordName/String'),
    }
    record_dicts.append(record_dict)

# Passing `columns` to the constructor fixes the column order and still yields
# a header-only frame when no records were found (indexing an empty frame with
# [columns] would raise KeyError).
record_df = pandas.DataFrame(record_dicts, columns=columns)

# to_csv does not create missing directories, so make sure data/ exists.
os.makedirs('data', exist_ok=True)
record_df.to_csv('data/supplemental-records.tsv', index=False, sep='\t')

In [5]:
# Extract terms and save as tsv
# One row per Term, carrying the identifiers of its enclosing Concept and
# SupplementalRecord plus the XML attributes of the Concept and Term elements.
columns = ['SupplementalRecordUI', 'ConceptUI', 'PreferredConceptYN', 'TermUI', 'TermName',
           'ConceptPreferredTermYN', 'IsPermutedTermYN', 'LexicalTag', 'PrintFlagYN', 'RecordPreferredTermYN']

term_dicts = list()
for record in root.findall('SupplementalRecord'):
    # Identifiers are looked up once per record / concept rather than per term.
    record_ui = record.findtext('SupplementalRecordUI')
    for concept in record.findall('ConceptList/Concept'):
        concept_ui = concept.findtext('ConceptUI')
        for term in concept.findall('TermList/Term'):
            term_dict = {
                'SupplementalRecordUI': record_ui,
                'ConceptUI': concept_ui,
                'TermUI': term.findtext('TermUI'),
                'TermName': term.findtext('String')
            }
            # Element attributes become columns; term attributes are applied
            # last, so they win if a name is shared with the concept.
            term_dict.update(concept.attrib)
            term_dict.update(term.attrib)
            term_dicts.append(term_dict)

# Passing `columns` to the constructor selects and orders the columns, and
# fills a column with missing values when no term carried that attribute
# (indexing with [columns] would raise KeyError in that case, or when no
# terms were found at all).
term_df = pandas.DataFrame(term_dicts, columns=columns)

# to_csv does not create missing directories, so make sure data/ exists.
os.makedirs('data', exist_ok=True)
term_df.to_csv('data/supplemental-terms.tsv', index=False, sep='\t')