In [138]:
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone

import numpy as np
import pandas as pd

# Let pandas show up to 100 characters per cell before truncating displayed text.
pd.options.display.max_colwidth = 100

In [101]:
%matplotlib inline
#NB I open a standard set of directories

#Paths

#Get the top path: the parent of the current working directory
top_path = os.path.dirname(os.getcwd())

#Create the path for external data
ext_data = os.path.join(top_path,'data/external')

#Raw path (for html downloads)
raw_data = os.path.join(top_path,'data/raw')

#Processed data (outputs written by this notebook)
proc_data = os.path.join(top_path,'data/processed')

#Figures
fig_path = os.path.join(top_path,'reports/figures')

#Get date for saving files.
#datetime.utcnow() is deprecated and returns a naive datetime; ask for an
#aware UTC timestamp instead. Month/day/year are the same UTC values as before.
today = datetime.now(timezone.utc)

#File-name suffix in month_day_year form, without zero padding
today_str = "_".join(str(part) for part in (today.month, today.day, today.year))

In [102]:
# taken from http://www.austintaylor.io/lxml/python/pandas/xml/dataframe/2016/07/08/convert-xml-to-pandas-dataframe/
class XML2DataFrame:
    """Flatten an XML document into a DataFrame with one row per child of the root.

    Every record (direct child of the root) is collapsed into a single flat
    dict: attributes are stored under their attribute name and element text
    under the element's tag, for the record itself and all of its descendants.

    Because all descendants write into the same dict, a tag or attribute name
    that occurs more than once inside a record keeps only the value seen last.
    Any non-empty text is kept, including whitespace-only text such as the
    newline inside a container element.
    """

    def __init__(self, xml_data):
        """Parse ``xml_data`` and remember the resulting root element."""
        self.root = ET.XML(xml_data)

    def parse_root(self, root):
        """Return a list holding one flattened dict per direct child of ``root``."""
        return list(map(self.parse_element, root))

    def parse_element(self, element, parsed=None):
        """Merge ``element`` and all of its descendants into ``parsed``.

        ``parsed`` is created when omitted; the same dict is threaded through
        the recursion and finally returned.
        """
        record = {} if parsed is None else parsed
        # Attributes first, then the element's own text, then the children.
        record.update(element.attrib)
        text = element.text
        if text:
            record[element.tag] = text
        for descendant in element:
            self.parse_element(descendant, record)
        return record

    def process_data(self):
        """Build the DataFrame from the flattened records under the root."""
        records = self.parse_root(self.root)
        return pd.DataFrame(records)

In [200]:
# Read the file as raw bytes so the XML parser decodes it from the document
# itself, instead of relying on the platform's default text encoding
# (which is not UTF-8 everywhere).
with open(ext_data + '/desc2018', 'rb') as f:
    desc_2018_xml = f.read()

# Flatten every child of the XML root into one DataFrame row.
xml2df = XML2DataFrame(desc_2018_xml)
desc_2018_df = xml2df.process_data()

In [201]:
# Preview the first rows of the freshly parsed descriptor table.
desc_2018_df.head()

Unnamed: 0,Abbreviation,AllowableQualifier,AllowableQualifiersList,Annotation,CASN1Name,Concept,Concept1UI,Concept2UI,ConceptList,ConceptName,...,SortVersion,String,Term,TermList,TermUI,ThesaurusID,ThesaurusIDlist,TreeNumber,TreeNumberList,Year
0,TO,\n,\n,,"4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrro...",\n,M0000001,M0353609,\n,\n,...,,"A23187, Antibiotic",\n,\n,T000003,NLM (1991),\n,D03.633.100.221.173,\n,1990
1,TO,\n,\n,"for use to kill or control insects, use no qualifiers on the insecticide or the insect; appropri...","Phosphorothioic acid, O,O'-(thiodi-4,1-phenylene) O,O,O',O'-tetramethyl ester",\n,M0000002,M0352200,\n,\n,...,,Difos,\n,\n,T000006,UNK (19XX),\n,D02.886.300.692.800,\n,1986
2,ES,\n,\n,,,\n,,,\n,\n,...,,Slaughterhouse,\n,\n,T000010,UNK (19XX),\n,J03.540.020,\n,1974
3,,,,includes acronyms; do not confuse with Publication Type ABBREVIATIONS\n,,\n,M0000004,M0511063,\n,\n,...,,Acronyms as Topic,\n,\n,T701041,NLM (2008),\n,L01.559.598.400.556.131,\n,2007
4,VI,\n,\n,GEN: prefer specifics; abdom muscles = ABDOMINAL MUSCLES but RECTUS ABDOMINIS is available; abdo...,,\n,,,\n,\n,...,,Abdomens,\n,\n,T000012,NLM (1966),\n,A01.923.047,\n,1999


In [203]:
# Keep only the rows that have a tree number. The explicit .copy() makes the
# result an independent frame, so the column assignments in the following
# cells write to it directly instead of to a slice of the unfiltered frame
# (which is what triggers SettingWithCopyWarning).
desc_2018_df = desc_2018_df[desc_2018_df['TreeNumber'].notnull()].copy()

In [204]:
# One list of segments per row, obtained by splitting the tree number on dots
# (e.g. 'D03.633' -> ['D03', '633']).
# Author's note kept from the earlier draft: only the words 'Males' and
# 'Females' have no codes.
code_splits = desc_2018_df['TreeNumber'].str.split('.').tolist()

In [210]:
# Re-join each segment list into its dotted code and record the largest
# number of segments found in any code.
mesh_tree_codes = list(map('.'.join, code_splits))
max_code_length = max(map(len, code_splits))
desc_2018_df['MeshTreeCode'] = mesh_tree_codes

In [211]:
# Largest number of dot-separated segments found in any tree number.
print(max_code_length)

13


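The plan is to truncate the hierarchy at the 5th level. Note that the cells below still expand every level up to `max_code_length`; no truncation is applied yet.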

In [228]:
# tree_0 is the first character of the first segment (e.g. 'D03' -> 'D');
# it is the key later looked up in tree_0_map.
desc_2018_df['tree_0'] = [segments[0][0] for segments in code_splits]

In [224]:
# Inspect the segments of the first tree number.
code_splits[0]

['D03', '633', '100', '221', '173']

In [245]:
# Add one column per hierarchy level: tree_1 holds the first segment, tree_2
# the second, and so on. Codes shallower than a level get NaN there.
for depth in range(max_code_length):
    column = 'tree_{}'.format(depth + 1)
    desc_2018_df[column] = [
        segments[depth] if len(segments) > depth else np.nan
        for segments in code_splits
    ]

In [249]:
# List the columns currently present on the frame.
desc_2018_df.columns

Index(['Abbreviation', 'AllowableQualifier', 'AllowableQualifiersList',
       'Annotation', 'CASN1Name', 'Concept', 'Concept1UI', 'Concept2UI',
       'ConceptList', 'ConceptName', 'ConceptPreferredTermYN',
       'ConceptRelation', 'ConceptRelationList', 'ConceptUI', 'ConsiderAlso',
       'DateCreated', 'DateEstablished', 'DateRevised', 'Day',
       'DescriptorClass', 'DescriptorName', 'DescriptorRecord',
       'DescriptorReferredTo', 'DescriptorUI', 'ECIN', 'ECOUT',
       'EntryCombination', 'EntryCombinationList', 'EntryVersion',
       'HistoryNote', 'IsPermutedTermYN', 'LexicalTag', 'Month',
       'NLMClassificationNumber', 'OnlineNote', 'PharmacologicalAction',
       'PharmacologicalActionList', 'PreferredConceptYN', 'PreviousIndexing',
       'PreviousIndexingList', 'PublicMeSHNote', 'QualifierName',
       'QualifierReferredTo', 'QualifierUI', 'RecordPreferredTermYN',
       'RegistryNumber', 'RelatedRegistryNumber', 'RelatedRegistryNumberList',
       'RelationName', 'S

In [250]:
# Columns removed from the processed table.
columns_to_drop = [
    'AllowableQualifier', 'AllowableQualifiersList', 'TreeNumberList', 'TreeNumber', 'ThesaurusIDlist', 'TermList', 'Term',
    'SortVersion', 'SeeRelatedList', 'SeeRelatedDescriptor', 'RelatedRegistryNumberList', 'RegistryNumber',
    'QualifierReferredTo', 'QualifierName', 'PreviousIndexingList', 'PharmacologicalActionList', 'PharmacologicalAction',
    'EntryCombinationList', 'EntryCombination', 'ECOUT', 'ECIN', 'DescriptorReferredTo', 'DescriptorRecord', 'DescriptorName',
    'DateRevised', 'DateEstablished', 'DateCreated', 'ConsiderAlso', 'ConceptRelationList', 'ConceptRelation',
    'ConceptName', 'ConceptList', 'Concept',
]

# Re-bind the name to a new frame rather than mutating with inplace=True:
# in-place edits on a frame that came from a boolean-mask selection are a
# common source of SettingWithCopyWarning. A missing column still raises
# KeyError, so typos in the list are not silently ignored.
desc_2018_df = desc_2018_df.drop(columns=columns_to_drop)

In [251]:
# Preview the frame after the column drop above.
desc_2018_df.head()

Unnamed: 0,Abbreviation,Annotation,CASN1Name,Concept1UI,Concept2UI,ConceptPreferredTermYN,ConceptUI,Day,DescriptorClass,DescriptorUI,...,tree_5,tree_6,tree_7,tree_8,tree_9,tree_10,tree_11,tree_12,tree_13,tree_14
0,TO,,"4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrro...",M0000001,M0353609,N,M0353609,8,1,D061207,...,173.0,,,,,,,,,
1,TO,"for use to kill or control insects, use no qualifiers on the insecticide or the insect; appropri...","Phosphorothioic acid, O,O'-(thiodi-4,1-phenylene) O,O,O',O'-tetramethyl ester",M0000002,M0352200,Y,M0352200,7,1,D007306,...,800.0,,,,,,,,,
2,ES,,,,,N,M0000003,29,1,D000003,...,,,,,,,,,,
3,,includes acronyms; do not confuse with Publication Type ABBREVIATIONS\n,,M0000004,M0511063,Y,M0511063,29,1,D000004,...,556.0,131.0,,,,,,,,
4,VI,GEN: prefer specifics; abdom muscles = ABDOMINAL MUSCLES but RECTUS ABDOMINIS is available; abdo...,,,,N,M0000005,1,1,D000007,...,,,,,,,,,,


In [124]:
# from https://meshb.nlm.nih.gov/treeView
# Category name for each leading letter stored in the tree_0 column.
tree_0_map = dict(
    A='anatomy',
    B='organisms',
    C='diseases',
    D='chemicals and drugs',
    E='analytical, diagnostic, and therapeutic techniques, and equipment',
    F='psychiatry and psychology',
    G='phenomena and processes',
    H='disciplines and occupations',
    I='anthropology, education, sociology, and social phenomena',
    J='technology, industry, and agriculture',
    K='humanities',
    L='information science',
    M='named groups',
    N='health care',
    V='publication characteristics',
    Z='geographicals',
)

In [252]:
# Translate the category letter in tree_0 into its name via tree_0_map;
# letters that are not keys of the map become NaN.
desc_2018_df['tree_0_string'] = desc_2018_df['tree_0'].map(tree_0_map)

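Some of the original strings are written in inverted order around commas (e.g. "A23187, Antibiotic"). To make them easier to match against document text, we put them back into natural word order and lower-case them (e.g. "antibiotic a23187").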

In [254]:
def comma_rearrange(string):
    """Undo comma inversion and lower-case the result.

    The text is split on ', ' (comma followed by a space), the pieces are put
    in reverse order and joined with single spaces, e.g.
    'A23187, Antibiotic' -> 'antibiotic a23187'. Text without ', ' is only
    lower-cased.
    """
    pieces = string.split(', ')
    return ' '.join(reversed(pieces)).lower()

In [255]:
# Lower-cased String with any comma inversion undone.
desc_2018_df['StringProcessed'] = desc_2018_df['String'].apply(comma_rearrange)

In [275]:
# For every level i, add `tree_<i>_string`: the processed name of the row whose
# own code ends exactly at this row's level-i ancestor.
#
# Names are looked up by the FULL dotted prefix down to level i (e.g.
# 'D03.633'), not by the level-i segment alone. A segment is only meaningful
# relative to its parent, so keying on the bare segment lets rows from
# unrelated branches that share a segment value overwrite each other's names.
prefix = None
for i in range(1, max_code_length):
    level = desc_2018_df['tree_{}'.format(i)]
    # Extend the running prefix by this level's segment. str.cat yields NaN
    # whenever either side is NaN, so rows shallower than level i stay NaN.
    prefix = level if prefix is None else prefix.str.cat(level, sep='.')
    # Rows that reach level i but have nothing at level i + 1: their code is
    # exactly this prefix, so their name labels the node.
    ends_here = prefix.notnull() & desc_2018_df['tree_{}'.format(i + 1)].isnull()
    tree_name_map = dict(zip(prefix[ends_here], desc_2018_df.loc[ends_here, 'StringProcessed']))
    # Prefixes with no matching row, and NaN prefixes, map to NaN.
    desc_2018_df['tree_{}_string'.format(i)] = prefix.map(tree_name_map, na_action='ignore')

In [280]:
# Write the processed table into the processed-data folder; the file name is
# suffixed with today_str. The DataFrame index is written as the first column.
desc_2018_df.to_csv(proc_data + '/mesh_codes_processed_{}.csv'.format(today_str))

In [None]:
# Read back the file written by the previous cell as a quick check of the
# export. The original line (`proc_data + /`) was an unfinished expression and
# a syntax error; the target file is assumed to be that export. TODO confirm.
# index_col=0 restores the index that to_csv wrote as the first column.
pd.read_csv(proc_data + '/mesh_codes_processed_{}.csv'.format(today_str), index_col=0).head()