MeSH Codes
==========

This notebook parses and cleans the health and medical terms from the NIH Medical Subject Headings. The original files can be found on their ftp site [here](ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/xmlmesh/).

In [1]:
import os
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

from datetime import datetime

pd.options.display.max_colwidth = 100
pd.options.display.max_columns = 999

In [2]:
%matplotlib inline
#NB I open a standard set of directories

#Paths

#Get the top path
top_path = os.path.dirname(os.getcwd())

#Create the path for external data
ext_data = os.path.join(top_path,'data/external')

#Raw path (for html downloads)

raw_data = os.path.join(top_path,'data/raw')

#And external data
proc_data = os.path.join(top_path,'data/processed')

fig_path = os.path.join(top_path,'reports/figures')

#Get date for saving files
today = datetime.utcnow()

today_str = "_".join([str(x) for x in [today.month, today.day, today.year]])

In [172]:
# Adapted from 
# http://www.austintaylor.io/lxml/python/pandas/xml/dataframe/2016/07/08/convert-xml-to-pandas-dataframe/
# The original did not account for structures where the last children shared names but not parents as 
# occurs in this dataset. This gives messier names, but all the information.

class XML2DataFrame:

    def __init__(self, xml_data):
#         parser = ET.XMLParser(encoding="utf-8")
#         self.root = ET.fromstring(xml_data, parser=parser)
        self.root = ET.XML(xml_data)

    def parse_root(self, root):
        return [self.parse_element(child, 'Root') for child in iter(root)]

    def parse_element(self, element, parent_name, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[parent_name + key] = element.attrib.get(key)
        if element.text:
            h_key = parent_name + element.tag
#             if h_key in parsed:
#                 h_key = h_key + '_1'
            parsed[h_key] = element.text
        for child in list(element):
            self.parse_element(child, element.tag, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [173]:
with open(ext_data + '/desc2018.xml', 'r') as f:
    desc_2018_xml = f.read()

xml2df = XML2DataFrame(desc_2018_xml)
desc_2018_df = xml2df.process_data()

In [174]:
desc_2018_df.head()

Unnamed: 0,AllowableQualifierAbbreviation,AllowableQualifierQualifierReferredTo,AllowableQualifiersListAllowableQualifier,ConceptCASN1Name,ConceptConceptName,ConceptConceptRelationList,ConceptConceptUI,ConceptListConcept,ConceptListPreferredConceptYN,ConceptNameString,ConceptRegistryNumber,ConceptRelatedRegistryNumberList,ConceptRelationConcept1UI,ConceptRelationConcept2UI,ConceptRelationListConceptRelation,ConceptRelationListRelationName,ConceptScopeNote,ConceptTermList,DateCreatedDay,DateCreatedMonth,DateCreatedYear,DateEstablishedDay,DateEstablishedMonth,DateEstablishedYear,DateRevisedDay,DateRevisedMonth,DateRevisedYear,DescriptorNameString,DescriptorRecordAllowableQualifiersList,DescriptorRecordAnnotation,DescriptorRecordConceptList,DescriptorRecordConsiderAlso,DescriptorRecordDateCreated,DescriptorRecordDateEstablished,DescriptorRecordDateRevised,DescriptorRecordDescriptorName,DescriptorRecordDescriptorUI,DescriptorRecordEntryCombinationList,DescriptorRecordHistoryNote,DescriptorRecordNLMClassificationNumber,DescriptorRecordOnlineNote,DescriptorRecordPharmacologicalActionList,DescriptorRecordPreviousIndexingList,DescriptorRecordPublicMeSHNote,DescriptorRecordSeeRelatedList,DescriptorRecordTreeNumberList,DescriptorReferredToDescriptorName,DescriptorReferredToDescriptorUI,ECINDescriptorReferredTo,ECINQualifierReferredTo,ECOUTDescriptorReferredTo,ECOUTQualifierReferredTo,EntryCombinationECIN,EntryCombinationECOUT,EntryCombinationListEntryCombination,PharmacologicalActionDescriptorReferredTo,PharmacologicalActionListPharmacologicalAction,PreviousIndexingListPreviousIndexing,QualifierNameString,QualifierReferredToQualifierName,QualifierReferredToQualifierUI,RelatedRegistryNumberListRelatedRegistryNumber,RootDescriptorClass,RootDescriptorRecord,SeeRelatedDescriptorDescriptorReferredTo,SeeRelatedListSeeRelatedDescriptor,TermDateCreated,TermEntryVersion,TermListConceptPreferredTermYN,TermListIsPermutedTermYN,TermListLexicalTag,TermListRecordPreferredTermYN,TermListTerm,TermSortVersion,TermString,TermTermUI,TermThesaurusIDlist,ThesaurusIDlistThesaurusID,TreeNumberListTreeNumber
0,TO,\n,\n,"4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrro...",\n,\n,M0353609,\n,N,A-23187,0.0,\n,M0000001,M0353609,\n,NRW,"An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports C...",\n,8,3,1990,1,1,1984,27,5,2016,Calcium Ionophores,\n,,\n,,\n,\n,\n,\n,D000001,,91(75); was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)\n,,use CALCIMYCIN to search A 23187 1975-90\n,\n,\n,91; was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)\n,,\n,\n,D061207,,,,,,,,\n,\n,Carboxylic Acids (1973-1974),toxicity,\n,Q000633,52665-69-7 (Calcimycin),1,\n,,,\n,,N,Y,NON,N,\n,,"A23187, Antibiotic",T000003,\n,NLM (1991),D03.633.100.221.173
1,TO,\n,\n,"Phosphorothioic acid, O,O'-(thiodi-4,1-phenylene) O,O,O',O'-tetramethyl ester",\n,\n,M0352200,\n,N,Difos,0.0,\n,M0000002,M0352200,\n,NRW,An organothiophosphate insecticide.\n,\n,7,10,1986,1,1,1991,8,7,2013,Insecticides,\n,"for use to kill or control insects, use no qualifiers on the insecticide or the insect; appropri...",\n,,\n,\n,\n,\n,D000002,,"96; was ABATE 1972-95 (see under INSECTICIDES, ORGANOTHIOPHOSPHATE 1972-90)\n",,,\n,\n,"96; was ABATE 1972-95 (see under INSECTICIDES, ORGANOTHIOPHOSPHATE 1972-90)\n",,\n,\n,D007306,,,,,,,,\n,\n,Insecticides (1966-1971),toxicity,\n,Q000633,3383-96-8 (Temefos),1,\n,,,\n,,Y,N,TRD,N,\n,,Difos,T000006,\n,UNK (19XX),D02.886.300.692.800
2,ES,\n,\n,,\n,,M0000003,\n,Y,Abattoirs,,,,,,,Places where animals are slaughtered and dressed for market.\n,\n,29,3,1974,1,1,1966,8,6,2016,Abattoirs,\n,,\n,,\n,\n,\n,\n,D000003,,,WA 707,,,,,,\n,,,,,,,,,,,,,ethics,\n,Q000941,,1,\n,,,\n,,N,Y,NON,N,\n,,Slaughterhouse,T000010,\n,UNK (19XX),J03.540.020
3,,,,,\n,\n,M0511063,\n,N,Acronyms as Topic,,,M0000004,M0511063,\n,NRW,Works about shortened forms of written words or phrases used for brevity.\n,\n,29,6,2007,1,1,1960,30,6,2017,Abbreviations as Topic,,includes acronyms; do not confuse with Publication Type ABBREVIATIONS\n,\n,,\n,\n,\n,\n,D000004,,2008(1963)\n,,,,,2008; see ABBREVIATIONS 1963-2007\n,,\n,,,,,,,,,,,,,,,,,1,\n,,,\n,,Y,N,NON,N,\n,,Acronyms as Topic,T701041,\n,NLM (2008),L01.559.598.400.556.131
4,VI,\n,\n,,\n,,M0000005,\n,Y,Abdomen,,,,,,,That portion of the body that lies between the THORAX and the PELVIS.\n,\n,1,1,1999,1,1,1966,9,8,2016,Abdominal Injuries,\n,GEN: prefer specifics; abdom muscles = ABDOMINAL MUSCLES but RECTUS ABDOMINIS is available; abdo...,\n,,\n,\n,\n,\n,D000005,\n,,,,,,,,\n,\n,D000007,\n,\n,\n,,\n,\n,\n,,,,injuries,\n,Q000293,,1,\n,,,\n,,N,Y,NON,N,\n,,Abdomens,T000012,\n,NLM (1966),A01.923.047


In [175]:
desc_2018_df.columns

Index(['AllowableQualifierAbbreviation',
       'AllowableQualifierQualifierReferredTo',
       'AllowableQualifiersListAllowableQualifier', 'ConceptCASN1Name',
       'ConceptConceptName', 'ConceptConceptRelationList', 'ConceptConceptUI',
       'ConceptListConcept', 'ConceptListPreferredConceptYN',
       'ConceptNameString', 'ConceptRegistryNumber',
       'ConceptRelatedRegistryNumberList', 'ConceptRelationConcept1UI',
       'ConceptRelationConcept2UI', 'ConceptRelationListConceptRelation',
       'ConceptRelationListRelationName', 'ConceptScopeNote',
       'ConceptTermList', 'DateCreatedDay', 'DateCreatedMonth',
       'DateCreatedYear', 'DateEstablishedDay', 'DateEstablishedMonth',
       'DateEstablishedYear', 'DateRevisedDay', 'DateRevisedMonth',
       'DateRevisedYear', 'DescriptorNameString',
       'DescriptorRecordAllowableQualifiersList', 'DescriptorRecordAnnotation',
       'DescriptorRecordConceptList', 'DescriptorRecordConsiderAlso',
       'DescriptorRecordDateCreat

In [176]:
desc_2018_df.head(1)

Unnamed: 0,AllowableQualifierAbbreviation,AllowableQualifierQualifierReferredTo,AllowableQualifiersListAllowableQualifier,ConceptCASN1Name,ConceptConceptName,ConceptConceptRelationList,ConceptConceptUI,ConceptListConcept,ConceptListPreferredConceptYN,ConceptNameString,ConceptRegistryNumber,ConceptRelatedRegistryNumberList,ConceptRelationConcept1UI,ConceptRelationConcept2UI,ConceptRelationListConceptRelation,ConceptRelationListRelationName,ConceptScopeNote,ConceptTermList,DateCreatedDay,DateCreatedMonth,DateCreatedYear,DateEstablishedDay,DateEstablishedMonth,DateEstablishedYear,DateRevisedDay,DateRevisedMonth,DateRevisedYear,DescriptorNameString,DescriptorRecordAllowableQualifiersList,DescriptorRecordAnnotation,DescriptorRecordConceptList,DescriptorRecordConsiderAlso,DescriptorRecordDateCreated,DescriptorRecordDateEstablished,DescriptorRecordDateRevised,DescriptorRecordDescriptorName,DescriptorRecordDescriptorUI,DescriptorRecordEntryCombinationList,DescriptorRecordHistoryNote,DescriptorRecordNLMClassificationNumber,DescriptorRecordOnlineNote,DescriptorRecordPharmacologicalActionList,DescriptorRecordPreviousIndexingList,DescriptorRecordPublicMeSHNote,DescriptorRecordSeeRelatedList,DescriptorRecordTreeNumberList,DescriptorReferredToDescriptorName,DescriptorReferredToDescriptorUI,ECINDescriptorReferredTo,ECINQualifierReferredTo,ECOUTDescriptorReferredTo,ECOUTQualifierReferredTo,EntryCombinationECIN,EntryCombinationECOUT,EntryCombinationListEntryCombination,PharmacologicalActionDescriptorReferredTo,PharmacologicalActionListPharmacologicalAction,PreviousIndexingListPreviousIndexing,QualifierNameString,QualifierReferredToQualifierName,QualifierReferredToQualifierUI,RelatedRegistryNumberListRelatedRegistryNumber,RootDescriptorClass,RootDescriptorRecord,SeeRelatedDescriptorDescriptorReferredTo,SeeRelatedListSeeRelatedDescriptor,TermDateCreated,TermEntryVersion,TermListConceptPreferredTermYN,TermListIsPermutedTermYN,TermListLexicalTag,TermListRecordPreferredTermYN,TermListTerm,TermSortVersion,TermString,TermTermUI,TermThesaurusIDlist,ThesaurusIDlistThesaurusID,TreeNumberListTreeNumber
0,TO,\n,\n,"4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrro...",\n,\n,M0353609,\n,N,A-23187,0,\n,M0000001,M0353609,\n,NRW,"An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports C...",\n,8,3,1990,1,1,1984,27,5,2016,Calcium Ionophores,\n,,\n,,\n,\n,\n,\n,D000001,,91(75); was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)\n,,use CALCIMYCIN to search A 23187 1975-90\n,\n,\n,91; was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)\n,,\n,\n,D061207,,,,,,,,\n,\n,Carboxylic Acids (1973-1974),toxicity,\n,Q000633,52665-69-7 (Calcimycin),1,\n,,,\n,,N,Y,NON,N,\n,,"A23187, Antibiotic",T000003,\n,NLM (1991),D03.633.100.221.173


In [177]:
desc_2018_df.drop([
       'AllowableQualifierQualifierReferredTo',
       'AllowableQualifiersListAllowableQualifier',
       'ConceptConceptName', 'ConceptConceptRelationList',
       'ConceptListConcept',
       'ConceptRelatedRegistryNumberList', 'ConceptRelationListConceptRelation',
       'DescriptorRecordAllowableQualifiersList',
       'DescriptorRecordConceptList', 
       'DescriptorRecordDateCreated', 'DescriptorRecordDateEstablished',
       'DescriptorRecordDateRevised', 'DescriptorRecordDescriptorName',
       'DescriptorRecordPharmacologicalActionList',
       'DescriptorRecordPreviousIndexingList',
       'DescriptorRecordTreeNumberList', 'DescriptorReferredToDescriptorName',
       'PharmacologicalActionDescriptorReferredTo',
       'PharmacologicalActionListPharmacologicalAction',
       'QualifierReferredToQualifierName',
       'RootDescriptorRecord',
       'TermDateCreated',
       'TermListTerm',
       'TermThesaurusIDlist','ECINDescriptorReferredTo',
       'ECINQualifierReferredTo',
       'ECOUTDescriptorReferredTo',
       'ECOUTQualifierReferredTo',
       'EntryCombinationECIN',
       'EntryCombinationECOUT'],
        axis=1, inplace=True)

In [178]:
desc_2018_df.head(1)

Unnamed: 0,AllowableQualifierAbbreviation,ConceptCASN1Name,ConceptConceptUI,ConceptListPreferredConceptYN,ConceptNameString,ConceptRegistryNumber,ConceptRelationConcept1UI,ConceptRelationConcept2UI,ConceptRelationListRelationName,ConceptScopeNote,ConceptTermList,DateCreatedDay,DateCreatedMonth,DateCreatedYear,DateEstablishedDay,DateEstablishedMonth,DateEstablishedYear,DateRevisedDay,DateRevisedMonth,DateRevisedYear,DescriptorNameString,DescriptorRecordAnnotation,DescriptorRecordConsiderAlso,DescriptorRecordDescriptorUI,DescriptorRecordEntryCombinationList,DescriptorRecordHistoryNote,DescriptorRecordNLMClassificationNumber,DescriptorRecordOnlineNote,DescriptorRecordPublicMeSHNote,DescriptorRecordSeeRelatedList,DescriptorReferredToDescriptorUI,EntryCombinationListEntryCombination,PreviousIndexingListPreviousIndexing,QualifierNameString,QualifierReferredToQualifierUI,RelatedRegistryNumberListRelatedRegistryNumber,RootDescriptorClass,SeeRelatedDescriptorDescriptorReferredTo,SeeRelatedListSeeRelatedDescriptor,TermEntryVersion,TermListConceptPreferredTermYN,TermListIsPermutedTermYN,TermListLexicalTag,TermListRecordPreferredTermYN,TermSortVersion,TermString,TermTermUI,ThesaurusIDlistThesaurusID,TreeNumberListTreeNumber
0,TO,"4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrro...",M0353609,N,A-23187,0,M0000001,M0353609,NRW,"An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports C...",\n,8,3,1990,1,1,1984,27,5,2016,Calcium Ionophores,,,D000001,,91(75); was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)\n,,use CALCIMYCIN to search A 23187 1975-90\n,91; was A 23187 1975-90 (see under ANTIBIOTICS 1975-83)\n,,D061207,,Carboxylic Acids (1973-1974),toxicity,Q000633,52665-69-7 (Calcimycin),1,,,,N,Y,NON,N,,"A23187, Antibiotic",T000003,NLM (1991),D03.633.100.221.173


In [179]:
desc_2018_df.rename(columns={'AllowableQualifierAbbreviation': 'QualifierAbbreviation',
                            'ConceptConceptUI': 'ConceptUI',
                            'ConceptListPreferredConceptYN': 'PreferredConceptYN',
                            'ConceptRelationConcept1UI': 'Concept1UI',
                            'ConceptRelationConcept1UI': 'Concept2UI',
                            'ConceptRelationListRelationName' : 'ConceptRelationName',
                            'PreviousIndexingListPreviousIndexing': 'PreviousIndexing',
                            'EntryCombinationListEntryCombination': 'EntryCombination',
                            'RelatedRegistryNumberListRelatedRegistryNumber': 'RelatedRegistryNumber',
                            'SeeRelatedDescriptorDescriptorReferredTo': 'DescriptorReferredTo',
                            'SeeRelatedListSeeRelatedDescriptor': 'SeeRelatedDescriptor',
                            'TermListConceptPreferredTermYN': 'PreferredTermYN',
                            'TermListIsPermutedTermYN': 'IsPermutedTermYN',
                            'ThesaurusIDlistThesaurusID': 'ThesaurusID',
                            'TreeNumberListTreeNumber': 'TreeNumber'}, inplace=True)

In [180]:
# desc_2018_df['TreeNumber'][pd.isnull(desc_2018_df['TreeNumber'])] = ['U01', 'U02']
desc_2018_df = desc_2018_df[~pd.isnull(desc_2018_df['TreeNumber'])]

MeSH codes resemble the format "A01.343.124.243" with up to 12 levels, and where the first letter denotes the coarsest category. We want to know the position in the hierarchy for each word, so we count the number of splits in the code for each term.

In [181]:
code_splits = []

for c in desc_2018_df['TreeNumber'].str.split('.'):
    code_splits.append(c)

In [182]:
# mesh_tree_codes = ['.'.join(c) for c in code_splits]
code_lengths = [len(c) for c in code_splits]
max_code_length = max(code_lengths)
# desc_2018_df['MeshTreeCode'] = mesh_tree_codes

In [183]:
print(max_code_length)

13


In [184]:
# reset

# for c in desc_2018_df.columns:
#     if 'tree' in c:
#         desc_2018_df.drop(c, axis=1, inplace=True)

In [185]:
desc_2018_df['tree_number_0'] = [c[0][0] for c in code_splits]

In [186]:
code_splits[200]

['D12', '776', '664', '962', '813', '500', '875']

Let's add columns for each code order, so we can group terms together under common codes later.

In [187]:
for i in range(1, max_code_length):
    tree_lvl_codes = []
    for c in code_splits:
        if len(c) >= i:
            tree_lvl_codes.append('.'.join(c[:i]))
        else:
            tree_lvl_codes.append(np.nan)
    desc_2018_df['tree_number_{}'.format(i)] = tree_lvl_codes

We want to map the codes to actual terms, so starting with the 0th level, we map terms obtained manually from the MeSH website.

In [188]:
# from https://meshb.nlm.nih.gov/treeView
tree_0_map = {
    'A': 'anatomy',
    'B': 'organisms',
    'C': 'diseases',
    'D': 'chemicals and drugs',
    'E': 'analytical, diagnostic, and therapeutic techniques, and equipment',
    'F': 'psychiatry and psychology',
    'G': 'phenomena and processes',
    'H': 'disciplines and occupations',
    'I': 'anthropology, education, sociology, and social phenomena',
    'J': 'technology, industry, and agriculture',
    'K': 'humanities',
    'L': 'information science',
    'M': 'named groups',
    'N': 'health care',
    'V': 'publication characteristics',
    'Z': 'geographicals'
}

In [189]:
desc_2018_df['tree_string_0'] = desc_2018_df['tree_number_0'].map(tree_0_map)

Some of the original strings are reversed using commas. To help matching in the documents we should put them in correct order.

In [190]:
desc_2018_df.to_csv(proc_data + '/mesh_codes_cleaned_{}.csv'.format(today_str), index=False)
# desc_2018_df = pd.read_csv(proc_data + '/mesh_codes_cleaned_{}.csv'.format('5_3_2018')).drop('Unnamed: 0', axis=1)

In [191]:
def process_string(string):
    string = string.split(', ')
    string = ' '.join(string[::-1])
    return string.lower()

In [192]:
for c in desc_2018_df.columns:
    if 'String' in c:
        print(c)

ConceptNameString
DescriptorNameString
QualifierNameString
TermString


In [193]:
desc_2018_df['ConceptNameString'][:10]

0                  A-23187
1                    Difos
2                Abattoirs
3        Acronyms as Topic
4                  Abdomen
5           Abdomen, Acute
6       Abdominal Injuries
7      Abdominal Neoplasms
8    Transversus Abdominis
9           Abducens Nerve
Name: ConceptNameString, dtype: object

In [194]:
desc_2018_df['DescriptorNameString'][:10]

0        Calcium Ionophores
1              Insecticides
2                 Abattoirs
3    Abbreviations as Topic
4        Abdominal Injuries
5            Abdomen, Acute
6        Abdominal Injuries
7       Abdominal Neoplasms
8            Abdominal Wall
9          Abducens Nucleus
Name: DescriptorNameString, dtype: object

In [195]:
desc_2018_df['QualifierNameString'][:10]

0          toxicity
1          toxicity
2            ethics
3               NaN
4          injuries
5           nursing
6         pathology
7    ultrastructure
8    ultrastructure
9          injuries
Name: QualifierNameString, dtype: object

In [196]:
desc_2018_df['TermString'][:10]

0       A23187, Antibiotic
1                    Difos
2           Slaughterhouse
3        Acronyms as Topic
4                 Abdomens
5           Acute Abdomens
6        Injury, Abdominal
7     Neoplasms, Abdominal
8    Transverse Abdominals
9       Nerve VIs, Cranial
Name: TermString, dtype: object

In [197]:
desc_2018_df['ConceptStringProcessed'] = desc_2018_df['ConceptNameString'].apply(lambda x: process_string(x))
desc_2018_df['DescriptorStringProcessed'] = desc_2018_df['DescriptorNameString'].apply(lambda x: process_string(x))
# desc_2018_df['QualifierStringProcessed'] = desc_2018_df['QualifierNameString'].apply(lambda x: process_string(x))
desc_2018_df['TermStringProcessed'] = desc_2018_df['TermString'].apply(lambda x: process_string(x))

For each level, take the tree codes and the processed strings, but only for the ones where the next level up is NaN. This means that only ones which finish at this level of the tree are taken. Set the index of the dataframe to the tree codes and convert to a dict that maps codes to strings. Map that dict on to the codes for the next level up.

In [198]:
def expand_string_tree(df, string_column):
    for i in range(1, max_code_length - 1):
        tree_name_map = desc_2018_df[['TreeNumber', string_column]][pd.isnull(desc_2018_df['tree_number_{}'.format(i + 1)])].set_index('TreeNumber').to_dict()
        tree_name_map = tree_name_map[string_column]
        tree_name_map.pop(np.nan, None)
        desc_2018_df['tree_{}_{}'.format(string_column, i)] = desc_2018_df['tree_number_{}'.format(i)].map(tree_name_map, na_action='ignore')
    desc_2018_df['tree_{}_{}'.format(string_column, max_code_length - 1)] = np.nan
    return df

In [199]:
for c in ['ConceptStringProcessed', 'DescriptorStringProcessed', 'TermStringProcessed']:
    desc_2018_df = expand_string_tree(desc_2018_df, c)

In [168]:
# desc_2018_df.to_csv(proc_data + '/mesh_codes_cleaned_{}.csv'.format(today_str), index=False)

After this there are some broken codes, due to duplicate entries in the tree, but these are relatively few in number.

In [202]:
desc_2018_df['tree_order'] = code_lengths

Finally export as a json.

In [203]:
reoriented = desc_2018_df.set_index('ConceptStringProcessed')

In [208]:
concept_string_dict = reoriented.to_dict(orient='index')

In [211]:
reoriented.to_json(proc_data + '/mesh_codes_processed_{}.json'.format(today_str), orient='index')

Need to do a second iteration of this where the tree is not built on one of the terms, but rather the tree numbers.

Possible structure that we might want to obtain later:

```
{'A': {'level': 0,
       'term': 'humans',
       'children': {'A01': {...
                           }
                    ...
                   }
       ... 
      }
 ...
}
                   
```

In [6]:
reoriented = desc_2018_df.set_index('TreeNumber')

In [7]:
concept_string_dict = reoriented.to_dict(orient='index')

In [8]:
reoriented.to_json(proc_data + '/mesh_codes_processed_tree_number_{}.json'.format(today_str), orient='index')

In [3]:
desc_2018_df = pd.read_json('../data/processed/mesh_codes_processed_5_4_2018.json')

In [17]:
desc_2018_df.set_index('TreeNumber').to_json('../data/processed/mesh_codes_processed_5_8_2018.json', orient='index')

In [25]:
desc_2018_df_2[desc_2018_df_2['TermString'].str.contains('informat')]

Unnamed: 0,Concept2UI,ConceptCASN1Name,ConceptNameString,ConceptRegistryNumber,ConceptRelationConcept2UI,ConceptRelationName,ConceptScopeNote,ConceptTermList,ConceptUI,DateCreatedDay,DateCreatedMonth,DateCreatedYear,DateEstablishedDay,DateEstablishedMonth,DateEstablishedYear,DateRevisedDay,DateRevisedMonth,DateRevisedYear,DescriptorNameString,DescriptorRecordAnnotation,DescriptorRecordConsiderAlso,DescriptorRecordDescriptorUI,DescriptorRecordEntryCombinationList,DescriptorRecordHistoryNote,DescriptorRecordNLMClassificationNumber,DescriptorRecordOnlineNote,DescriptorRecordPublicMeSHNote,DescriptorRecordSeeRelatedList,DescriptorReferredTo,DescriptorReferredToDescriptorUI,DescriptorStringProcessed,EntryCombination,IsPermutedTermYN,PreferredConceptYN,PreferredTermYN,PreviousIndexing,QualifierAbbreviation,QualifierNameString,QualifierReferredToQualifierUI,RelatedRegistryNumber,RootDescriptorClass,SeeRelatedDescriptor,TermEntryVersion,TermListLexicalTag,TermListRecordPreferredTermYN,TermSortVersion,TermString,TermStringProcessed,TermTermUI,ThesaurusID,TreeNumber,tree_ConceptStringProcessed_1,tree_ConceptStringProcessed_10,tree_ConceptStringProcessed_11,tree_ConceptStringProcessed_12,tree_ConceptStringProcessed_2,tree_ConceptStringProcessed_3,tree_ConceptStringProcessed_4,tree_ConceptStringProcessed_5,tree_ConceptStringProcessed_6,tree_ConceptStringProcessed_7,tree_ConceptStringProcessed_8,tree_ConceptStringProcessed_9,tree_DescriptorStringProcessed_1,tree_DescriptorStringProcessed_10,tree_DescriptorStringProcessed_11,tree_DescriptorStringProcessed_12,tree_DescriptorStringProcessed_2,tree_DescriptorStringProcessed_3,tree_DescriptorStringProcessed_4,tree_DescriptorStringProcessed_5,tree_DescriptorStringProcessed_6,tree_DescriptorStringProcessed_7,tree_DescriptorStringProcessed_8,tree_DescriptorStringProcessed_9,tree_TermStringProcessed_1,tree_TermStringProcessed_10,tree_TermStringProcessed_11,tree_TermStringProcessed_12,tree_TermStringProcessed_2,tree_TermStringProcessed_3,tree_TermStringProcessed_4,tree_TermStringProcessed_5,tree_TermStringProcessed_6,tree_TermStringProcessed_7,tree_TermStringProcessed_8,tree_TermStringProcessed_9,tree_number_0,tree_number_1,tree_number_10,tree_number_11,tree_number_12,tree_number_2,tree_number_3,tree_number_4,tree_number_5,tree_number_6,tree_number_7,tree_number_8,tree_number_9,tree_order,tree_string_0
bio-informatics,M0028727,,Bio-Informatics,,M0424650,NRW,A field of biology concerned with the development of techniques for the collection and manipulat...,\n,M0424650,6,12,1995,1,1,1997,16,6,2014,Medical Informatics,use for the discipline and as a coordinate for bioinformatics studies\n,,D019295,,97\n,,,97\n,\n,\n,D008490,medical informatics,,Y,N,N,Molecular Biology (1992-1996),ES,ethics,Q000941,,1,\n,MOL BIOL COMPUTIONAL,NON,N,,Bioinformatic,bioinformatic,T057482,NLM (1997),L01.313.124,information science,,,,informatics,bio-informatics,,,,,,,information science,,,,informatics,medical informatics,,,,,,,information sciences,,,,informatics,bioinformatic,,,,,,,L,L01,,,,L01.313,L01.313.124,,,,,,,3,information science
