In [5]:
import pandas as pd
import xmltodict
from IPython.display import display, Markdown




def runProcess():

    dict_results = {}
    dict_results[''] = ['Actives Eentities', 'Disorders', 'Disorders With a Definition', '%']
    dict_language = { 'en' : 'English',
                      'cs' : 'Czech',
                      'de' : 'German',
                      'es' : 'Spanish',
                      'fr' : 'French',
                      'it' : 'Italian',
                      'nl' : 'Dutch',
                      'pl' : 'Polish',
                      'pt' : 'Portuguese'
                      }

    for iso,language in dict_language.items():
        dict_results[language] = []
        print('--> Computing {} data ...'.format(language))
        xml_data = getData('xml/' + iso + '_product1.xml')
        ## SELECT ACTIVE ENTITIES
        actives_entities = getActivesEntities(xml_data)
        #print('Number of actives entities: ' + str(len(actives_entities)))
        dict_results[language].append(actives_entities)
        ## SELECT DISORDERS
        disorders = getDisorders(actives_entities)
        n_disorders = len(disorders)
        dict_results[language].append(n_disorders)
        #print('Number of disorders: ' + str(n_disorders))
        ## SELECT DISORDERS WITH A DEFINITION
        disordersWithDefinition = getDisordersWithDefinition(disorders)
        n_disordersWithDefinition = len(disordersWithDefinition)
        dict_results[language].append(n_disordersWithDefinition)
        percent_n_disordersWithDefinition = round((n_disordersWithDefinition / n_disorders)*100, 2)
        dict_results[language].append(percent_n_disordersWithDefinition)
        #print('Number of disorders with a definition in {} : {} ({}%)'.format(language, str(n_disordersWithDefinition), str(percent_n_disordersWithDefinition)))
        #print()
    myDataframe = pd.DataFrame(dict_results)
    myDataframe = myDataframe[1:]
    display(myDataframe.head(len(dict_results[''])))
    
    
def getData(xmlfile):
    """
    Read an xml return a dict with xmltodict package
    :return: xml parsed as dict
    """
    with open(xmlfile, "r", encoding='ISO-8859-1') as ini:
        xml_dict = xmltodict.parse(ini.read())
    return xml_dict

def getActivesEntities(xml_data):
    """
    filter out inactive entities highlighting flag 8192
    :param xml_data:
    :return:
    """
    actives_entities = []
    for entity in xml_data["JDBOR"]["DisorderList"]["Disorder"]:
        DisorderFlagList = entity['DisorderFlagList']['DisorderFlag']
        if isinstance(DisorderFlagList, dict):
            if DisorderFlagList['Value'] != '8192':
                actives_entities.append(entity)
        else:
            DisorderFlags = []
            for DisorderFlag in DisorderFlagList:
                DisorderFlags.append(DisorderFlag['Value'])
            if not '8192' in DisorderFlags:
                actives_entities.append(entity)
    return actives_entities


def getDisorders(actives_entities):
    """
    keeping only disorders (with flag 36547)
    :param actives_entities:
    :return:
    """
    disorders = []
    for entity in actives_entities:
        if entity['DisorderGroup']['@id'] == '36547':
            disorders.append(entity)
    return disorders




def getDisordersWithDefinition(disorders):
    """
    Calculation of numbers of disorders with a definition
    # We look for the disorders that have a definition in their datas
    :param disorders:
    :return:
    """
    disordersWithDefinition = []
    for disorder in disorders:
        try:
            if disorder["SummaryInformationList"]['@count'] != '0':
                try:
                    if disorder["SummaryInformationList"]['SummaryInformation']['TextSectionList']['@count'] != '0':
                        if disorder["SummaryInformationList"]['SummaryInformation']['TextSectionList']['TextSection']['TextSectionType']['@id'] == '16907':
                              disordersWithDefinition.append(disorder)
                except:
                    pass
        except:
            pass
    return disordersWithDefinition

if __name__ == '__main__':
    runProcess()

--> Computing English data ...
--> Computing Czech data ...
--> Computing German data ...
--> Computing Spanish data ...
--> Computing French data ...
--> Computing Italian data ...
--> Computing Dutch data ...
--> Computing Polish data ...
--> Computing Portuguese data ...


Unnamed: 0,Unnamed: 1,English,Czech,German,Spanish,French,Italian,Dutch,Polish,Portuguese
1,Disorders,6227.0,6227.0,6227.0,6227.0,6227.0,6227.0,6227.0,6227.0,6227.0
2,Disorders With a Definition,5948.0,1.0,2203.0,5217.0,4294.0,3395.0,5734.0,719.0,833.0
3,%,95.52,0.02,35.38,83.78,68.96,54.52,92.08,11.55,13.38
