In [None]:
import pandas as pd
import xmltodict
from tqdm import tqdm
from IPython.display import display, HTML


def runProcess():
    """
    Run the whole counting pipeline.

    Parses '../xml/en_product1.xml', separates active from inactive entities,
    counts the nomenclature categories, writes the resulting table to
    '../output_tables/CountNomenclature.xlsx' and displays it as HTML.

    :return: None
    """
    print('--> Computing data ...')

    # Parse the XML export and split its entities by activity status
    parsed_xml = getData('../xml/en_product1.xml')
    total_count, active_list, inactive_list = getActivesEntities(parsed_xml)

    # Build the summary table (one row per nomenclature category)
    summary_columns = countEntities(total_count, inactive_list, active_list)
    summary_table = pd.DataFrame(summary_columns)

    # Persist the table as an Excel sheet, without the index column
    with pd.ExcelWriter('../output_tables/CountNomenclature.xlsx', engine='xlsxwriter') as xlsx_out:
        summary_table.to_excel(xlsx_out, index=False)

    # Render the same table in the notebook output
    html_table = summary_table.to_html(index=False)
    display(HTML(html_table))

def getData(xmlfile):
    """
    Load an XML file and convert it to a dict with the xmltodict package.

    :param xmlfile: path of the XML file; it is opened as ISO-8859-1 text
    :return: the value returned by xmltodict.parse for the file content
    """
    with open(xmlfile, "r", encoding='ISO-8859-1') as xml_handle:
        raw_xml = xml_handle.read()
    # Parsing works on the in-memory string, so the file can already be closed
    return xmltodict.parse(raw_xml)


def getActivesEntities(xml_data):
    """
    Split the entities of a parsed XML document into active and inactive ones.

    An entity is considered inactive when at least one of its disorder flags
    has the value '8192'; every other entity (including one that carries no
    flag at all) is considered active.

    :param xml_data: dict as returned by xmltodict; the entities are read from
        xml_data["JDBOR"]["DisorderList"]["Disorder"]
    :return: tuple (n_full_entities, actives_entities, inactives_entities)
        where n_full_entities is the number of entities visited
    """
    inactive_flag = '8192'

    disorder_list = xml_data["JDBOR"]["DisorderList"] or {}
    disorders = disorder_list.get("Disorder") or []
    # xmltodict returns a dict instead of a list when an element occurs only
    # once; wrap it so the loop always iterates over entities, not over keys.
    if isinstance(disorders, dict):
        disorders = [disorders]

    inactives_entities = []
    actives_entities = []
    for entity in tqdm(disorders):
        # Same single-vs-many normalisation for the flags; a missing or empty
        # flag list is treated as "no flags".
        flag_container = entity.get('DisorderFlagList') or {}
        flags = flag_container.get('DisorderFlag') or []
        if isinstance(flags, dict):
            flags = [flags]

        if any(flag['Value'] == inactive_flag for flag in flags):
            inactives_entities.append(entity)
        else:
            actives_entities.append(entity)

    # Every visited entity lands in exactly one of the two lists
    n_full_entities = len(actives_entities) + len(inactives_entities)
    return n_full_entities, actives_entities, inactives_entities


def countEntities(n_entities, inactives_entities, actives_entities):
    """
    Count entities per nomenclature category and compute their percentages.

    Active entities are classified by entity['DisorderGroup']['@id']:
    '36540' is counted as group of disorders, '36547' as disorder and
    '36554' as subtype of disorder. Active entities with any other id only
    contribute to the total of active entities.

    Inactive/active percentages are relative to n_entities; the three
    category percentages are relative to the number of active entities.
    When a denominator is zero the percentage is reported as '-'.

    :param n_entities: total number of entities
    :param inactives_entities: list of inactive entities (only its length is used)
    :param actives_entities: list of active entities
    :return: dict with the keys 'Nomenclature', 'Amount' and '%', each mapping
        to a list of six values (one per table row)
    """
    def percent(part, whole):
        # Avoid ZeroDivisionError on empty input; '-' is the table's
        # "not applicable" marker.
        if not whole:
            return '-'
        return str(round(((part * 100) / whole), 2))

    # Only the counts are needed, so tally ids instead of collecting entities
    group_counts = {'36540': 0, '36547': 0, '36554': 0}
    for entity in tqdm(actives_entities):
        group_id = entity['DisorderGroup']['@id']
        if group_id in group_counts:
            group_counts[group_id] += 1

    n_inactives = len(inactives_entities)
    n_actives = len(actives_entities)
    n_groups_of_disorders = group_counts['36540']
    n_disorders = group_counts['36547']
    n_subtypes_of_disorders = group_counts['36554']

    dict_results = {}
    dict_results['Nomenclature'] = ['Orphanet Entities' , 'Inactives Entities', 'Actives Entities', 'Actives Groups of Disorders', 'Actives Disorders', 'Actives Subtypes of Disorders']
    dict_results['Amount'] = [n_entities, n_inactives, n_actives, n_groups_of_disorders, n_disorders, n_subtypes_of_disorders]
    dict_results['%'] = [
        '-',
        percent(n_inactives, n_entities),
        percent(n_actives, n_entities),
        percent(n_groups_of_disorders, n_actives),
        percent(n_disorders, n_actives),
        percent(n_subtypes_of_disorders, n_actives),
    ]
    return dict_results


# Entry point: run the pipeline only when this file is executed directly
if __name__ == "__main__":
    runProcess()