# In [None]:
from IPython.display import display, HTML
from tqdm import tqdm
import pandas as pd
import xmltodict
import numpy as np


# Languages to process: ISO 639-1 code (used to build the XML file name)
# mapped to the language name used as the column header of the output table.
# Insertion order is the column order of the resulting table.
dict_language = dict((
    ('en', 'English'),
    ('cs', 'Czech'),
    ('de', 'German'),
    ('es', 'Spanish'),
    ('fr', 'French'),
    ('it', 'Italian'),
    ('nl', 'Dutch'),
    ('pl', 'Polish'),
    ('pt', 'Portuguese'),
))


# Row labels of the output table (first, unnamed column).
# Four sections of ten labels each, separated by two blank spacer rows.
# The order of the labels inside a section must match the order in which
# runProcess() appends the corresponding values.
list_entries_synonymes_table = (
    # --- all active entities ---
    [
        'Orphanet active entities',
        'Total synonyms in Orphanet active entities',
        'Orphanet active entities with at least one synonym',
        'Orphanet active entities without synonym',
        '% of Orphanet active entities with at least one synonym',
        '% of Orphanet active entities without synonym',
        'Median number of synonyms in Orphanet active entities',
        'Mean number of synonyms in Orphanet active entities',
        'Mean number of synonyms in Orphanet active entities with at least one synonym',
        'Maximum number of synonyms in Orphanet active entities',
    ]
    + ['', '']
    # --- groups of disorders ---
    # (two labels below carry a trailing space; kept as in the published table)
    + [
        'Orphanet groups of disorders',
        'Total synonyms in Orphanet groups of disorders',
        'Orphanet groups of disorders with at least one synonym',
        'Orphanet groups of disorders without synonym',
        '% of Orphanet groups of disorders with at least one synonym',
        '% of Orphanet groups of disorders without synonym',
        'Median number of synonyms in Orphanet groups of disorders ',
        'Mean number of synonyms in Orphanet groups of disorders',
        'Mean number of synonyms in Orphanet groups of disorders with at least one synonym',
        'Maximum number of synonyms in Orphanet groups of disorders ',
    ]
    + ['', '']
    # --- disorders ---
    + [
        'Orphanet disorders',
        'Total synonyms in Orphanet disorders',
        'Orphanet disorders with at least one synonym',
        'Orphanet disorders without synonym',
        '% of Orphanet disorders with at least one synonym',
        '% of Orphanet disorders without synonym',
        'Median number of synonyms in Orphanet disorders',
        'Mean number of synonyms in Orphanet disorders',
        'Mean number of synonyms in Orphanet disorders with at least one synonym',
        'Maximum number of synonyms in Orphanet disorders',
    ]
    + ['', '']
    # --- subtypes of disorders ---
    + [
        'Orphanet subtypes of disorders',
        'Total synonyms in Orphanet subtypes of disorders',
        'Orphanet subtypes of disorders with at least one synonym',
        'Orphanet subtypes of disorders without synonym',
        '% of Orphanet subtypes of disorders with at least one synonym',
        '% of Orphanet subtypes of disorders without synonym',
        'Median number of synonyms in Orphanet subtypes of disorders',
        'Mean number of synonyms in Orphanet subtypes of disorders',
        'Mean number of synonyms in Orphanet subtypes of disorders with at least one synonym',
        'Maximum number of synonyms in Orphanet subtypes of disorders',
    ]
)


def _formatPercentage(part, whole):
    """
    Express ``part / whole`` as a percentage string rounded to 2 decimals.
    :param part: numerator (number of entities matching a criterion)
    :param whole: denominator (total number of entities)
    :return: str, e.g. '12.5'; '0.0' when ``whole`` is zero instead of
             raising ZeroDivisionError
    """
    if not whole:
        return str(0.0)
    return str(round((part / whole) * 100, 2))


def _buildSynonymRows(entities):
    """
    Build the ten table cells describing one category of entities.
    The order matches one ten-label section of list_entries_synonymes_table:
    count, total synonyms, with synonym, without synonym, % with, % without,
    median, mean, mean among entities with a synonym, maximum.
    :param entities: list of entities belonging to the category
    :return: list of 10 values
    """
    (n_without_synonyms, n_with_synonyms, total_number_of_synonyms,
     median_syns, max_syns, avg_syn, avg_syn_with_one_syn) = getSynonymsStatistics(entities)
    n_entities = len(entities)
    # Counts are cast to plain int for every category so that all sections
    # of the table hold the same Python types (numpy integers included).
    return [
        int(n_entities),
        int(total_number_of_synonyms),
        int(n_with_synonyms),
        int(n_without_synonyms),
        _formatPercentage(n_with_synonyms, n_entities),
        _formatPercentage(n_without_synonyms, n_entities),
        int(median_syns),
        avg_syn,
        avg_syn_with_one_syn,
        int(max_syns),
    ]


def runProcess():
    """
    Compute the synonym statistics table for every language of dict_language.

    For each language the file '../xml/<iso>_product1.xml' is parsed, inactive
    entities are dropped, and one column of statistics is built for four
    categories in turn: all active entities, groups of disorders, disorders
    and subtypes of disorders. The resulting table (row labels taken from
    list_entries_synonymes_table, one column per language) is written to
    '../output_tables/NumberOfSynonymsAndNomenclatureTerms.xlsx' and displayed
    as HTML.
    :return: None
    """
    print('--> Computing data ...')
    dict_results = {'': list_entries_synonymes_table}

    # One selector per table section, in the order of the row labels.
    # Defined here (not at module level) because the filters are declared
    # further down in the file.
    category_selectors = (
        lambda entities: entities,  # all active entities
        getGroupsOfDisorders,
        getDisorders,
        getSubtypesOfDisorders,
    )

    for iso, language in tqdm(dict_language.items()):
        active_entities = getactiveEntities(getData('../xml/' + iso + '_product1.xml'))

        column = []
        for position, select in enumerate(category_selectors):
            if position:
                # two blank spacer rows between consecutive sections
                column.extend(['', ''])
            column.extend(_buildSynonymRows(select(active_entities)))
        dict_results[language] = column
        del active_entities

    myDataframe = pd.DataFrame(dict_results)
    with pd.ExcelWriter('../output_tables/NumberOfSynonymsAndNomenclatureTerms.xlsx', engine='xlsxwriter') as writer:
        myDataframe.to_excel(writer, index=False)
    display(HTML(myDataframe.to_html(index=False)))


def getData(xmlfile):
    """
    Load an XML file and convert it with the xmltodict package.
    The file is read as text decoded with ISO-8859-1.
    :param xmlfile: path of the XML file to read
    :return: result of xmltodict.parse on the file content
    """
    with open(xmlfile, "r", encoding='ISO-8859-1') as xml_handle:
        return xmltodict.parse(xml_handle.read())


def getactiveEntities(xml_data):
    """
    Keep only the entities that do not carry the flag value '8192'.
    :param xml_data: parsed XML; entities are read from
                     xml_data["JDBOR"]["DisorderList"]["Disorder"]
    :return: list of entities without the '8192' flag, in input order
    """
    def has_no_inactive_flag(entity):
        flags = entity['DisorderFlagList']['DisorderFlag']
        # A single flag is a dict, several flags are a sequence of dicts.
        if isinstance(flags, dict):
            return flags['Value'] != '8192'
        flag_values = [flag['Value'] for flag in flags]
        return '8192' not in flag_values

    return [
        entity
        for entity in xml_data["JDBOR"]["DisorderList"]["Disorder"]
        if has_no_inactive_flag(entity)
    ]


def getGroupsOfDisorders(active_entities):
    """
    Keep only groups of disorders (DisorderGroup '@id' equal to '36540').
    :param active_entities: list of entities to filter
    :return: list of matching entities, in input order
    """
    return [
        entity
        for entity in active_entities
        if entity['DisorderGroup']['@id'] == '36540'
    ]


def getDisorders(active_entities):
    """
    Keep only disorders (DisorderGroup '@id' equal to '36547').
    :param active_entities: list of entities to filter
    :return: list of matching entities, in input order
    """
    return [
        entity
        for entity in active_entities
        if entity['DisorderGroup']['@id'] == '36547'
    ]


def getSubtypesOfDisorders(active_entities):
    """
    Keep only subtypes of disorders (DisorderGroup '@id' equal to '36554').
    :param active_entities: list of entities to filter
    :return: list of matching entities, in input order
    """
    return [
        entity
        for entity in active_entities
        if entity['DisorderGroup']['@id'] == '36554'
    ]


def getSynonymsStatistics(entities):
    """
    Generic method to get statistics on synonyms.

    The number of synonyms of an entity is read from
    entity['SynonymList']['@count'].
    :param entities: iterable of entities
    :return: tuple of
        - number of entities without synonym
        - number of entities with at least one synonym
        - total number of synonyms
        - median number of synonyms (truncated to int)
        - maximum number of synonyms
        - mean number of synonyms over all entities (rounded to 2 decimals)
        - mean number of synonyms over entities having at least one synonym
          (rounded to 2 decimals)
      When there is no entity, or no entity with a synonym, the statistics
      that cannot be computed are returned as 0 / 0.0 instead of raising.
    """
    synonym_counts = []
    n_entities_with_no_synonyms = 0
    for entity in entities:
        n_synonyms = int(entity['SynonymList']['@count'])
        if n_synonyms:
            synonym_counts.append(n_synonyms)
        else:
            n_entities_with_no_synonyms += 1

    n_entities_with_synonyms = len(synonym_counts)
    n_entities = n_entities_with_no_synonyms + n_entities_with_synonyms
    total_number_of_synonyms = sum(synonym_counts)

    # Guard against division by zero on an empty category.
    avg_syn = round(total_number_of_synonyms / n_entities, 2) if n_entities else 0.0

    if synonym_counts:
        synonym_array = np.array(synonym_counts)
        avg_syn_with_one_syn = round(total_number_of_synonyms / n_entities_with_synonyms, 2)
        # NOTE(review): the median is taken over entities that have at least
        # one synonym only (zero counts are excluded), and is truncated to
        # int -- confirm this matches the "Median number of synonyms" label.
        median_syn = int(np.median(synonym_array))
        # Converted to a plain int so every count in the tuple has the same type.
        max_syn = int(np.max(synonym_array))
    else:
        # np.median / np.max are undefined on an empty array.
        avg_syn_with_one_syn = 0.0
        median_syn = 0
        max_syn = 0

    return (n_entities_with_no_synonyms, n_entities_with_synonyms, total_number_of_synonyms,
            median_syn, max_syn, avg_syn, avg_syn_with_one_syn)


# Run the whole computation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    runProcess()
