In [None]:
import collections
import pandas as pd
import xmltodict
from tqdm import tqdm
from IPython.display import display, HTML

def runProcess():
    """
    Build the Orphanet genetic-disorders summary, display it and export it.

    Reads '../xml/en_product1.xml' and '../xml/en_product3_156.xml', counts
    the active clinical entities, the active disorders and the ORPHAcodes
    returned by filterGeneticDisorders(), then shows the one-row table as HTML
    and writes it to '../output_tables/GeneticDisorders.xlsx'.
    :return: None
    """
    print('--> Computing data ...')

    # Active clinical entities, then the subset kept by getDisorders().
    active_entities = getActivesEntities(getData('../xml/en_product1.xml'))
    active_disorders = getDisorders(active_entities)

    # ORPHAcodes retained from the second (classification) file.
    genetic_codes = filterGeneticDisorders(getData('../xml/en_product3_156.xml'))

    entity_count = len(active_entities)
    disorder_count = len(active_disorders)
    genetic_count = len(genetic_codes)
    # Share of genetic disorders among active disorders, as a percentage.
    genetic_share = round((genetic_count * 100) / disorder_count, 2)

    # One-row table; the key order fixes the column order of the output.
    summary = pd.DataFrame({
        'Actives Orphanet clinical entities': [entity_count],
        'Actives Orphanet disorders': [disorder_count],
        'Actives Orphanet genetic disorders': [genetic_count],
        '% Actives Orphanet genetic disorders': [str(genetic_share)],
    })
    display(HTML(summary.to_html(index=False)))

    excel_path = '../output_tables/GeneticDisorders.xlsx'
    with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
        summary.to_excel(writer, index=False)

def getData(xmlfile, encoding='ISO-8859-1'):
    """
    Read an XML file and parse it into a dict with the xmltodict package.

    :param xmlfile: path of the XML file to read
    :param encoding: text encoding used to decode the file; defaults to
        'ISO-8859-1', the value that used to be hard-coded here
    :return: xml parsed as dict
    """
    with open(xmlfile, "r", encoding=encoding) as ini:
        xml_dict = xmltodict.parse(ini.read())
    return xml_dict


def getActivesEntities(xml_data):
    """
    Filter out inactive entities, i.e. those carrying the flag value '8192'.

    :param xml_data: parsed XML dict exposing JDBOR > DisorderList > Disorder
    :return: list of the entities that do not carry flag '8192', in input order
    """
    kept = []
    for disorder in tqdm(xml_data["JDBOR"]["DisorderList"]["Disorder"]):
        flags = disorder['DisorderFlagList']['DisorderFlag']
        # A lone flag arrives as a dict instead of a list of dicts; wrap it
        # so both shapes go through the same check.
        if isinstance(flags, dict):
            flags = [flags]
        flag_values = [flag['Value'] for flag in flags]
        if '8192' in flag_values:
            continue
        kept.append(disorder)
    return kept


def getDisorders(actives_entities):
    """
    Keep only the disorders, i.e. entities whose DisorderGroup '@id' is '36547'.

    :param actives_entities: iterable of entity dicts with a 'DisorderGroup' entry
    :return: list of the matching entities, in input order
    """
    disorder_group_id = '36547'
    return [
        candidate
        for candidate in actives_entities
        if candidate['DisorderGroup']['@id'] == disorder_group_id
    ]


class ClassifLineDict(collections.OrderedDict):
    """
    Ordered dict holding one line of the classif sheet; the insertion order of
    its keys is the column order used in the EXCEL file.
    """

    def __init__(self):
        super().__init__()
        # Column name -> default value, listed in output column order.
        column_defaults = (
            ("ORPHAcode", 0),
            ("OrphId", 0),
            ("Label", ""),
            ("DisorderType", ""),
            ("DisorderTypeId", ""),
        )
        for column, default in column_defaults:
            self[column] = default


def recursive_explore(tree, depth):
    """
    Walk one classification node and all its descendants, appending a
    ClassifLineDict() for each disorder to the global list_classif_out.

    The global list_classif_out must already exist when this is called
    (explore_classif creates it before starting the walk).
    :param tree: current classification node, a dict with "Disorder" and
        "ClassificationNodeChildList" entries; any non-dict value is ignored
    :param depth: int, depth in the classification tree
    :return: None
    """
    if not isinstance(tree, dict):
        return

    disorder = tree["Disorder"]
    leaf = ClassifLineDict()
    leaf["ORPHAcode"] = int(disorder["OrphaCode"])
    leaf["OrphId"] = int(disorder["@id"])
    leaf["Label"] = disorder["Name"]["#text"]
    # DisorderType is optional; the ClassifLineDict defaults stay when absent.
    if "DisorderType" in disorder:
        leaf["DisorderType"] = disorder["DisorderType"]["Name"]["#text"]
        leaf["DisorderTypeId"] = disorder["DisorderType"]["@id"]
    list_classif_out.append(leaf)

    child_list = tree["ClassificationNodeChildList"]
    if child_list["@count"] == "0":
        return

    children = child_list["ClassificationNode"]
    # A single child is a dict rather than a list; wrap it so both shapes are
    # walked by the same loop.
    if not isinstance(children, list):
        children = [children]
    for node in children:
        recursive_explore(node, depth + 1)

def explore_classif(xml_dict):
    """
    Will explore the classification xml to fill ClassifLineDict() for each disorder

    The first element of the result describes the classification itself
    (its OrphaNumber, id and name); the following ones are appended by
    recursive_explore while walking the classification tree. The result is
    also stored in the module-level global list_classif_out, which is
    re-created on every call.
    :param xml_dict: parsed XML dict exposing JDBOR > ClassificationList > Classification
    :return: list_classif_out = [ClassifLineDict(), ...]
    """
    global list_classif_out

    root = xml_dict["JDBOR"]["ClassificationList"]["Classification"]
    classif_head = ClassifLineDict()
    classif_head["ORPHAcode"] = int(root["OrphaNumber"])
    classif_head["OrphId"] = int(root["@id"])
    classif_head["Label"] = root["Name"]["#text"]
    list_classif_out = [classif_head, ]

    root_nodes = root["ClassificationNodeRootList"]["ClassificationNode"]
    # Several root nodes arrive as a list, a single one as a dict. Normalise
    # to a list so that no root is skipped (recursive_explore ignores anything
    # that is not a dict).
    if not isinstance(root_nodes, list):
        root_nodes = [root_nodes]
    for tree in root_nodes:
        recursive_explore(tree, depth=0)
    return list_classif_out

def filterGeneticDisorders(disorders_genetic_data):
    """
    Collect the ORPHAcodes of the classification lines whose DisorderTypeId is
    one of the retained type ids.

    :param disorders_genetic_data: parsed classification XML dict, handed to explore_classif
    :return: set of ORPHAcodes
    """
    retained_type_ids = ['21429', '21401', '21415', '21394', '21422', '21408']
    classification_lines = explore_classif(disorders_genetic_data)
    return {
        line['ORPHAcode']
        for line in classification_lines
        if line['DisorderTypeId'] in retained_type_ids
    }


def generateMatrixResults(dict_preferential_parents, dict_filtered_preferential_parents):
    """
    Build two parallel columns: one value per preferential parent and the
    number of disorders listed under it.

    :param dict_preferential_parents: mapping from a preferential-parent key to
        the value reported in the 'Preferential parent' column
    :param dict_filtered_preferential_parents: mapping from the same keys to a
        sized collection of disorders
    :return: dict with the keys 'Preferential parent' and 'Disorders', each a
        list in the iteration order of dict_filtered_preferential_parents
    """
    parent_column = []
    count_column = []
    for parent_key, members in tqdm(dict_filtered_preferential_parents.items()):
        parent_column.append(dict_preferential_parents[parent_key])
        count_column.append(len(members))
    return {'Preferential parent': parent_column, 'Disorders': count_column}

# Run the whole computation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    runProcess()