# In [None]:
import os
import collections
import pandas as pd
import xmltodict
import configuration
from IPython.display import display, HTML
from tqdm import tqdm
from collections import OrderedDict

def runProcess():
    """
    Build the "number of disorders per Orphanet classification" table.

    Every file of ``../xml/`` whose name contains ``_product3_`` (except the
    ``_product3_235`` one) is parsed with getData() and flattened with
    explore_classif(). For each file, the distinct ORPHAcodes whose
    DisorderTypeId belongs to the fixed id set below are counted and stored
    under the label of the first entry returned by explore_classif() (the
    classification head).

    The table is sorted by decreasing count, written to
    ``../output_tables/MedicalSpecialties.xlsx`` and displayed as HTML.

    Side effect: the process working directory is changed to the XML folder
    and left there; the output path is therefore resolved from that folder.

    :return: None
    """
    working_path = '../xml/'
    # Kept for backward compatibility: the relative output path further down
    # is resolved from the XML folder.
    os.chdir(working_path)
    # Use the directory we actually ended up in instead of re-applying the
    # relative ``working_path`` from inside it (which only resolved because
    # the folder happens to be named "xml").
    xml_dir = os.getcwd()
    list_of_files = [
        xmlfile
        for xmlfile in os.listdir(xml_dir)
        if '_product3_' in xmlfile and '_product3_235' not in xmlfile
    ]

    # DisorderTypeId values that are counted; built once, outside the loop.
    # NOTE(review): presumably the ids of the "active disorder" types - confirm.
    counted_disorder_type_ids = {'21429', '21401', '21415', '21394', '21422', '21408'}

    MedicalSpecialties = {}
    for xmlfile in tqdm(list_of_files):
        xml_dict = getData(os.path.join(xml_dir, xmlfile))
        full_classif_list = explore_classif(xml_dict)
        # A set, so that an ORPHAcode present several times is counted once.
        orphacodes = {
            classif["ORPHAcode"]
            for classif in full_classif_list
            if classif['DisorderTypeId'] in counted_disorder_type_ids
        }
        MedicalSpecialties[full_classif_list[0]['Label']] = len(orphacodes)

    # Rows are fed in alphabetical order of the classification name before
    # being sorted by count.
    classification_names = sorted(MedicalSpecialties)
    myDataframe = pd.DataFrame({
        'Orphanet classification name': classification_names,
        'Number of active disorders included in the hierarchy': [
            MedicalSpecialties[name] for name in classification_names
        ],
    })
    myDataframe = myDataframe.sort_values(by=['Number of active disorders included in the hierarchy'], ascending=False)
    with pd.ExcelWriter('../output_tables/MedicalSpecialties.xlsx', engine='xlsxwriter') as writer:
        myDataframe.to_excel(writer, index=False)
    display(HTML(myDataframe.to_html(index=False)))


def getData(xmlfile):
    """
    Load an XML file and convert it to a dict with the xmltodict package.
    The file is read as ISO-8859-1 text.
    :param xmlfile: path of the XML file to read
    :return: xml parsed as dict
    """
    with open(xmlfile, mode="r", encoding='ISO-8859-1') as handle:
        raw_xml = handle.read()
    return xmltodict.parse(raw_xml)



class ClassifLineDict(collections.OrderedDict):
    """
    Ordered dict describing one line of the classif sheet; the insertion order of the keys
    is the order of the columns in the EXCEL file.
    """

    def __init__(self):
        super().__init__()
        # Column name -> default value, listed in column order.
        column_defaults = (
            ("ORPHAcode", 0),
            ("OrphId", 0),
            ("Label", ""),
            ("DisorderType", ""),
            ("DisorderTypeId", ""),
        )
        for column, default in column_defaults:
            self[column] = default


def recursive_explore(tree, depth):
    """
    Will explore the classification to fill the global list_classif_out with ClassifLineDict() for each disorder.
    Nodes are appended parent first, then their children in document order.
    :param tree: current classification node (dict), or a list of sibling nodes;
                 any other value is ignored
    :param depth: int, depth in the classification tree (only passed along to the children)
    :return: None
    """
    if isinstance(tree, list):
        # xmltodict turns a repeated element into a list: walk every sibling,
        # at the root as well as for the children.
        for node in tree:
            recursive_explore(node, depth)
        return
    if not isinstance(tree, dict):
        return

    disorder = tree["Disorder"]
    leaf = ClassifLineDict()
    leaf["ORPHAcode"] = int(disorder["OrphaCode"])
    leaf["OrphId"] = int(disorder["@id"])
    leaf["Label"] = disorder["Name"]["#text"]
    # DisorderType is optional: the ClassifLineDict defaults are kept without it.
    if "DisorderType" in disorder:
        disorder_type = disorder["DisorderType"]
        leaf["DisorderType"] = disorder_type["Name"]["#text"]
        leaf["DisorderTypeId"] = disorder_type["@id"]
    list_classif_out.append(leaf)

    children = tree["ClassificationNodeChildList"]
    # "@count" is compared as a string; "0" means there is no child node to read.
    if children["@count"] != "0":
        recursive_explore(children["ClassificationNode"], depth + 1)

def explore_classif(xml_dict):
    """
    Flatten one classification xml into a list of ClassifLineDict().
    The first element describes the classification itself, the following ones are
    appended by recursive_explore() through the global list_classif_out.
    :param xml_dict: parsed xml, read at ["JDBOR"]["ClassificationList"]["Classification"]
    :return: list_classif_out = [ClassifLineDict(), ...]
    """
    global list_classif_out
    classification = xml_dict["JDBOR"]["ClassificationList"]["Classification"]

    # Head line: the classification itself (no disorder type is set on it).
    head = ClassifLineDict()
    head.update(
        ORPHAcode=int(classification["OrphaNumber"]),
        OrphId=int(classification["@id"]),
        Label=classification["Name"]["#text"],
    )

    # The accumulator is rebound to a fresh list on every call.
    list_classif_out = [head]
    recursive_explore(classification["ClassificationNodeRootList"]["ClassificationNode"], 0)
    return list_classif_out


# Run the whole process only when the file is executed directly.
if __name__ == "__main__":
    runProcess()