# In [None]:
import prototype_2.metadata as meta_data
import re
import pandas as pd
from foundry.transforms import Dataset

def get_paths(meta_dict):
    """Collect root-qualified paths for the FIELD and HASH entries of *meta_dict*.

    *meta_dict* maps a domain key to a dict of entries. Each domain must
    hold a ``"root"`` entry with an ``"element"`` string (a ``KeyError`` is
    raised otherwise); that string, with a trailing ``/`` appended when
    missing, prefixes every path built for the domain.

    Only entries whose ``"config_type"`` is ``"FIELD"`` or ``"HASH"`` and
    which carry an ``"element"`` are used:

    * element and attribute -> ``<root>/<element>@<attribute>``
    * element only          -> ``<root>/<element>`` (also reported on
      stdout, since such entries are unexpected)

    Returns the paths as a list, in metadata iteration order.
    """
    wanted_types = ("FIELD", "HASH")
    collected = []

    for domain_dict in meta_dict.values():
        prefix = domain_dict["root"]["element"]
        prefix = prefix if prefix.endswith("/") else prefix + "/"

        for name, spec in domain_dict.items():
            if name == "root":
                continue
            # Anything other than FIELD/HASH is not a path into the document.
            if spec.get("config_type", None) not in wanted_types:
                continue
            if "element" not in spec:
                continue

            element = spec["element"]
            if "attribute" in spec:
                collected.append(f"{prefix}{element}@{spec['attribute']}")
            else:
                # Element without an attribute: keep it, but flag it.
                bare_path = f"{prefix}{element}"
                print(f"CURIOUS path {bare_path}")
                collected.append(bare_path)

    return collected

def _strip_xpath_noise(path):
    """Return *path* without ``[...]`` predicates and ``hl7:`` prefixes."""
    # Remove bracket groups innermost-first and repeat until none are left.
    # A single greedy r'\[.*\]' would also swallow every path segment lying
    # between the first '[' and the last ']' when a path holds more than
    # one predicate; this form handles both sibling and nested predicates.
    removed = 1
    while removed:
        path, removed = re.subn(r'\[[^\[\]]*\]', '', path)
    return path.replace('hl7:', '')


def get_paths_CR(meta_dict, dedupe=False):
    """Build cleaned ``<root>/<element>/@<attribute>`` paths from *meta_dict*.

    A defensive variant of ``get_paths``: domains lacking a ``'root'`` entry,
    or whose root lacks an ``'element'``, are reported on stdout and skipped
    instead of raising. The entry's ``config_type`` is not consulted.

    For every non-root entry that has both ``'element'`` and ``'attribute'``
    the path ``<root>/<element>/@<attribute>`` is built. The attribute
    spellings ``codecode`` and ``codecodesystem`` are translated to ``code``
    and ``codeSystem``. Entries with an element but no attribute produce no
    path.

    Each path is then cleaned: XPath predicates (``[...]``) and ``hl7:``
    namespace prefixes are removed so the result is easier to read.

    NOTE: every attribute is included (``#text`` as well), not only code and
    codeSystem. Those will not have concept mappings and need more thought
    for the SME review table. TODO

    Args:
        meta_dict: mapping of domain key -> mapping of field key -> config dict.
        dedupe: when true, return the unique paths in sorted order.

    Returns:
        A list of cleaned path strings; in metadata order when *dedupe* is
        false, sorted and unique when it is true.
    """
    path_list = set() if dedupe else []
    collect = path_list.add if dedupe else path_list.append
    # Metadata spellings that differ from the attribute name used in the path.
    attribute_fixes = {"codecode": "code", "codecodesystem": "codeSystem"}

    for domain_key, domain_dict in meta_dict.items():
        if 'root' not in domain_dict:
            print(f"NO ROOT for domain? {domain_key}")
            continue
        root = domain_dict['root']
        if 'element' not in root:
            print(f"NO ROOT in root for {domain_key}")
            continue
        root_path = root["element"]
        if not root_path.endswith("/"):
            root_path += "/"

        for field_key, field_dict in domain_dict.items():
            if field_key == "root" or 'element' not in field_dict:
                continue
            # Only attribute-bearing entries yield a path.
            if 'attribute' not in field_dict:
                continue

            attribute = field_dict['attribute']
            attribute = attribute_fixes.get(attribute, attribute)
            new_path = f"{root_path}{field_dict['element']}/@{attribute}"
            collect(_strip_xpath_noise(new_path))

    return sorted(path_list) if dedupe else path_list

def main():
    """Export the mapped XML paths as a one-column Foundry dataset.

    Reads the metadata via ``meta_data.get_meta_dict()``, derives the cleaned
    paths with ``get_paths_CR`` (no de-duplication), prints the resulting
    table and writes it to the ``mapped_xml_paths`` dataset.
    """
    meta_dict = meta_data.get_meta_dict()
    paths = get_paths_CR(meta_dict)

    # Build the frame in a single step rather than concatenating one row at
    # a time, which copies the whole frame on every iteration. The paths are
    # reversed so that row order stays what the row-by-row prepend produced:
    # the last path returned by get_paths_CR is row 0.
    mapped_xml_paths_df = pd.DataFrame(list(reversed(paths)), columns=['columns'])
    print(mapped_xml_paths_df)

    # export to HDFS
    mapped_xml_paths = Dataset.get("mapped_xml_paths")
    mapped_xml_paths.write_table(mapped_xml_paths_df)

# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()