# In [None]:
import prototype_2.metadata as meta_data
import re
import pandas as pd
from foundry.transforms import Dataset

"""
    Scrapes XML paths from the prototype_2/metadata Python files.
    Builds them in the style used by vocab_snooper.py and its output,
    vocab_discovered_codes_expanded.

    Exports the result to the mapped_xml_paths dataset in HDFS.

"""

# Matches one innermost XPath predicate: a bracket pair with no brackets inside.
# A greedy r'\[.*\]' would instead swallow everything between the first '[' and
# the last ']' of the path, dropping the path segments that sit between two
# predicates.
_PREDICATE_RE = re.compile(r'\[[^\[\]]*\]')

# Namespace prefix removed from every emitted path.
_HL7_PREFIX = 'hl7:'

# Attribute names in the metadata that need rewriting to the real XML attribute.
_ATTRIBUTE_ALIASES = {
    "codecode": "code",
    "codecodesystem": "codeSystem",
}


def _strip_predicates(path):
    """Remove every [...] predicate from path, one nesting level per pass."""
    stripped, removed = _PREDICATE_RE.subn('', path)
    while removed:
        stripped, removed = _PREDICATE_RE.subn('', stripped)
    return stripped


def get_paths(meta_dict, dedupe=False):
    """Build cleaned element/attribute paths from a metadata dictionary.

    For every domain in ``meta_dict`` the domain's ``root`` element is joined
    with each field's ``element`` and ``@attribute``. The ``[...]`` predicates
    and the ``hl7:`` prefix are then removed from the combined path.

    Args:
        meta_dict: Mapping of domain key -> mapping of field key -> field
            config dict. Each domain needs a ``'root'`` entry holding an
            ``'element'`` path; domains without one are reported with
            ``print`` and skipped.
        dedupe: When true, return the unique paths in sorted order. When
            false, return every path in the order it was encountered,
            duplicates included.

    Returns:
        A list of cleaned path strings. Only fields that define both
        ``'element'`` and ``'attribute'`` contribute a path.
    """
    paths = []
    for domain_key, domain_dict in meta_dict.items():
        if 'root' not in domain_dict:
            print(f"NO ROOT for domain? {domain_key}")
            continue
        root = domain_dict['root']
        if 'element' not in root:
            print(f"NO ELEMENT in root for {domain_key}")
            continue

        # Normalise to exactly one separator between root and field element.
        root_path = root['element']
        if not root_path.endswith('/'):
            root_path += '/'

        for field_key, field_dict in domain_dict.items():
            if field_key == 'root':
                continue
            # Fields lacking either key produce no path.
            if 'element' not in field_dict or 'attribute' not in field_dict:
                continue

            attribute = field_dict['attribute']
            attribute = _ATTRIBUTE_ALIASES.get(attribute, attribute)
            raw_path = f"{root_path}{field_dict['element']}/@{attribute}"

            cleaned = _strip_predicates(raw_path).replace(_HL7_PREFIX, '')
            paths.append(cleaned)

    return sorted(set(paths)) if dedupe else paths

def main():
    """Collect the metadata paths and write them to the mapped_xml_paths dataset.

    Reads the metadata dictionary from ``meta_data.get_meta_dict()``, turns it
    into paths with ``get_paths`` (duplicates kept), prints the resulting
    single-column DataFrame and writes it to the ``mapped_xml_paths`` dataset.
    """
    meta_dict = meta_data.get_meta_dict()
    paths = get_paths(meta_dict)

    # Build the frame in one construction rather than concatenating one row
    # per path, which copies the whole frame on every iteration.
    # Rows are stored last-discovered-first, and the column is kept as object
    # dtype even when there are no paths.
    rows = list(reversed(paths))
    mapped_xml_paths_df = pd.DataFrame({'columns': rows}, dtype=object)
    print(mapped_xml_paths_df)

    # export to HDFS
    mapped_xml_paths = Dataset.get("mapped_xml_paths")
    mapped_xml_paths.write_table(mapped_xml_paths_df)

# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()