# In [None]:
import prototype_2.metadata as meta_data
import re
import pandas as pd
from foundry.transforms import Dataset

# Element paths in the metadata may contain parent references, e.g. a "templateId" segment followed by "..".
# handle_parent_paths() resolves them: it splits each path on "/" and, for every ".." segment,
# pops the most recently kept segment, i.e. it goes back up one level per "..".

def handle_parent_paths(path):
    """Resolve ``..`` segments in a ``/``-separated path.

    The path is split on ``/``. Every segment other than ``..`` is kept;
    each ``..`` discards the most recently kept segment. A ``..`` with
    nothing left to discard is simply dropped.

    Returns the remaining segments joined back together with ``/``.
    """
    kept = []

    for segment in path.split('/'):
        if segment != "..":
            kept.append(segment)
            continue
        # A parent reference: step back one level when there is one.
        if kept:
            kept.pop()

    return "/".join(kept)


def clean_template_id(path):
    """Strip every ``/templateId`` segment out of ``path``.

    Only a whole segment is removed: ``/templateId`` must be followed by
    another ``/`` or by the end of the string, so longer names that merely
    start with ``templateId`` are left untouched.
    """
    template_segment = re.compile(r'/templateId(?=/|$)')
    return template_segment.sub('', path)


def clean_path(input_path):
    """Reduce a raw element path to a plain path without predicates.

    Steps, applied in this order:
      1. remove bracketed predicates that look like ``[@attr=value]``
         or ``[not(...)]``;
      2. remove every ``hl7:`` namespace prefix;
      3. resolve ``..`` segments with ``handle_parent_paths``;
      4. remove ``/templateId`` segments with ``clean_template_id``.
    """
    # Conditional expressions (predicates) in square brackets.
    predicate_free = re.sub(
        r'\[\s*(?:@[^=]*=["\']?.*?["\']?|not\s*\(.*?\))\s*\]', '', input_path)
    # Namespace specifiers.
    namespace_free = predicate_free.replace('hl7:', '')
    # Parent references are resolved before templateId segments are dropped.
    return clean_template_id(handle_parent_paths(namespace_free))
    

def get_paths(meta_dict, dedupe=True):
    """Build the cleaned XML path of every mapped field in ``meta_dict``.

    For each domain, the ``element`` of its ``root`` entry (with a trailing
    ``/`` added when missing) is joined with the ``element`` of every other
    field, ``/@<attribute>`` is appended when the field has an ``attribute``,
    and the result is normalised with ``clean_path``.

    Args:
        meta_dict: Mapping of domain key -> mapping of field key -> field
            config. A domain with no ``root`` entry, or whose root has no
            ``element``, is reported on stdout and skipped. Fields without
            an ``element`` are skipped silently.
        dedupe: When true (the default) the unique paths are returned in
            ascending sorted order. When false every path is returned in
            iteration order, duplicates included.

    Returns:
        A list of cleaned path strings.
    """
    paths = []

    for domain_key, domain_dict in meta_dict.items():
        if 'root' not in domain_dict:
            print(f"NO ROOT for domain? {domain_key}")
            continue
        root = domain_dict['root']
        if 'element' not in root:
            # The root entry exists but carries no element path to build on.
            print(f"NO ELEMENT in root for {domain_key}")
            continue

        root_path = root["element"]
        if not root_path.endswith("/"):
            root_path += "/"

        for field_key, field_dict in domain_dict.items():
            if field_key == "root" or 'element' not in field_dict:
                continue

            new_path = f"{root_path}{field_dict['element']}"
            if 'attribute' in field_dict:
                new_path += f"/@{field_dict['attribute']}"

            paths.append(clean_path(new_path))

    # Deduplicate and order once at the end instead of branching per path.
    return sorted(set(paths)) if dedupe else paths

def main():
    """Collect all mapped XML paths and write them to "mapped_xml_paths".

    The paths come from ``get_paths`` applied to the metadata dictionary.
    They are placed in a single-column DataFrame (the column is named
    ``columns``), which is printed and then written to the dataset.
    """
    meta_dict = meta_data.get_meta_dict()
    paths = get_paths(meta_dict)

    # Build the frame in one step. Concatenating one row per path copies the
    # whole frame on every iteration (quadratic), and concatenating onto an
    # empty frame raises a FutureWarning in newer pandas versions.
    # The paths are reversed so the rows keep the reverse-sorted order that
    # prepending each row to the frame used to produce.
    mapped_xml_paths_df = pd.DataFrame({'columns': paths[::-1]})

    print(mapped_xml_paths_df)

    # Export to the "mapped_xml_paths" dataset
    mapped_xml_paths = Dataset.get("mapped_xml_paths")
    mapped_xml_paths.write_table(mapped_xml_paths_df)

# Run the export only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()
