In [None]:
import xml.etree.ElementTree as ET
import json
import pandas as pd
import zipfile
import os
import string
import re

def strip_spaces(myString):
    """Normalise ASCII whitespace in ``myString``.

    Every run of ASCII whitespace (space, tab, newline, carriage return,
    form feed, vertical tab) is collapsed to a single space, then any
    whitespace left at the start or end of the result is removed.
    Non-ASCII whitespace is left untouched because both patterns use the
    inline ``(?a:...)`` ASCII flag.
    """
    collapsed = re.sub(r"(?a:\s+)", " ", myString)
    return re.sub(r"(?a:^\s+|\s+$)", "", collapsed)

def unzip_file(zip_path, extract_to_folder):
    """Extract every member of a zip archive into a folder.

    The destination folder (and any missing parents) is created first.

    Raises:
        FileNotFoundError: if ``zip_path`` is not an existing regular file.
    """
    archive_present = os.path.isfile(zip_path)
    if archive_present:
        os.makedirs(extract_to_folder, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(extract_to_folder)
        return
    raise FileNotFoundError(f"The file {zip_path} does not exist.")

def extract_active_ingredient(xml_file):
    """Return the active-moiety names listed in an XML label.

    Collects the text of every ``name`` element that is a direct child of
    an ``activeMoiety`` element (HL7 v3 namespace), in document order.
    Duplicates are kept.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        A list of strings. ``name`` elements with no text are skipped so
        callers can safely apply string methods to every item.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    ns = {'fda': 'urn:hl7-org:v3'}
    return [
        ingredient.text
        for ingredient in root.findall(".//fda:activeMoiety/fda:name", ns)
        # An empty <name/> has text None; returning it would break callers
        # that call .upper() on each entry.
        if ingredient.text is not None
    ]
        
def getcontraindications(xmlfilepath):
    """Return the text of the label section coded ``34070-3``.

    Parses the XML and walks every ``section`` element (HL7 v3 namespace)
    in document order. The first section whose first descendant ``code``
    element has ``code="34070-3"`` (the code this function treats as the
    contraindications section) is used.

    Args:
        xmlfilepath: Path or file object accepted by ``ET.parse``.

    Returns:
        The whitespace-normalised text of the matching section's first
        descendant ``text`` element; ``""`` (after printing a notice) if
        that section has no ``text`` element; ``None`` if no section
        matches.
    """
    tree = ET.parse(xmlfilepath)
    root = tree.getroot()
    ns = {'hl7': 'urn:hl7-org:v3'}
    for section in root.findall('.//hl7:section', namespaces=ns):
        codeSection = section.find('.//hl7:code', namespaces=ns)
        if codeSection is None or codeSection.get('code') != "34070-3":
            continue
        text_elem = section.find('.//hl7:text', namespaces=ns)
        # Explicit check instead of a bare ``except:`` so that unrelated
        # errors (and KeyboardInterrupt) are no longer swallowed.
        if text_elem is None:
            print('text_elem was empty')
            return ""
        text_content = ''.join(text_elem.itertext()).strip()
        return strip_spaces(text_content)
    return None

def getPediatricConsiderations(xmlfilepath):
    """Return the text of the label's pediatric use section.

    Parses the XML and walks every ``section`` element (HL7 v3 namespace)
    in document order. The first section whose first descendant ``code``
    element has ``code="34081-0"`` is used. ``34081-0`` is the LOINC code
    for the "Pediatric use" label section; the previous value ``34070-3``
    was copied from the contraindications lookup and returned the wrong
    section.

    Args:
        xmlfilepath: Path or file object accepted by ``ET.parse``.

    Returns:
        The whitespace-normalised text of the matching section's first
        descendant ``text`` element; ``""`` if that section has no
        ``text`` element; ``None`` if no section matches.
    """
    tree = ET.parse(xmlfilepath)
    root = tree.getroot()
    ns = {'hl7': 'urn:hl7-org:v3'}
    for section in root.findall('.//hl7:section', namespaces=ns):
        codeSection = section.find('.//hl7:code', namespaces=ns)
        if codeSection is None or codeSection.get('code') != "34081-0":
            continue
        text_elem = section.find('.//hl7:text', namespaces=ns)
        # A coded section without a <text> child used to raise
        # AttributeError; report it as empty text instead.
        if text_elem is None:
            return ""
        text_content = ''.join(text_elem.itertext()).strip()
        return strip_spaces(text_content)
    return None

def get_contraindications_codes(xmlfilepath):
    """Print the ``code`` attribute of every ``code`` element in a label.

    Debugging aid: announces the file being inspected, then prints one
    line per ``code`` element (HL7 v3 namespace) found below the root, in
    document order. Elements without a ``code`` attribute print ``None``.
    Returns nothing.
    """
    print("Finding contraindications for ", xmlfilepath)
    hl7_ns = {'hl7': 'urn:hl7-org:v3'}
    document_root = ET.parse(xmlfilepath).getroot()
    for code_elem in document_root.iterfind('.//hl7:code', namespaces=hl7_ns):
        code_value = code_elem.get('code')
        print(code_value)

################################################################
## MAIN STARTS HERE ############################################
################################################################

#TODO: write this into __main__, take location of labels as arg
# Root folder holding the DailyMed label sets. It must end with a slash
# because the paths below are built by plain string concatenation.
# NOTE: this name shadows the builtin dir(); it is kept unchanged in case
# other cells reference it.
dir = "/Volumes/MML/dailymed_labels/"


# Parallel lists: entry i of each list describes the same label file.
contraindicationsList = []
ingredientsList = []
counts = 0
foundCounts = 0
notFoundCounts = 0

# TODO: automatically find, download, unzip all of the dailymed folders
labelFolders = ["prescription_1/", "prescription_2/", "prescription_3/", "prescription_4/", "prescription_5/"]

dirs = [dir + label for label in labelFolders]

for directory in dirs:
    for files in os.listdir(directory):
        # Only real archives; names containing "._" are skipped.
        if not files.endswith(".zip") or "._" in files:
            continue
        fpath = directory + files
        fileRoot = files.replace(".zip","")
        dest = directory + fileRoot
        # Extraction is currently disabled: each archive is expected to be
        # already unpacked into a sibling folder named after the zip.
        #try:
            #unzip_file(fpath,dest)
        #except:
            #print("failed to unzip file ", fpath)
            #continue
        if not os.path.isdir(dest):
            # Without this guard os.listdir(dest) raises and aborts the run.
            print("no extracted folder found for ", fpath, ", skipping")
            continue
        xmlfile=""
        for contents in os.listdir(dest):
            if contents.endswith(".xml") and "~$" not in contents:
                xmlfile=contents.replace("._","")
        if not xmlfile:
            # Otherwise ET.parse would be handed the directory path itself.
            print("no xml label found in ", dest, ", skipping")
            continue
        xmlfilepath = dest+"/"+xmlfile.replace("~$","")
        print(xmlfilepath)
        contraindications = getcontraindications(xmlfilepath)
        # Upper-case for case-insensitive grouping; entries without text
        # (None) are dropped because they have no .upper().
        active_ingredients = [
            item.upper()
            for item in extract_active_ingredient(xmlfilepath)
            if item is not None
        ]
        ingredientsList.append(set(active_ingredients))
        if contraindications is not None:
            contraindicationsList.append(contraindications)
            foundCounts += 1
            print(foundCounts, " contraindications successfully found so far")
        else:
            notFoundCounts += 1
            print(notFoundCounts, " contraindications not found so far, failed to find for ", files)
            # Keep the two lists aligned for the DataFrame below.
            contraindicationsList.append("")
        counts +=1

print("finished ingesting contraindications")
data = pd.DataFrame({'active ingredient':ingredientsList, 'contraindications':contraindicationsList})
data.to_excel("contraindicationList.xlsx")

/Volumes/MML/dailymed_labels/prescription_1/20060131_ABD6ECF0-DC8E-41DE-89F2-1E36ED9D6535/ABD6ECF0-DC8E-41DE-89F2-1E36ED9D6535.xml
1  contraindications successfully found so far
/Volumes/MML/dailymed_labels/prescription_1/20060131_dffb4544-0e47-40cd-9baa-d622075838cc/dffb4544-0e47-40cd-9baa-d622075838cc.xml
2  contraindications successfully found so far
/Volumes/MML/dailymed_labels/prescription_1/20060412_AAE8B7A4-742A-4BEC-A283-31B9408EE1AA/AAE8B7A4-742A-4BEC-A283-31B9408EE1AA.xml
3  contraindications successfully found so far
/Volumes/MML/dailymed_labels/prescription_1/20060525_0AAAA03E-B258-45EC-89C8-06E1353F57EF/0AAAA03E-B258-45EC-89C8-06E1353F57EF.xml
4  contraindications successfully found so far
/Volumes/MML/dailymed_labels/prescription_1/20060621_9C4F12E5-5E69-44D4-81A5-BD72C375EEF5/9C4F12E5-5E69-44D4-81A5-BD72C375EEF5.xml
5  contraindications successfully found so far
/Volumes/MML/dailymed_labels/prescription_1/20060629_09BAFC2D-1893-4618-86DC-E9403407CD41/09BAFC2D-1893-4618-8