In [1]:
import xml.etree.ElementTree as ET
import json
import pandas as pd
import zipfile
import os
import string
import re
from tqdm import tqdm

def strip_spaces(myString):
    """Collapse ASCII whitespace runs to one space and trim both ends.

    Only ASCII whitespace is affected (the patterns use the ``a`` inline
    flag), so characters such as a no-break space are left untouched.

    Args:
        myString: The text to normalize.

    Returns:
        The normalized string.
    """
    # Step 1: every run of ASCII whitespace becomes a single space.
    collapsed = re.sub(r"(?a:\s+)", " ", myString)
    # Step 2: drop whatever whitespace is left at the very start or end.
    return re.sub(r"(?a:^\s+|\s+$)", "", collapsed)

def unzip_file(zip_path, extract_to_folder):
    """Extract every member of a zip archive into a folder.

    The destination folder (including missing parents) is created first.

    Args:
        zip_path: Path of the archive to extract.
        extract_to_folder: Folder that receives the archive contents.

    Raises:
        FileNotFoundError: If ``zip_path`` is not an existing regular file.
    """
    archive_present = os.path.isfile(zip_path)
    if not archive_present:
        raise FileNotFoundError(f"The file {zip_path} does not exist.")

    os.makedirs(extract_to_folder, exist_ok=True)

    archive = zipfile.ZipFile(zip_path, 'r')
    with archive:
        archive.extractall(path=extract_to_folder)

def extract_active_ingredient(xml_file):
    """Return the text of every ``activeMoiety/name`` element in an XML file.

    Elements are looked up in the ``urn:hl7-org:v3`` namespace and returned in
    document order; duplicates are kept.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        A list of name strings. ``name`` elements without any text are
        skipped, so the list never contains ``None`` and callers can safely
        call string methods on every entry.
    """
    root = ET.parse(xml_file).getroot()
    ns = {'fda': 'urn:hl7-org:v3'}
    return [
        name_elem.text
        for name_elem in root.iterfind(".//fda:activeMoiety/fda:name", ns)
        # An empty <name/> has text None; dropping it avoids downstream
        # AttributeError on e.g. .upper().
        if name_elem.text is not None
    ]
        
def getcontraindications(xmlfilepath, section_code="34070-3"):
    """Return the normalized text of the first section carrying a given code.

    Every ``section`` element in the ``urn:hl7-org:v3`` namespace is visited
    in document order; the first one whose ``code`` element has a ``code``
    attribute equal to ``section_code`` is used.

    Args:
        xmlfilepath: Path or file object accepted by ``ET.parse``.
        section_code: Value of the ``code`` attribute to look for. The
            default is the code this file treats as the contraindications
            section.

    Returns:
        * The section text with whitespace normalized by ``strip_spaces``.
        * ``""`` if the matching section has no ``text`` element (a message
          is printed in that case).
        * ``None`` if no section matches.
    """
    root = ET.parse(xmlfilepath).getroot()
    ns = {'hl7': 'urn:hl7-org:v3'}
    for section in root.iterfind('.//hl7:section', namespaces=ns):
        # './/' picks the first <code> descendant in document order.
        # NOTE(review): for a section without its own <code> this may be a
        # nested subsection's code - confirm this is intended.
        code_elem = section.find('.//hl7:code', namespaces=ns)
        if code_elem is None or code_elem.get('code') != section_code:
            continue

        text_elem = section.find('.//hl7:text', namespaces=ns)
        if text_elem is None:
            # Explicit check instead of a bare ``except`` so that unrelated
            # errors (e.g. KeyboardInterrupt) are no longer swallowed.
            print('text_elem was empty')
            return ""

        # itertext() flattens nested markup (paragraphs, lists, ...) to text.
        text_content = ''.join(text_elem.itertext()).strip()
        return strip_spaces(text_content)
    return None

def get_contraindications_codes(xmlfilepath):
    """Print the ``code`` attribute of every ``code`` element in an XML file.

    Elements are looked up in the ``urn:hl7-org:v3`` namespace and printed one
    per line in document order; an element without the attribute prints
    ``None``. Nothing is returned.

    Args:
        xmlfilepath: Path or file object accepted by ``ET.parse``.
    """
    document_root = ET.parse(xmlfilepath).getroot()
    namespaces = {'hl7': 'urn:hl7-org:v3'}
    for code_elem in document_root.iterfind('.//hl7:code', namespaces=namespaces):
        print(code_elem.get('code'))

################################################################
## MAIN STARTS HERE ############################################
################################################################

#TODO: write this into __main__, take location of labels as arg
# Root folder holding the label sets.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# code outside this cell may already reference it.
dir = "/Volumes/MML/dailymed_labels/"


# Parallel lists: entry i of each list describes the same label archive.
contraindicationsList = []
ingredientsList = []
counts = 0          # archives processed
foundCounts = 0     # archives where getcontraindications returned a value
notFoundCounts = 0  # archives where it returned None

dirs = []

# TODO: automatically find, download, unzip all of the dailymed folders
labelFolders = ["prescription_1/", "prescription_2/", "prescription_3/", "prescription_4/", "prescription_5/"]

for label in labelFolders:
    dirs.append(dir+label)


for directory in dirs:
    print(f"reading directory {directory}")
    # List the directory once and reuse it for both iteration and the total.
    entries = os.listdir(directory)
    for files in tqdm(entries, total=len(entries)):
        # Only real archives; names containing "._" are skipped.
        if not files.endswith(".zip") or "._" in files:
            continue

        fpath = directory + files
        fileRoot = files.replace(".zip", "")
        dest = directory + fileRoot
        # Archives are expected to be already extracted into `dest`.
        # To extract on the fly, call unzip_file(fpath, dest) here.
        if not os.path.isdir(dest):
            # Previously this crashed in os.listdir(dest).
            print("no extracted folder for ", fpath)
            continue

        # Pick the label XML inside the extracted folder. If several files
        # match, the last one listed wins (unchanged from before).
        xmlfile = ""
        for contents in os.listdir(dest):
            if contents.endswith(".xml") and "~$" not in contents:
                xmlfile = contents.replace("._", "")
        if not xmlfile:
            # Previously this passed the bare folder path to ET.parse and
            # crashed.
            print("no xml file found in ", dest)
            continue
        xmlfilepath = dest + "/" + xmlfile.replace("~$", "")

        contraindications = getcontraindications(xmlfilepath)
        # Upper-case the names so the set de-duplicates case-insensitively;
        # entries without text (None) are dropped instead of crashing.
        active_ingredients = [
            item.upper()
            for item in extract_active_ingredient(xmlfilepath)
            if item is not None
        ]
        ingredientsList.append(set(active_ingredients))

        if contraindications is not None:
            contraindicationsList.append(contraindications)
            foundCounts += 1
        else:
            notFoundCounts += 1
            # Keep the two lists aligned with an empty placeholder.
            contraindicationsList.append("")
        counts += 1

print("finished ingesting contraindications")
data = pd.DataFrame({'active ingredient': ingredientsList, 'contraindications': contraindicationsList})
data.to_excel("fda_label_contraindications_sections.xlsx")

reading directory /Volumes/MML/dailymed_labels/prescription_1/


  9%|███▎                                  | 5149/59572 [01:02<13:32, 66.94it/s]

text_elem was empty


 10%|███▋                                  | 5759/59572 [01:10<10:34, 84.83it/s]

text_elem was empty


 14%|█████▏                                | 8228/59572 [01:42<10:41, 79.99it/s]

text_elem was empty


 18%|██████▋                              | 10860/59572 [02:14<15:47, 51.41it/s]

text_elem was empty


 20%|███████▏                            | 11943/59572 [02:30<06:40, 119.04it/s]

text_elem was empty
text_elem was empty


100%|████████████████████████████████████| 59572/59572 [03:10<00:00, 312.99it/s]


reading directory /Volumes/MML/dailymed_labels/prescription_2/


  5%|██                                   | 2336/42720 [00:35<05:25, 123.89it/s]

text_elem was empty
text_elem was empty


 14%|█████▏                                | 5814/42720 [01:26<09:07, 67.42it/s]

text_elem was empty


 15%|█████▊                                | 6518/42720 [01:36<07:56, 76.04it/s]

text_elem was empty


 19%|███████▏                              | 8096/42720 [02:00<10:48, 53.41it/s]

text_elem was empty


 25%|█████████                            | 10472/42720 [02:36<08:49, 60.88it/s]

text_elem was empty


100%|████████████████████████████████████| 42720/42720 [02:39<00:00, 268.40it/s]


reading directory /Volumes/MML/dailymed_labels/prescription_3/


  2%|▋                                      | 696/39684 [00:10<11:23, 57.01it/s]

text_elem was empty


  4%|█▎                                    | 1400/39684 [00:20<07:49, 81.63it/s]

text_elem was empty


  4%|█▍                                    | 1545/39684 [00:23<11:26, 55.53it/s]

text_elem was empty


  6%|██                                    | 2209/39684 [00:34<12:03, 51.78it/s]

text_elem was empty


  6%|██▎                                   | 2387/39684 [00:37<11:22, 54.64it/s]

text_elem was empty


  7%|██▌                                   | 2645/39684 [00:41<10:21, 59.57it/s]

text_elem was empty


  8%|███▏                                  | 3357/39684 [00:53<08:57, 67.60it/s]

text_elem was empty


 19%|███████▎                              | 7610/39684 [01:55<07:54, 67.57it/s]

text_elem was empty


 21%|████████▏                             | 8517/39684 [02:10<08:50, 58.77it/s]

text_elem was empty


 23%|████████▊                             | 9153/39684 [02:20<10:11, 49.92it/s]

text_elem was empty


100%|████████████████████████████████████| 39684/39684 [02:33<00:00, 259.03it/s]


reading directory /Volumes/MML/dailymed_labels/prescription_4/


  2%|▌                                      | 570/37760 [00:08<08:54, 69.58it/s]

text_elem was empty


  5%|██                                    | 2070/37760 [00:31<08:26, 70.45it/s]

text_elem was empty


 14%|█████▎                                | 5268/37760 [01:26<07:48, 69.38it/s]

text_elem was empty


100%|████████████████████████████████████| 37760/37760 [02:41<00:00, 233.61it/s]


reading directory /Volumes/MML/dailymed_labels/prescription_5/


  7%|██▊                                   | 1615/21776 [00:29<04:56, 68.05it/s]

text_elem was empty


  8%|███▏                                  | 1819/21776 [00:33<04:34, 72.64it/s]

text_elem was empty


 10%|███▋                                  | 2098/21776 [00:38<06:02, 54.31it/s]

text_elem was empty


 14%|█████▎                                | 3037/21776 [00:55<05:30, 56.70it/s]

text_elem was empty


 15%|█████▊                                | 3307/21776 [01:00<05:01, 61.17it/s]

text_elem was empty


100%|████████████████████████████████████| 21776/21776 [01:39<00:00, 219.23it/s]


finished ingesting contraindications
