In [7]:
# Parse the KEGG ATC classification dump (br08303) into flat records
# and load them into the Silver Lakehouse as a Delta table.
import re
import json
from datetime import datetime

# Code patterns for the intermediate hierarchy levels. They are applied with
# ``match`` (anchored at the start of the line content) because the name is
# taken from whatever follows the code.
_ATC_GROUP_PATTERNS = {
    'B': re.compile(r'([A-Z]\d{2})\s'),          # e.g. A01
    'C': re.compile(r'([A-Z]\d{2}[A-Z])\s'),     # e.g. A01A
    'D': re.compile(r'([A-Z]\d{2}[A-Z]{2})\s'),  # e.g. A01AA
}
# Level whose most recent code becomes the 'father' of an entry at the key level.
_ATC_PARENT_LEVEL = {'C': 'B', 'D': 'C', 'E': 'D'}
# Levels that become stale once a new header at the key level is read.
_ATC_LEVELS_BELOW = {'B': ('C', 'D'), 'C': ('D',), 'D': ()}
_ATC_SUBSTANCE_PATTERN = re.compile(r'[A-Z]\d{2}[A-Z]{2}\d{2}')  # e.g. A01AA01
_DG_TAG_PATTERN = re.compile(r'\s*\[DG:(\w+)\]')
_KEGG_ID_PATTERN = re.compile(r'[A-Z]\d+')                        # e.g. D00943


def parse_atc_file_function(file_path):
    """
    Parse a KEGG ATC hierarchy text file into a flat list of records.

    Each relevant line starts with a depth letter (``A``-``F``) followed by
    the entry text. Empty lines and lines starting with ``!`` or ``+F`` are
    skipped, as is any line whose first character is not a depth letter.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        list[dict]: One dict per parsed A-E line, in file order. Every dict
        has ``depth``, ``atc`` and ``name``; B-E entries also have ``father``
        (the code of the enclosing level, or ``None`` when no valid parent
        line was seen). E entries may additionally carry ``DG`` (the id from
        a ``[DG:...]`` tag) and ``KEGG`` (a list of ``{'kegg_id', 'name'}``
        dicts built from the F lines that follow the entry).

    Raises:
        OSError: If the file cannot be opened.
    """
    result = []

    # Most recent code seen at each intermediate level.
    parent_codes = {'B': None, 'C': None, 'D': None}

    # Last successfully parsed E entry; following F lines are attached to it.
    current_e_data = None

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()

            # Skip empty lines and header/terminator lines
            if not line or line.startswith(('!', '+F')):
                continue

            depth = line[0]
            if depth not in 'ABCDEF':
                continue
            content = line[1:].strip()

            if depth == 'A':
                # New top-level section: codes from the previous section must
                # not leak into this one as parents.
                for level in parent_codes:
                    parent_codes[level] = None

                # NOTE(review): unlike the deeper levels, 'name' keeps the
                # leading section letter here; preserved as-is for existing
                # consumers of this output.
                result.append({
                    'depth': 'A',
                    'atc': content[:2].strip(),
                    'name': content
                })

            elif depth in _ATC_GROUP_PATTERNS:
                match = _ATC_GROUP_PATTERNS[depth].match(content)
                code = match.group(1) if match else None

                # Register the new code (or clear it if the line is malformed)
                # and invalidate everything below it, so children never point
                # at a stale parent.
                parent_codes[depth] = code
                for deeper in _ATC_LEVELS_BELOW[depth]:
                    parent_codes[deeper] = None

                if match:
                    if depth == 'B':
                        father = code[0]  # Just the first letter (A, B, etc.)
                    else:
                        father = parent_codes[_ATC_PARENT_LEVEL[depth]]

                    result.append({
                        'depth': depth,
                        'atc': code,
                        'name': content[match.end():].strip(),
                        'father': father
                    })

            elif depth == 'E':
                match = _ATC_SUBSTANCE_PATTERN.match(content)
                if not match:
                    # Unparseable entry: make sure its F lines are not
                    # attached to the previous entry.
                    current_e_data = None
                    continue

                name = content[match.end():].strip()

                # Pull the optional [DG:...] tag out of the name.
                dg_match = _DG_TAG_PATTERN.search(name)
                if dg_match:
                    name = _DG_TAG_PATTERN.sub('', name)

                current_e_data = {
                    'depth': 'E',
                    'atc': match.group(0),
                    'name': name,
                    'father': parent_codes[_ATC_PARENT_LEVEL['E']]
                }
                if dg_match:
                    current_e_data['DG'] = dg_match.group(1)

                # Append immediately so the output keeps file order; the F
                # lines that follow mutate this same dict.
                result.append(current_e_data)

            elif depth == 'F' and current_e_data is not None:
                kegg_match = _KEGG_ID_PATTERN.match(content)
                if kegg_match:
                    # Everything after the KEGG id is the name
                    name = content[kegg_match.end():].strip()

                    # Replace HTML entities
                    name = name.replace('&lt;', '<').replace('&gt;', '>')

                    # 'KEGG' is only created once there is at least one entry.
                    current_e_data.setdefault('KEGG', []).append({
                        'kegg_id': kegg_match.group(0),
                        'name': name
                    })

    return result

StatementMeta(, cec34c0c-3697-48ea-b428-aef53aec7004, 9, Finished, Available, Finished)

In [8]:


# Parse the KEGG ATC dump stored under the default lakehouse's Files area.
result = parse_atc_file_function(
    file_path='/lakehouse/default/Files/br08303_2025-05-15.txt',
)

StatementMeta(, cec34c0c-3697-48ea-b428-aef53aec7004, 10, Finished, Available, Finished)

In [9]:
# Build a Spark DataFrame from the parsed records; no explicit schema is
# passed, so Spark infers the columns from the dicts.
df = spark.createDataFrame(data=result)

# Preview the first few rows (last expression, so the notebook displays it).
df.head(n=5)


StatementMeta(, cec34c0c-3697-48ea-b428-aef53aec7004, 11, Finished, Available, Finished)

[Row(atc='A', depth='A', name='A ALIMENTARY TRACT AND METABOLISM', father=None, KEGG=None, DG=None),
 Row(atc='A01', depth='B', name='STOMATOLOGICAL PREPARATIONS', father='A', KEGG=None, DG=None),
 Row(atc='A01A', depth='C', name='STOMATOLOGICAL PREPARATIONS', father='A01', KEGG=None, DG=None),
 Row(atc='A01AA', depth='D', name='Caries prophylactic agents', father='A01A', KEGG=None, DG=None),
 Row(atc='A01AA01', depth='E', name='Sodium fluoride', father='A01AA', KEGG=[{'name': 'Sodium fluoride (JAN/USP) <JP>', 'kegg_id': 'D00943'}], DG=None)]

In [10]:
# Target path for the optional file-based export further down; applies when
# the Silver Lakehouse is attached in the UI and named 'SilverLakeHouse':
# output_path_silver = "abfss://drug_atc@onelake.dfs.fabric.microsoft.com/SilverLakeHouse.Lakehouse/Files/br08303_2025-05-15.json"

# Preferred: persist the data as a Delta *table* in the Silver Lakehouse,
# replacing any previous contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("SilverLakeHouse.atc_codes")
)

# File-based alternative, if a single JSON file is required instead:
# df.coalesce(1).write.format("json").mode("overwrite").save(output_path_silver)

print("Data successfully processed and written to Silver Lakehouse.")

StatementMeta(, cec34c0c-3697-48ea-b428-aef53aec7004, 12, Finished, Available, Finished)

Data successfully processed and written to Silver Lakehouse.
