In [4]:
import pandas as pd
import xml.etree.ElementTree as ET
import os

# Folder containing the TEI XML files to aggregate
folder_path = r'C:\Users\dsmiley\Documents\GitHub\iip-texts\archival-files'

# Prefix -> URI map passed to every ElementTree find/findall call below
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

# ElementTree exposes the xml:id attribute under this Clark-notation name
XML_ID = '{http://www.w3.org/XML/1998/namespace}id'


def _element_text(parent, path, default='N/A'):
    """Return the stripped ``.text`` of the first element matching ``path``.

    ``default`` is returned when nothing under ``parent`` matches, or when the
    matched element has no direct text.
    """
    element = parent.find(path, namespaces)
    if element is None or element.text is None:
        return default
    return element.text.strip()


def _div_text(body, div_type, default='N/A'):
    """Return all nested text of the first ``<div type=div_type>`` under ``body``.

    ``default`` is returned when ``body`` is None or no such div exists.
    """
    if body is None:
        return default
    div = body.find(f'.//tei:div[@type="{div_type}"]', namespaces)
    if div is None:
        return default
    # itertext() walks child elements too, so inline markup does not drop text
    return ''.join(div.itertext()).strip()


def _extract_record(root):
    """Build one DataFrame row (a dict) from the root element of a parsed file."""
    # One column per taxonomy xml:id, holding a category xml:id.
    # Each assignment overwrites the previous one, so only the last category
    # found under a taxonomy is kept.
    # NOTE(review): confirm that a single category per taxonomy is expected.
    class_decl = {}
    for taxonomy in root.findall('.//tei:classDecl/tei:taxonomy', namespaces):
        taxonomy_id = taxonomy.attrib.get(XML_ID)
        for category in taxonomy.findall('.//tei:category', namespaces):
            class_decl[taxonomy_id] = category.attrib.get(XML_ID)

    # Defaults used when the file has no <origin> element at all
    origin_date = period = place_name = region_name = geog_name = 'N/A'

    origin_element = root.find('.//tei:origin', namespaces)
    if origin_element is not None:
        date_element = origin_element.find('.//tei:date', namespaces)
        if date_element is not None:
            not_before = date_element.attrib.get('notBefore', 'Unknown')
            not_after = date_element.attrib.get('notAfter', 'Unknown')
            period = date_element.attrib.get('period', 'N/A')
            origin_date = f"{not_before} - {not_after}"
        else:
            # <origin> exists but carries no <date>
            origin_date = 'Unknown'

        place_name = _element_text(origin_element, './/tei:placeName/tei:settlement')
        region_name = _element_text(origin_element, './/tei:placeName/tei:region')
        geog_name = _element_text(origin_element, './/tei:placeName/tei:geogName')

    body_element = root.find('.//tei:body', namespaces)

    # Key order here determines the DataFrame column order
    return {
        'ID': _element_text(root, './/tei:publicationStmt/tei:idno'),
        'Origin Date': origin_date,
        'Period': period,
        'Place Name': place_name,
        'Region Name': region_name,
        'Geographic Name': geog_name,
        **class_decl,
        'Text': _div_text(body_element, 'edition'),
        'Translation': _div_text(body_element, 'translation'),
    }


# One dict per successfully parsed file
all_data = []
# Names of files that could not be parsed, kept for later inspection
skipped_files = []

# sorted(): os.listdir returns entries in arbitrary order, so sort to keep the
# row order reproducible between runs and platforms
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.xml'):  # Only process XML files
        continue
    file_path = os.path.join(folder_path, filename)

    try:
        root = ET.parse(file_path).getroot()
    except ET.ParseError as exc:
        # A single malformed file should not discard the whole batch
        skipped_files.append(filename)
        print(f"Skipping {filename}: {exc}")
        continue

    all_data.append(_extract_record(root))

# Create the DataFrame from the aggregated data
df = pd.DataFrame(all_data)

# Optionally, save the DataFrame to a CSV file
# df.to_csv('xml_data_aggregated.csv', index=False)

# Display the first few rows to check the structure
df.head()

Unnamed: 0,ID,Origin Date,Period,Place Name,Region Name,Geographic Name,IIP-genre,IIP-religion,Text,Translation,IIP-form,IIP-materials,IIP-preservation,IIP-writing
0,abil0001,0070 - 0500,Talmudic,Abilene,Golan,synagogue,dedicatory,jewish,דכיר לט\n \n בר\n ...,Remember for good Baruch [?] of Alexandria [?]...,,,,
1,abur0001,0300 - 0700,Talmudic,Bethennim,Judaea,Church complex,invocation,christian,ΚΕ ΙΥ ΧΕ ΜΝΗΣΘΗΤΙΤΟΥ ΔΟΥΛΟΥ ΣΟΥ ΑΛΑ \n ...,Lord Jesus Christ [remember your servant ...] ...,mosaic,,,
2,ahma0001,0200 - 0450,Talmudic,Ahmadiyye,Golan,,text_other,jewish,לא \n תמוש משמר\n ...,[You shall not] stray from observing [his laws...,architrave,,,
3,ahma0002,0250 - 0399,Talmudic,Ahmadiyye,Golan,,text_unknown,jewish,ΣΙΜΩΝ\n ΙΟΥΣΤΙΝΟΣ,Simon Justinus,lintel,stone,fragment.single,
4,ahma0003,0250 - 0399,http://n2t.net/ark:/99152/p0m63njtmv8,Ahmadiyye,Golan,,place_marker.boundary,jewish,ΛΙΘΟΝΔΙΟ\n ΡΙΖΟΝΤΑΟΡ\n ...,...a stone demarking (the) boundaries(?) of....,boundary_marker,,,


In [5]:
# Remove every newline character from string cells across the whole frame
df = df.replace(to_replace='\n', value='', regex=True)

In [None]:
# Frequency of each distinct Period value (use .unique() for just the labels).
# Series has no `count_values` method; `value_counts` is the pandas API.
df["Period"].value_counts()