# Convert ILCD HTML files to CSV

In [61]:
import os
from bs4 import BeautifulSoup
import pandas as pd

def _span_value(cell, attribute):
    """Return a cell's ``rowspan``/``colspan`` as an int, falling back to 1."""
    try:
        value = int(cell.get(attribute, "1"))
    except (TypeError, ValueError):
        # Missing, empty or non-numeric attribute: treat as a normal cell.
        return 1
    # A span below 1 would stall (or rewind) the column counter.
    return max(1, value)


def parse_table_with_rowspan(table):
    """
    Parse an HTML table into a grid (list of lists of str), expanding spans.

    Every ``<td>``/``<th>`` text is whitespace-normalized. A cell with a
    ``colspan`` is repeated across the columns it covers, and a cell with a
    ``rowspan`` is repeated in the same column(s) of the following rows, so
    that values stay aligned with their column in every row.

    Parameters:
        table: a table element exposing ``find_all`` (BeautifulSoup ``Tag``).

    Returns:
        A list with one list of cell texts per ``<tr>`` found in ``table``.
    """
    grid = []
    spanning = {}  # key: (row_index, col_index) -> text carried over from a rowspan
    for i, row in enumerate(table.find_all("tr")):
        cells = []
        col = 0
        # Process each cell (td or th)
        for cell in row.find_all(["td", "th"]):
            # Fill in any spanning cells that belong in this row and position
            while (i, col) in spanning:
                cells.append(spanning.pop((i, col)))
                col += 1
            # Get cell text (collapse runs of whitespace into single spaces)
            cell_text = " ".join(cell.get_text(strip=True).split())
            rowspan = _span_value(cell, "rowspan")
            colspan = _span_value(cell, "colspan")

            # Repeat the text for every column the cell covers; appending it
            # only once would shift all following values of this row to the left.
            cells.extend([cell_text] * colspan)
            # Mark the covered positions in the subsequent rows
            for r in range(1, rowspan):
                for c in range(col, col + colspan):
                    spanning[(i + r, c)] = cell_text
            col += colspan
        # If there are still spanning cells for positions at the end, add them.
        while (i, col) in spanning:
            cells.append(spanning.pop((i, col)))
            col += 1
        grid.append(cells)
    return grid

# Directories for HTML and CSV files
html_directory = "../data/pipeline1/html"
output_directory = "../data/pipeline1/csv"
os.makedirs(output_directory, exist_ok=True)

# Mapping of inconsistent headers to standardized headers (if needed)
header_mapping = {
    "Field name": "Field Name (en)",
    "Field name (de)": "Field Name (de)",
    "Field name (en)": "Field Name (en)",
    "Element/Attribute name": "Element/Attribute Name",
    "Element/attribute name": "Element/Attribute Name",
    "Data type": "Datatype",
    "Definition": "Definition (en)",
}

# Convert every *.html file in html_directory into a CSV file of the same base
# name in output_directory. Sorted so that processing (and the log) is
# reproducible; os.listdir() returns entries in arbitrary order.
for file_name in sorted(os.listdir(html_directory)):
    if not file_name.endswith(".html"):
        continue

    html_file_path = os.path.join(html_directory, file_name)
    csv_file_name = os.path.splitext(file_name)[0] + ".csv"
    csv_output_path = os.path.join(output_directory, csv_file_name)

    print(f"Processing: {file_name}")

    # Load the HTML content
    with open(html_file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Special handling for ILCD_Common_EnumerationValues.html
    if file_name == "ILCD_Common_EnumerationValues.html":
        # For this file, we assume the table to process is the first table
        table = soup.find("table")
        if table is None:
            print(f"No table found in {file_name}. Skipping.")
            continue
        # Parse the table into a grid (list of rows with expanded rowspans)
        grid = parse_table_with_rowspan(table)
        if not grid:
            # Without this guard grid[0] would raise IndexError.
            print(f"Table in {file_name} has no rows. Skipping.")
            continue
        # Assume the first row is the header row; standardize its names
        headers = [header_mapping.get(h, h) for h in grid[0]]
        rows = grid[1:]
    else:
        # All other files keep their data in the table with id "tableID"
        table = soup.find("table", {"id": "tableID"})
        if table is None:
            print(f"No table with id 'tableID' found in {file_name}. Skipping.")
            continue

        headers = []
        for th in table.find_all("th"):
            header_text = th.get_text(strip=True)
            headers.append(header_mapping.get(header_text, header_text))

        rows = []
        for tr in table.find_all("tr"):
            row = []
            for td in tr.find_all("td"):
                # Remove elements with class "info"
                for info_element in td.find_all(class_="info"):
                    info_element.extract()
                # If an <a> tag exists, extract only its text; otherwise, full cell text
                first_link = td.find("a")
                if first_link:
                    cleaned_text = first_link.get_text(strip=True)
                else:
                    cleaned_text = " ".join(td.get_text(strip=True).split())
                row.append(cleaned_text)
            rows.append(row)

    # Pad headers and rows to one common width. Taking the header count into
    # account avoids a length-mismatch error from pandas when there are more
    # headers than data columns, and it also covers tables without any row.
    column_count = max([len(headers)] + [len(row) for row in rows])
    adjusted_rows = [row + [""] * (column_count - len(row)) for row in rows]
    adjusted_headers = headers + [""] * (column_count - len(headers))
    df = pd.DataFrame(adjusted_rows, columns=adjusted_headers)

    # Clean empty cells (whitespace only becomes NaN), drop empty rows/columns.
    # Header-only rows (no <td>) end up all-empty here and are dropped as well.
    df = df.replace(r"^\s*$", pd.NA, regex=True)
    df = df.dropna(axis=1, how="all")
    df = df.dropna(axis=0, how="all")

    # Save the DataFrame to CSV
    df.to_csv(csv_output_path, index=False, encoding="utf-8")
    print(f"Saved: {csv_output_path}")

print("Processing complete.")


Processing: EPD_DataSet.html
Headers: ['Field Name (de)', 'Field Name (en)', 'Element/Attribute Name', 'Requ.', 'Occ.', 'Datatype', 'Definition (de)', 'Definition (en)', 'Original ILCD Format Definition (en)', 'eDoc ID']
Saved: ../data/pipeline1/csv\EPD_DataSet.csv
Processing: EPD_FlowDataSet.html
Headers: ['Field Name (de)', 'Field Name (en)', 'Element/Attribute Name', 'Requ.', 'Occ.', 'Datatype', 'Definition (de)', 'Definition (en)', 'Original ILCD Format Definition (en)', 'eDoc ID']
Saved: ../data/pipeline1/csv\EPD_FlowDataSet.csv
Processing: ILCD_Common_DataTypes.html
Headers: ['Data Type', 'Base Type', 'Description']
Saved: ../data/pipeline1/csv\ILCD_Common_DataTypes.csv
Processing: ILCD_Common_EnumerationValues.html
Headers: ['List name', 'Value', 'Definition (en)']
Saved: ../data/pipeline1/csv\ILCD_Common_EnumerationValues.csv
Processing: ILCD_ContactDataSet.html
Headers: ['Field Name (en)', 'Element/Attribute Name', 'Requ.', 'Occ.', 'Datatype', 'Definition (en)', 'eDoc ID']
Sav