# Convert ILCD HTML files to CSV

In [55]:
import os
from bs4 import BeautifulSoup
import pandas as pd

# Specify the directory containing the HTML files
html_directory = "../data/pipeline1/html"
output_directory = "../data/pipeline1/csv"

# Mapping of inconsistent headers to standardized headers
header_mapping = {
    "Field name": "Field Name (en)",
    "Field name (de)": "Field Name (de)",
    "Field name (en)": "Field Name (en)",
    "Element/Attribute name": "Element/Attribute Name",
    "Element/attribute name": "Element/Attribute Name",
    "Data type": "Datatype",
    "Definition": "Definition (en)",
}


def clean_cell_text(td):
    """Return the cleaned text of a single ``<td>`` element.

    Elements with class ``"info"`` are removed from the cell first (this
    mutates the parsed tree). If the cell then contains an ``<a>`` tag, only
    the text of the first link is returned; otherwise the whole cell text is
    returned with runs of whitespace collapsed to single spaces.
    """
    for info_element in td.find_all(class_="info"):
        info_element.extract()

    first_link = td.find("a")
    if first_link is not None:
        return first_link.get_text(strip=True)

    # Join text nodes with a space: with the default empty separator,
    # get_text(strip=True) glues together words that are separated only by
    # inline markup or <br>, and the whitespace normalisation below cannot
    # recover the lost word boundary.
    return " ".join(td.get_text(" ", strip=True).split())


def extract_table(table):
    """Return ``(headers, rows)`` extracted from a parsed ``<table>``.

    ``headers`` holds the text of every ``<th>`` in the table, renamed through
    ``header_mapping`` where a mapping exists. ``rows`` holds one list of
    cleaned cell texts per ``<tr>``; a row without ``<td>`` cells (e.g. the
    header row) yields an empty list.
    """
    headers = []
    for th in table.find_all("th"):
        header_text = th.get_text(strip=True)
        headers.append(header_mapping.get(header_text, header_text))

    rows = [
        [clean_cell_text(td) for td in tr.find_all("td")]
        for tr in table.find_all("tr")
    ]
    return headers, rows


def build_dataframe(headers, rows):
    """Build a cleaned DataFrame from extracted headers and rows.

    Headers and rows are padded with empty strings to a common width, cells
    that are empty or whitespace-only become missing values, and columns and
    rows that are entirely missing are dropped.

    Args:
        headers: List of column names.
        rows: List of rows, each a list of cell strings (may be ragged).

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    # The width must cover the headers as well as the widest row: padding the
    # headers by a negative amount is a no-op, so a table with more headers
    # than cells would otherwise make the DataFrame constructor raise.
    # ``default=0`` keeps a table without any <tr> from raising in max().
    max_columns = max(len(headers), max((len(row) for row in rows), default=0))
    adjusted_rows = [row + [""] * (max_columns - len(row)) for row in rows]
    adjusted_headers = headers + [""] * (max_columns - len(headers))

    df = pd.DataFrame(adjusted_rows, columns=adjusted_headers)

    # Clean empty cells by replacing whitespace-only entries with NA
    df = df.replace(r"^\s*$", pd.NA, regex=True)

    # Drop columns, then rows, that contain no values at all
    df = df.dropna(axis=1, how="all")
    df = df.dropna(axis=0, how="all")
    return df


def main():
    """Convert every ``.html`` file in ``html_directory`` to a CSV file.

    For each file the table with id ``tableID`` is extracted, cleaned and
    written to ``output_directory`` under the same base name with a ``.csv``
    extension. Files without such a table, or whose table has no data cells,
    are reported and skipped.
    """
    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Sorted so that the processing order does not depend on the file system
    for file_name in sorted(os.listdir(html_directory)):
        if not file_name.endswith(".html"):
            continue

        html_file_path = os.path.join(html_directory, file_name)
        csv_file_name = os.path.splitext(file_name)[0] + ".csv"
        csv_output_path = os.path.join(output_directory, csv_file_name)

        print(f"Processing: {file_name}")

        # Load and parse the HTML content
        with open(html_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, "html.parser")

        # Find the table by its ID (adjust if necessary for your files)
        table = soup.find("table", {"id": "tableID"})
        if table is None:
            print(f"No table with id 'tableID' found in {file_name}. Skipping.")
            continue

        headers, rows = extract_table(table)
        print("Headers:", headers)

        # A table without a single <td> has nothing to export
        if not any(rows):
            print(f"No data rows found in {file_name}. Skipping.")
            continue

        df = build_dataframe(headers, rows)

        # Save the cleaned DataFrame as CSV
        df.to_csv(csv_output_path, index=False, encoding="utf-8")

        print(f"Saved: {csv_output_path}")

    print("Processing complete.")


# __name__ is "__main__" both when run as a script and inside a notebook cell,
# so the conversion still runs there, while importing this code has no side
# effects.
if __name__ == "__main__":
    main()

Processing: EPD_DataSet.html
Headers: ['Field Name (de)', 'Field Name (en)', 'Element/Attribute Name', 'Requ.', 'Occ.', 'Datatype', 'Definition (de)', 'Definition (en)', 'Original ILCD Format Definition (en)', 'eDoc ID']
Saved: ../data/pipeline1/csv\EPD_DataSet.csv
Processing: EPD_FlowDataSet.html
Headers: ['Field Name (de)', 'Field Name (en)', 'Element/Attribute Name', 'Requ.', 'Occ.', 'Datatype', 'Definition (de)', 'Definition (en)', 'Original ILCD Format Definition (en)', 'eDoc ID']
Saved: ../data/pipeline1/csv\EPD_FlowDataSet.csv
Processing: ILCD_Common_DataTypes.html
Headers: ['Data Type', 'Base Type', 'Description']
Saved: ../data/pipeline1/csv\ILCD_Common_DataTypes.csv
Processing: ILCD_Common_EnumerationValues.html
Headers: ['List name', 'Value', 'Definition (en)']
Saved: ../data/pipeline1/csv\ILCD_Common_EnumerationValues.csv
Processing: ILCD_ContactDataSet.html
Headers: ['Field Name (en)', 'Element/Attribute Name', 'Requ.', 'Occ.', 'Datatype', 'Definition (en)', 'eDoc ID']
Sav