# Convert ILCD HTML files to CSV

In [1]:
import os
from bs4 import BeautifulSoup
import pandas as pd

# Convert the table with id 'tableID' in every HTML file of `html_directory`
# into a cleaned CSV file of the same base name in `output_directory`.

# Specify the directory containing the HTML files
html_directory = '../data/html'
output_directory = '../data/csv'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Mapping of inconsistent headers to standardized headers
header_mapping = {
    'Field name': 'Field Name (en)',
    'Field name (de)': 'Field Name (de)',
    'Field name (en)': 'Field Name (en)',
    'Element/Attribute name': 'Element/Attribute Name',
    'Element/attribute name': 'Element/Attribute Name',
    'Data type': 'Datatype',
    'Definition': 'Definition (en)',
}

# Loop through all HTML files in the directory
for file_name in os.listdir(html_directory):
    if not file_name.endswith('.html'):
        continue

    html_file_path = os.path.join(html_directory, file_name)
    csv_file_name = os.path.splitext(file_name)[0] + '.csv'
    csv_output_path = os.path.join(output_directory, csv_file_name)

    print(f"Processing: {file_name}")

    # Load the HTML content
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the table by its ID (adjust if necessary for your files)
    table = soup.find('table', {'id': 'tableID'})
    if table is None:
        print(f"No table with id 'tableID' found in {file_name}. Skipping.")
        continue

    # Extract the header texts once each and standardize them via the mapping;
    # unmapped headers are kept as-is.
    headers = []
    for th in table.find_all('th'):
        header_text = th.get_text(strip=True)
        headers.append(header_mapping.get(header_text, header_text))

    # Extract the cell texts of every row, collapsing runs of whitespace.
    # Rows without <td> cells (e.g. the header row) become empty lists and are
    # removed further down by the "drop all-empty rows" step.
    rows = [
        [' '.join(td.get_text(strip=True).split()) for td in tr.find_all('td')]
        for tr in table.find_all('tr')
    ]

    # A table without any data cells cannot be converted; `default=0` keeps
    # max() from raising when the table has no rows at all.
    widest_row = max((len(row) for row in rows), default=0)
    if widest_row == 0:
        print(f"Table in {file_name} contains no data cells. Skipping.")
        continue

    # Ensure consistent column length. The width also accounts for the header
    # count so that a table with more headers than cells does not make the
    # DataFrame constructor fail; such surplus columns are entirely empty and
    # get dropped below.
    max_columns = max(widest_row, len(headers))
    adjusted_rows = [row + [''] * (max_columns - len(row)) for row in rows]
    adjusted_headers = headers + [''] * (max_columns - len(headers))

    # Create a DataFrame
    df = pd.DataFrame(adjusted_rows, columns=adjusted_headers)

    # Clean empty cells by replacing whitespace-only entries with NaN
    df.replace(r"^\s*$", pd.NA, regex=True, inplace=True)

    # Print summary before dropping empty columns.
    # DataFrame.info() writes the report itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    print("Before dropping empty columns:")
    df.info()

    # Drop columns with all empty values
    df = df.dropna(axis=1, how='all')

    # Drop rows with all empty values
    df = df.dropna(axis=0, how="all")

    # Print summary after dropping empty columns
    print("After dropping empty columns:")
    df.info()

    # Save the cleaned DataFrame as CSV
    df.to_csv(csv_output_path, index=False, encoding='utf-8')

    print(f"Saved: {csv_output_path}")

print("Processing complete.")


Processing: EPD_DataSet.html
Before dropping empty columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Field Name (de)                       161 non-null    object
 1   Field Name (en)                       159 non-null    object
 2   Element/Attribute Name                160 non-null    object
 3   Requ.                                 160 non-null    object
 4   Occ.                                  120 non-null    object
 5   Datatype                              103 non-null    object
 6   Definition (de)                       137 non-null    object
 7   Definition (en)                       125 non-null    object
 8   Original ILCD Format Definition (en)  102 non-null    object
 9   eDoc ID                               102 non-null    object
 10                                        