In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os

def html_table_to_df(html_file_path):
    """
    Convert the first ``<table border="1">`` of an HTML file to a DataFrame.

    Rows that contain a ``<th>`` with class ``bighead`` or ``mediumhead`` are
    treated as section headings and skipped. The first row that is not such a
    heading supplies the column names; every later non-heading row becomes a
    data row. A cell that contains an ``<img>`` contributes that image's
    ``alt`` text instead of the cell's text. A column named 'Sample', if
    present, is renamed to 'Emoji'.

    Rows that merely repeat the column header further down the table are not
    filtered out here; they come back as ordinary data rows.

    Args:
        html_file_path (str): The path to the HTML file.

    Returns:
        pandas.DataFrame: The DataFrame representation of the table, or None
        on error (file missing or unreadable, no matching table, or rows whose
        cell count does not fit the header). Errors are reported with print().
    """
    try:
        with open(html_file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
    except FileNotFoundError:
        print(f"Error: File not found at '{html_file_path}'")
        return None
    except Exception as e:
        print(f"Error reading or parsing HTML file: {e}")
        return None

    table = soup.find('table', {'border': '1'})
    if table is None:
        print("Error: No table with border='1' found in the HTML.")
        return None

    def is_section_heading(row):
        # A section heading row carries a <th> whose class is one of these.
        return row.find('th', {'class': ['bighead', 'mediumhead']}) is not None

    all_rows = table.find_all('tr')

    # The header is the first row that is not a section heading; fall back to
    # row 0 when every row is a heading.
    header_row_index = next(
        (i for i, row in enumerate(all_rows) if not is_section_heading(row)),
        0,
    )

    headers = []
    if all_rows:
        header_cells = all_rows[header_row_index].find_all(['th', 'td'])
        headers = [cell.get_text(strip=True) for cell in header_cells]

    data = []
    for row in all_rows[header_row_index + 1:]:  # rows *after* the header
        if is_section_heading(row):
            continue

        row_data = []
        for cell in row.find_all(['td', 'th']):
            img_tag = cell.find('img')
            if img_tag is not None:
                # Prefer the image's alt text; empty string when alt is absent.
                row_data.append(img_tag.get('alt', ''))
            else:
                row_data.append(cell.get_text(strip=True))

        if row_data:
            data.append(row_data)

    try:
        df = pd.DataFrame(data, columns=headers)
    except ValueError as e:
        # pandas rejects data whose width differs from the header; report it
        # the same way as the other failures instead of letting it propagate.
        print(f"Error: table rows do not match the header columns: {e}")
        return None

    # Rename the Sample column (no-op when the table has no such column).
    return df.rename(columns={'Sample': 'Emoji'})



# --- Example usage ---
file_path = 'index.html'

df = html_table_to_df(file_path)
# html_table_to_df reports failures by printing a message and returning None;
# stop with a clear error instead of failing on the subscript below.
if df is None:
    raise RuntimeError(f"Could not load a table from '{file_path}'")
# Drop rows whose '№' cell holds the literal header text '№'
# (presumably header rows repeated inside the table body).
df = df[df["№"] != "№"]
df

Unnamed: 0,№,Code,Emoji,CLDR Short Name,Other Keywords
0,1,U+1F600,😀,grinning face,cheerful | cheery | face | grin | grinning | h...
1,2,U+1F603,😃,grinning face with big eyes,awesome | big | eyes | face | grin | grinning ...
2,3,U+1F604,😄,grinning face with smiling eyes,eye | eyes | face | grin | grinning | happy | ...
3,4,U+1F601,😁,beaming face with smiling eyes,beaming | eye | eyes | face | grin | grinning ...
4,5,U+1F606,😆,grinning squinting face,closed | eyes | face | grinning | haha | hahah...
...,...,...,...,...,...
2052,1906,U+1F1FF U+1F1F2,🇿🇲,flag: Zambia,flag
2053,1907,U+1F1FF U+1F1FC,🇿🇼,flag: Zimbabwe,flag
2055,1908,U+1F3F4 U+E0067 U+E0062 U+E0065 U+E006E U+E006...,🏴󠁧󠁢󠁥󠁮󠁧󠁿,flag: England,flag
2056,1909,U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E007...,🏴󠁧󠁢󠁳󠁣󠁴󠁿,flag: Scotland,flag


In [2]:
# Write the table to an Excel workbook, leaving the DataFrame index out.
df.to_excel(excel_writer='emojis.xlsx', index=False)