In [2]:
import re

import wikitextparser as wtp

# Captures a rowspan/colspan attribute and its (optionally quoted) integer value.
_SPAN_ATTR_RE = re.compile(r'''\b(rowspan|colspan)\s*=\s*["']?(\d+)''', re.IGNORECASE)


def _split_cell(raw_cell):
    """Split one raw wikitext cell into ``(attrs_html, content)``.

    A cell is either ``content`` or ``attributes | content``. Only ``rowspan``
    and ``colspan`` are carried over; any other attribute is dropped. The
    span values are restricted to digits, so the returned attribute string is
    always well-formed HTML.
    """
    attr_part, sep, content = raw_cell.partition('|')
    # A pipe that follows an opening link/template ([[Page|label]], {{tpl|arg}})
    # belongs to the content; it is not the attribute separator.
    if not sep or '[[' in attr_part or '{{' in attr_part:
        return '', raw_cell.strip()

    found = {name.lower(): value for name, value in _SPAN_ATTR_RE.findall(attr_part)}
    # Fixed order keeps the output deterministic: rowspan first, then colspan.
    attrs = [f'{name}="{found[name]}"' for name in ('rowspan', 'colspan') if name in found]
    return ' '.join(attrs), content.strip()


def wiki_table_to_html(node):
    """Convert a MediaWiki table (``{| ... |}``) into an HTML ``<table>`` string.

    The table is processed line by line from ``str(node)`` instead of through
    ``wtp.Table(node).get_rows()``: ``get_rows`` is not among the accessors
    wikitextparser documents for ``Table`` (``data()``, ``cells()``,
    ``caption``), and the line-based form needs no third-party parser.

    Recognised markup:
      * ``{|`` / ``|}``  - table start / end (the start line is skipped, the
        end line stops processing)
      * ``|+``           - caption (the first one wins)
      * ``|-``           - row separator; it never produces a cell itself
      * ``!``            - header cells, separated by ``!!`` or ``||``
      * ``|``            - data cells, separated by ``||``
      * any other non-empty line is appended to the preceding cell's content

    Args:
        node: The table wikitext, or any object whose ``str()`` is that
            wikitext.

    Returns:
        str: The HTML for the table, with no whitespace between tags.

    Notes:
        Nested tables are not supported. Cell and caption text is inserted
        verbatim (not HTML-escaped), so sanitise the result before rendering
        wikitext from an untrusted source.
    """
    caption = None
    rows = []      # completed rows; each row is a list of [tag, attrs, content]
    current = []   # cells of the row currently being collected

    for line in str(node).splitlines():
        line = line.strip()
        if not line or line.startswith('{|'):
            continue
        if line.startswith('|}'):
            break
        if line.startswith('|+'):
            if caption is None:
                # Captions may carry their own attributes: "|+ attrs | text".
                caption = _split_cell(line[2:])[1]
            continue
        if line.startswith('|-'):
            # Row separator: flush the cells gathered so far, if any.
            if current:
                rows.append(current)
                current = []
            continue

        if line.startswith('!'):
            tag, raw_cells = 'th', re.split(r'!!|\|\|', line[1:])
        elif line.startswith('|'):
            tag, raw_cells = 'td', line[1:].split('||')
        else:
            # Continuation of a multi-line cell.
            if current:
                current[-1][2] = f'{current[-1][2]}\n{line}'.strip()
            continue

        for raw_cell in raw_cells:
            attrs, content = _split_cell(raw_cell)
            current.append([tag, attrs, content])

    # The last row is not followed by a "|-" separator.
    if current:
        rows.append(current)

    result = ['<table>']
    if caption is not None:
        result.append(f'<caption>{caption}</caption>')
    for cells in rows:
        result.append('<tr>')
        for tag, attrs, content in cells:
            result.append(f'<{tag} {attrs}>' if attrs else f'<{tag}>')
            result.append(content)
            result.append(f'</{tag}>')
        result.append('</tr>')
    result.append('</table>')
    return ''.join(result)

## Test 

WTP (wikitextparser) identifies the table correctly; however, it does not offer the fine-grained detail of mwparserfromhell. Nevertheless, the two can be combined: one idea is to use WTP as the main parser engine and mwparserfromhell to transform tables.

In [3]:
# Fixture: a small wikitable with a caption, one header row (three header
# cells on separate lines) and three data rows. The first data row contains a
# cell with rowspan="2", so the second data row has only two cells.
wiki_table = """
{| class="wikitable"
|+ Test Case 1: Simple rowspan
|-
! Header 1
! Header 2
! Header 3
|-
| A1 || rowspan="2" | B1-B2 || C1
|-
| A2 || C2
|-
| A3 || B3 || C3
|}
"""

# Parse the wikitext using wikitextparser
parsed_wikicode = wtp.parse(wiki_table)

# Printing the parsed object echoes the wikitext back unchanged.
# NOTE(review): this cell only parses and prints; it does not call
# wiki_table_to_html on the fixture.
print(parsed_wikicode)


{| class="wikitable"
|+ Test Case 1: Simple rowspan
|-
! Header 1
! Header 2
! Header 3
|-
| A1 || rowspan="2" | B1-B2 || C1
|-
| A2 || C2
|-
| A3 || B3 || C3
|}


KeyboardInterrupt: 