In [1]:
from dataclasses import dataclass, field
import enum
import typing

In [2]:
@enum.verify(enum.CONTINUOUS, enum.UNIQUE)
class TokenType(enum.Enum):
    SYMBOL = 1
    IDENTIFIER = 2
    LITERAL_STRING = 3
    LITERAL_INT = 4
    LITERAL_HEX = 5
    LITERAL_OCT = 6
    LITERAL_FLOAT = 7
    LITERAL_DATE = 8


@dataclass
class Token:
    """Contains Token type and associated portion of source code

    Attributes
    ----------
    token_type : TokenType
        Category of the token
    token_src : slice
        Index range of the token's text, to be applied to the code string that
        was tokenized; defaults to an unbounded slice (``slice(None, None, None)``)
    """

    token_type: TokenType
    # slice instances are unhashable before Python 3.12, and dataclasses (3.11+)
    # reject unhashable objects as plain defaults ("mutable default" ValueError),
    # so the default is produced by a factory instead of a shared instance
    token_src: slice = field(default_factory=lambda: slice(None, None, None))

In [23]:
def yield_token(codeblock: str) -> typing.Generator[Token, None, None]:
    """Iteratively tokenize ASP code string

    Whitespace is skipped. Everything else is reported as a Token whose
    ``token_src`` slice indexes into ``codeblock``:

    - identifier: a character accepted by ``str.isalpha`` followed by
      characters accepted by ``str.isalnum``
    - string literal: ``"..."``, where ``""`` inside is an escaped quote
    - int literal: one or more ASCII decimal digits
    - float literal: decimal digits followed by a fraction (``.`` and at least
      one digit) and/or an exponent (``e``/``E``, optional sign, digits)
    - hex literal: ``&H`` or ``&h``, hexadecimal digits, optional trailing ``&``
    - oct literal: ``&``, octal digits, optional trailing ``&``
    - date literal: ``#`` up to and including the next ``#``
    - symbol: any other single character

    Parameters
    ----------
    codeblock : str
        Classic ASP source code

    Yields
    ------
    Token
        The next available token in the codeblock

    Raises
    ------
    RuntimeError
        If a string or date literal is not terminated before the end of the
        code string, or if ``&`` / ``&H`` is not followed by at least one
        octal / hexadecimal digit
    """
    # ASCII-only character classes. Sets are used instead of strings so that
    # membership tests are simply False for None (end of input) and for ""
    # (peeking past the end) instead of raising or matching by accident.
    dec_digits = frozenset("0123456789")
    oct_digits = frozenset("01234567")
    hex_digits = frozenset("0123456789abcdefABCDEF")
    hex_markers = frozenset("Hh")
    exp_markers = frozenset("eE")
    exp_signs = frozenset("+-")

    code_iter: typing.Iterator[str] = iter(codeblock)
    # preload first character
    pos_char: typing.Optional[str] = next(code_iter, None)
    pos_idx: int = 0

    def _advance_pos() -> bool:
        """Advance to the next position in the ASP code string

        Returns
        -------
        bool
            Returns True if codeblock iterator not exhausted
        """
        # modify state of enclosing function
        nonlocal pos_char, pos_idx
        pos_char = next(code_iter, None)
        pos_idx += 1
        return pos_char is not None

    def _peek(offset: int = 1) -> str:
        """Return the character `offset` positions ahead, or "" past the end"""
        idx = pos_idx + offset
        return codeblock[idx : idx + 1]

    def _skip_run(charset: typing.AbstractSet[str]) -> None:
        """Consume the current character, then every following one in charset"""
        while _advance_pos() and pos_char in charset:
            pass

    def _describe_found() -> str:
        """Describe the current position for use at the end of an error message"""
        if pos_char is None:
            return "reached end of code string"
        return f"found '{pos_char}' instead"

    # main tokenizer loop
    # base case: exit at end of string (i.e., when iterator exhausted)
    # every branch leaves pos_char on the first character after what it consumed
    while pos_char is not None:
        # determine token type
        if pos_char.isspace():
            # consume and ignore whitespace
            while _advance_pos() and pos_char.isspace():
                pass
        elif pos_char.isalpha():
            # basic example identifier: [a-zA-Z][a-zA-Z0-9]*
            start_iden = pos_idx  # save starting index of identifier for later
            # goto end of identifier
            while _advance_pos() and pos_char.isalnum():
                pass
            yield Token(TokenType.IDENTIFIER, slice(start_iden, pos_idx))
        elif pos_char == '"':
            # string literal
            start_str = pos_idx  # save starting index of string literal for later
            # goto end of string
            # found_dbl_quote: the previous character was a '"' that is either
            # the closing quote or the first half of an escaped '""'
            found_dbl_quote = False
            terminated = False
            while _advance_pos():
                if pos_char == '"' and not found_dbl_quote:
                    found_dbl_quote = True
                    continue
                if found_dbl_quote:
                    if pos_char == '"':
                        # quote is escaped ('""'), keep iterating
                        found_dbl_quote = False
                        continue
                    else:
                        # end of string literal, stop iterating
                        terminated = True
                        break
            # input may end right on the closing quote (found_dbl_quote is True)
            if not found_dbl_quote and not terminated:
                raise RuntimeError(
                    "Tokenizer error: Expected ending '\"' for string literal, but reached end of code string"
                )
            yield Token(TokenType.LITERAL_STRING, slice(start_str, pos_idx))
        elif pos_char in dec_digits:
            # int or float literal
            start_num = pos_idx
            num_type = TokenType.LITERAL_INT
            _skip_run(dec_digits)  # integer part
            # a '.' belongs to the number only when a digit follows it
            if pos_char == "." and _peek() in dec_digits:
                num_type = TokenType.LITERAL_FLOAT
                _skip_run(dec_digits)  # consumes '.' and the fraction digits
            # an exponent belongs to the number only when it is complete:
            # marker, optional sign, at least one digit
            if pos_char in exp_markers:
                sign_len = 1 if _peek() in exp_signs else 0
                if _peek(1 + sign_len) in dec_digits:
                    num_type = TokenType.LITERAL_FLOAT
                    for _ in range(1 + sign_len):
                        _advance_pos()  # consume marker and optional sign
                    _skip_run(dec_digits)  # exponent digits
            yield Token(num_type, slice(start_num, pos_idx))
        elif pos_char == "&":
            # hex or oct literal
            _advance_pos()  # consume '&'
            if pos_char in hex_markers:
                # hex literal
                _advance_pos()  # consume 'H'
                # need at least one hexadecimal digit
                if pos_char not in hex_digits:
                    raise RuntimeError(
                        "Tokenizer error: Expected at least one hexadecimal digit after '&H', "
                        f"but {_describe_found()}"
                    )
                start_hex = pos_idx - 2  # include '&H' in Token object
                _skip_run(hex_digits)
                # check for optional '&' at end
                if pos_char == "&":
                    _advance_pos()  # consume
                yield Token(TokenType.LITERAL_HEX, slice(start_hex, pos_idx))
            else:
                # oct literal
                # need at least one octal digit
                if pos_char not in oct_digits:
                    raise RuntimeError(
                        "Tokenizer error: Expected at least one octal digit after '&', "
                        f"but {_describe_found()}"
                    )
                start_oct = pos_idx - 1  # include '&' in Token object
                _skip_run(oct_digits)
                # check for optional '&' at end
                if pos_char == "&":
                    _advance_pos()  # consume
                yield Token(TokenType.LITERAL_OCT, slice(start_oct, pos_idx))
        elif pos_char == "#":
            # date literal: everything up to and including the closing '#'
            start_date = pos_idx
            while _advance_pos() and pos_char != "#":
                pass
            if pos_char is None:
                raise RuntimeError(
                    "Tokenizer error: Expected ending '#' for date literal, but reached end of code string"
                )
            _advance_pos()  # consume closing '#'
            yield Token(TokenType.LITERAL_DATE, slice(start_date, pos_idx))
        else:
            # other token type
            yield Token(TokenType.SYMBOL, slice(pos_idx, pos_idx + 1))
            _advance_pos()  # move past current token


def print_tokenizer_output(codeblock: str):
    """Print a code string followed by a table of the tokens found in it

    The table has one row per token yielded by ``yield_token``: the token type
    in a 30-character-wide column, then the matching text from ``codeblock``.
    A blank line is printed after the table.

    Parameters
    ----------
    codeblock : str
        Code string to tokenize and display
    """
    print(f"Source code:\n{codeblock}")
    print(f"{'| Type':<30}| Source")
    print("-" * 45)
    for token in yield_token(codeblock):
        # "| " (2 chars) + type name padded to 28 lines up with the 30-wide header
        type_column = f"| {token.token_type!s:<28}"
        source_column = f"| {codeblock[token.token_src]}"
        print(type_column + source_column)
    print()  # add extra spacing


# Demo: print the token table for a few sample inputs
for demo_source in (
    '<% Response.Write("Hello, world!") %>',
    "&H7f &H10abCf33&",  # hex literal
    "&12736 &171&",  # oct literal
    '"This is a valid string"',
):
    print_tokenizer_output(demo_source)

Source code:
<% Response.Write("Hello, world!") %>
| Type                        | Source
---------------------------------------------
| TokenType.SYMBOL            | <
| TokenType.SYMBOL            | %
| TokenType.IDENTIFIER        | Response
| TokenType.SYMBOL            | .
| TokenType.IDENTIFIER        | Write
| TokenType.SYMBOL            | (
| TokenType.LITERAL_STRING    | "Hello, world!"
| TokenType.SYMBOL            | )
| TokenType.SYMBOL            | %
| TokenType.SYMBOL            | >

Source code:
&H7f &H10abCf33&
| Type                        | Source
---------------------------------------------
| TokenType.LITERAL_HEX       | &H7f
| TokenType.LITERAL_HEX       | &H10abCf33&

Source code:
&12736 &171&
| Type                        | Source
---------------------------------------------
| TokenType.LITERAL_OCT       | &12736
| TokenType.LITERAL_OCT       | &171&

Source code:
"This is a valid string"
| Type                        | Source
-----------------------------------