In [2]:
from dataclasses import dataclass, field
import enum
import typing

In [3]:
@enum.verify(enum.CONTINUOUS, enum.UNIQUE)
class TokenType(enum.Enum):
    SYMBOL = 1
    IDENTIFIER = 2
    LITERAL_STRING = 3
    LITERAL_INT = 4
    LITERAL_HEX = 5
    LITERAL_OCT = 6
    LITERAL_FLOAT = 7
    LITERAL_DATE = 8


@dataclass
class Token:
    """Contains Token type and associated portion of source code

    The matched text itself is not stored; ``token_src`` is a ``slice`` that
    is applied to the code string to recover it (``code[token.token_src]``).

    Attributes:
        token_type: Category of the token.
        token_src: Slice into the source string covering the token's text.
            Defaults to ``slice(None, None, None)``, i.e. the whole string.
    """

    token_type: TokenType
    # Use default_factory instead of a bare slice default: slice objects are
    # only hashable from Python 3.12 on, and on 3.11 @dataclass rejects
    # unhashable defaults with "ValueError: mutable default ... use default_factory".
    token_src: slice = field(default_factory=lambda: slice(None, None, None))

In [4]:
# Sample ASP snippet to run through the tokenizer
test_code = '<% Response.Write("Hello, world!") %>'
test_code

'<% Response.Write("Hello, world!") %>'

In [30]:
# Cursor state shared (as module globals) by advance_pos() and yield_token().
# Re-run this cell to rewind the cursor before tokenizing again.
pos_idx: int = 0  # index of pos_char within test_code
test_iter: typing.Iterator[str] = iter(test_code)
# prime the cursor with the first character (None when test_code is empty)
pos_char: typing.Optional[str] = next(test_iter, None)


def advance_pos():
    """Advance to the next position in the ASP code string"""
    global test_iter, pos_char, pos_idx
    pos_char = next(test_iter, None)
    pos_idx += 1
    return pos_char is not None


def yield_token() -> typing.Generator[Token, None, None]:
    """Iteratively tokenize ASP code string

    Operates on the module-level cursor (``pos_char`` / ``pos_idx``), which
    is moved forward only through ``advance_pos()``. The cursor is never
    rewound here, so after the generator has been exhausted the globals must
    be re-initialised before tokenizing again.
    (should move this to a class)

    Token rules, tried in this order for the current character:
      * whitespace: consumed, nothing is yielded
      * IDENTIFIER: one ``str.isalpha()`` character followed by any
        number of ``str.isalnum()`` characters
      * LITERAL_STRING: text delimited by ``"``; a doubled ``""`` inside the
        literal is an escaped quote. The slice includes both delimiters.
      * SYMBOL: any other single character

    Yields:
        Token: the token type plus a slice locating its text in the code string.

    Raises:
        RuntimeError: if a string literal is still open when the code
            string ends.
    """
    # Only read here; all writes to the cursor happen inside advance_pos().
    global pos_char, pos_idx

    # main tokenizer loop
    # base case: exit at end of string (i.e., when iterator exhausted)
    while pos_char is not None:
        # determine token type
        if pos_char.isspace():
            # consume and ignore whitespace
            while advance_pos() and pos_char.isspace():
                pass
            # already at next character, don't advance
        elif pos_char.isalpha():
            # basic example identifier: [a-zA-Z][a-zA-Z0-9]*
            start_iden = pos_idx  # save starting index of identifier for later
            # goto end of identifier
            while advance_pos() and pos_char.isalnum():
                pass
            yield Token(TokenType.IDENTIFIER, slice(start_iden, pos_idx))
            del start_iden
            # already at next character, don't advance
        elif pos_char == '"':
            # string literal
            start_str = pos_idx  # save starting index of string literal for later
            # True while the quotes seen after the opening one are odd in
            # number, i.e. the last quote is a candidate terminator and not
            # the second half of an escaped '""' pair.
            closing_quote_seen = False
            while advance_pos():
                if pos_char == '"':
                    # first quote: possible end; second in a row: escaped quote
                    closing_quote_seen = not closing_quote_seen
                elif closing_quote_seen:
                    # a non-quote character follows the closing quote:
                    # end of string literal, stop iterating
                    break
            # The flag is checked after the loop (not only on break) so that a
            # literal whose closing quote is the very last character of the
            # code string is accepted instead of being reported as unterminated.
            if not closing_quote_seen:
                raise RuntimeError(
                    "Tokenizer error: Expected ending '\"' for string literal, but reached end of code string"
                )
            yield Token(TokenType.LITERAL_STRING, slice(start_str, pos_idx))
            del start_str, closing_quote_seen
            # already at next character, don't advance
        else:
            # other token type
            yield Token(TokenType.SYMBOL, slice(pos_idx, pos_idx + 1))
            advance_pos()  # move past current token


# Two-column report: token type (padded to 25 characters) and matched source text
print(f"{'Type':<25} Source")
print(f"{'-' * 25} {'-' * 15}")
for tok in yield_token():
    print(f"{tok.token_type!s:<25} {test_code[tok.token_src]}")

Type                      Source
------------------------- ---------------
TokenType.SYMBOL          <
TokenType.SYMBOL          %
TokenType.IDENTIFIER      Response
TokenType.SYMBOL          .
TokenType.IDENTIFIER      Write
TokenType.SYMBOL          (
TokenType.LITERAL_STRING  "Hello, world!"
TokenType.SYMBOL          )
TokenType.SYMBOL          %
TokenType.SYMBOL          >
