In [1]:
from dataclasses import dataclass, field
import enum
import typing

import attrs

In [2]:
@enum.verify(enum.CONTINUOUS, enum.UNIQUE)
class TokenType(enum.Enum):
    """Enumeration containing supported token types"""

    SYMBOL = 1
    IDENTIFIER = 2
    LITERAL_STRING = 3
    LITERAL_INT = 4
    LITERAL_HEX = 5
    LITERAL_OCT = 6
    LITERAL_FLOAT = 7
    LITERAL_DATE = 8

In [3]:
@dataclass
class Token:
    """Represents an individual token.

    Attributes
    ----------
    token_type : TokenType
        Enumerated token type
    token_src : slice, default=slice(None, None, None)
        Section of original code associated with this token.
        The default selects the entire string.
    """

    token_type: TokenType
    # Built through a factory instead of a plain default value: ``slice``
    # objects are unhashable before Python 3.12, and ``dataclasses`` on 3.11
    # rejects unhashable defaults as "mutable" with a ValueError at class
    # creation time. The factory yields the same default on every version.
    token_src: slice = field(default_factory=lambda: slice(None, None, None))

In [4]:
@attrs.define()
class Tokenizer:
    """Iterator that splits a code string into ``Token`` objects.

    Iteration must be started with ``iter()`` (a ``for`` loop does this
    implicitly); calling ``next()`` on a fresh instance raises
    ``StopIteration`` because no character has been loaded yet.

    Attributes
    ----------
    codeblock : str
        The code string to tokenize. The ``token_src`` slice of every
        produced token indexes into this string.
    """

    codeblock: str
    _code_iter: typing.Optional[typing.Iterator[str]] = attrs.field(
        default=None, repr=False, init=False
    )
    # keep track of position within codeblock
    _pos_char: typing.Optional[str] = attrs.field(default=None, repr=False, init=False)
    _pos_idx: typing.Optional[int] = attrs.field(default=None, repr=False, init=False)

    # ---------------------------------------------------------------- helpers

    @staticmethod
    def _is_digit(char: typing.Optional[str]) -> bool:
        """True if char is an ASCII decimal digit (False for None).

        ``str.isnumeric()`` is deliberately avoided: it also accepts
        characters such as '²', '½' or '五', which are not valid in literals.
        """
        return char is not None and char in "0123456789"

    @staticmethod
    def _is_hex_digit(char: typing.Optional[str]) -> bool:
        """True if char is an ASCII hexadecimal digit of either case"""
        return char is not None and char in "0123456789abcdefABCDEF"

    @staticmethod
    def _is_oct_digit(char: typing.Optional[str]) -> bool:
        """True if char is an ASCII octal digit (False for None)"""
        return char is not None and char in "01234567"

    def _consume_digit_run(self) -> None:
        """Consume the current character and every ASCII digit following it"""
        while self._advance_pos() and self._is_digit(self._pos_char):
            pass

    # -------------------------------------------------------------- iteration

    def __iter__(self) -> typing.Self:
        """Setup iteration

        (Re)starts tokenizing from the beginning of the codeblock.

        Returns
        -------
        Self
        """
        self._code_iter = iter(self.codeblock)
        # preload first character
        self._pos_char = next(
            self._code_iter, None
        )  # use next(..., None) instead of handling StopIteration
        self._pos_idx = 0
        return self

    def _advance_pos(self) -> bool:
        """Advance to the next position in the codeblock

        Returns
        -------
        bool
            True if codeblock iterator not exhausted
        """
        self._pos_char = next(self._code_iter, None)
        self._pos_idx += 1
        return self._pos_char is not None

    def _check_for_end(self) -> None:
        """If tokenizer reached the end of the codeblock,
        signal to the consumer that iteration should stop.

        The StopIteration exception will bubble up through __next__.

        Raises
        ------
        StopIteration
            When there is no current character left to process
        """
        if self._pos_char is None:
            self._code_iter = None
            self._pos_idx = None
            raise StopIteration

    def _skip_whitespace(self) -> None:
        """Consume and ignore extraneous whitespace

        Raises
        ------
        StopIteration
            When only whitespace remained in the codeblock
        """
        if self._pos_char.isspace():
            while self._advance_pos() and self._pos_char.isspace():
                pass
            self._check_for_end()

    # --------------------------------------------------------- token handlers

    def _handle_identifier(self) -> Token:
        """Consume an identifier starting at the current (alphabetic) character

        NOTE(review): assumes an identifier is a letter followed by any number
        of letters, digits or underscores -- confirm against the grammar of
        the language being tokenized.

        Returns
        -------
        Token
            IDENTIFIER token covering the whole name
        """
        start_ident: int = self._pos_idx
        while self._advance_pos() and (
            self._pos_char.isalnum() or self._pos_char == "_"
        ):
            pass
        return Token(TokenType.IDENTIFIER, slice(start_ident, self._pos_idx))

    def _handle_string_literal(self) -> Token:
        """Consume a double-quoted string literal

        A doubled quote ('""') inside the literal is an escaped quote and
        does not terminate it.

        Returns
        -------
        Token
            LITERAL_STRING token, including both surrounding quotes

        Raises
        ------
        RuntimeError
            When the codeblock ends before the closing quote
        """
        # save starting index of string literal for later
        start_str: int = self._pos_idx
        # True while the previous character was a quote that may close the
        # literal; a quote directly after it turns the pair into an escape
        closing_quote_seen = False

        # goto end of string literal
        while self._advance_pos():
            if self._pos_char == '"':
                closing_quote_seen = not closing_quote_seen
            elif closing_quote_seen:
                # previous quote really was the end of the literal
                break

        if not closing_quote_seen:
            raise RuntimeError(
                "Expected ending '\"' for string literal, but reached end of code string"
            )

        return Token(TokenType.LITERAL_STRING, slice(start_str, self._pos_idx))

    def _handle_number_literal(self) -> Token:
        """Consume an integer or float literal

        Accepted shape: digits, optionally followed by '.' and digits,
        optionally followed by 'E', an optional sign and digits.

        Returns
        -------
        Token
            LITERAL_FLOAT if a decimal point or exponent is present,
            otherwise LITERAL_INT

        Raises
        ------
        RuntimeError
            When '.' or 'E' is not followed by at least one digit
        """
        # don't know token type yet, but save starting position for later
        start_num: int = self._pos_idx
        # goto end of current number chunk
        self._consume_digit_run()

        # TODO: handle float that starts with '.' (no leading digits)
        # does the token have a decimal point?
        float_dec_pt = self._pos_char == "."
        if float_dec_pt:
            self._advance_pos()  # consume '.'
            # there should be one or more digits after '.'
            if not self._is_digit(self._pos_char):
                raise RuntimeError("Expected digit after '.' in float literal")
            self._consume_digit_run()

        # does the token have the scientific notation indicator?
        # NOTE(review): only an uppercase 'E' is recognised -- confirm whether
        # a lowercase 'e' should be accepted as well.
        float_sci_e = self._pos_char == "E"
        if float_sci_e:
            self._advance_pos()  # consume 'E'
            # optional '+' or '-'
            if self._pos_char is not None and self._pos_char in "+-":
                self._advance_pos()  # consume
            # there should be one or more digits after 'E' (or after '+'/'-')
            if not self._is_digit(self._pos_char):
                raise RuntimeError("Expected digit after 'E' in float literal")
            self._consume_digit_run()

        # is this an int or a float?
        if float_dec_pt or float_sci_e:
            token_type = TokenType.LITERAL_FLOAT
        else:
            token_type = TokenType.LITERAL_INT
        return Token(token_type, slice(start_num, self._pos_idx))

    def _handle_amp_literal(self) -> Token:
        """Consume a literal introduced by '&'

        '&H' followed by hexadecimal digits is a hex literal; '&' followed
        directly by octal digits is an octal literal. Both may end with an
        optional trailing '&', which is included in the token.

        Returns
        -------
        Token
            LITERAL_HEX or LITERAL_OCT token, including the '&' prefix

        Raises
        ------
        RuntimeError
            When the prefix is not followed by at least one valid digit
        """
        self._advance_pos()  # consume '&'
        if self._pos_char == "H":
            # ======== HEX LITERAL ========
            self._advance_pos()  # consume 'H'
            # need at least one hexadecimal digit
            if not self._is_hex_digit(self._pos_char):
                raise RuntimeError(
                    f"Expected at least one hexadecimal digit after '&H', but found {self._pos_char!r} instead"
                )
            start_hex: int = self._pos_idx - 2  # include '&H' in Token object
            # goto end of hex literal
            while self._advance_pos() and self._is_hex_digit(self._pos_char):
                pass
            # check for optional '&' at end
            if self._pos_char == "&":
                self._advance_pos()  # consume
            return Token(TokenType.LITERAL_HEX, slice(start_hex, self._pos_idx))

        # ======== OCT LITERAL ========
        # need at least one octal digit
        if not self._is_oct_digit(self._pos_char):
            raise RuntimeError(
                f"Expected at least one octal digit after '&', but found {self._pos_char!r} instead"
            )
        start_oct: int = self._pos_idx - 1  # include '&' in Token object
        # goto end of oct literal
        while self._advance_pos() and self._is_oct_digit(self._pos_char):
            pass
        # check for optional '&' at end
        if self._pos_char == "&":
            self._advance_pos()  # consume
        return Token(TokenType.LITERAL_OCT, slice(start_oct, self._pos_idx))

    def _handle_date_literal(self) -> Token:
        """Consume a date literal introduced by '#'

        NOTE(review): assumes the literal runs up to and including the next
        '#' and does not validate what lies between the delimiters --
        confirm against the grammar of the language being tokenized.

        Returns
        -------
        Token
            LITERAL_DATE token, including both '#' delimiters

        Raises
        ------
        RuntimeError
            When the codeblock ends before the closing '#'
        """
        start_date: int = self._pos_idx
        # goto closing '#'
        while self._advance_pos() and self._pos_char != "#":
            pass
        if self._pos_char is None:
            raise RuntimeError(
                "Expected ending '#' for date literal, but reached end of code string"
            )
        self._advance_pos()  # consume closing '#'
        return Token(TokenType.LITERAL_DATE, slice(start_date, self._pos_idx))

    def __next__(self) -> Token:
        """Retrieve the next token

        Returns
        -------
        Token

        Raises
        ------
        StopIteration
            When tokenizer reaches the end of the codeblock
        RuntimeError
            When tokenizer encounters an invalid token
        """
        # stop if at end of codeblock
        self._check_for_end()

        # extraneous whitespace != code
        self._skip_whitespace()

        # determine token type from its first character; every branch must
        # consume at least one character, otherwise iteration never ends
        first_char = self._pos_char
        if first_char.isalpha():
            return self._handle_identifier()
        if first_char == '"':
            return self._handle_string_literal()
        if self._is_digit(first_char):
            return self._handle_number_literal()
        if first_char == "&":
            return self._handle_amp_literal()
        if first_char == "#":
            return self._handle_date_literal()

        # anything else is a single-character symbol
        self._advance_pos()  # consume symbol
        return Token(TokenType.SYMBOL, slice(self._pos_idx - 1, self._pos_idx))


# Demo: tokenize a sample string and print a two-column table showing each
# token's type next to the source text it covers.
test_code = '< % . "string" 1000 1.0 1E3 1.0E3 1.0E+3 &H7f &HAB &123'

# header row and separator (first column is 30 characters wide overall)
print(f"{'| Type':<30}| Source")
print(f"|{'-' * 29}|{'-' * 14}")

# one row per token; the slice stored on the token indexes into test_code
for tok in Tokenizer(test_code):
    type_column = f"| {tok.token_type!s:<28}"
    source_column = f"| {test_code[tok.token_src]}"
    print(type_column + source_column)

| Type                        | Source
|-----------------------------|--------------
| TokenType.SYMBOL            | <
| TokenType.SYMBOL            | %
| TokenType.SYMBOL            | .
| TokenType.LITERAL_STRING    | "string"
| TokenType.LITERAL_INT       | 1000
| TokenType.LITERAL_FLOAT     | 1.0
| TokenType.LITERAL_FLOAT     | 1E3
| TokenType.LITERAL_FLOAT     | 1.0E3
| TokenType.LITERAL_FLOAT     | 1.0E+3
| TokenType.LITERAL_HEX       | &H7f
| TokenType.LITERAL_HEX       | &HAB
| TokenType.LITERAL_OCT       | &123
