In [169]:
import re
import tokenize

In [173]:
class Token(str):
    """
    A lexeme: a string that also carries the kind of token it represents.

    A Token compares and hashes like the plain ``str`` holding its text; the
    token kind is available as the ``kind`` attribute.

    Args:
        data: The text of the token.
        kind: The token category (e.g. 'int', 'keyword').
    """

    def __new__(cls, data, kind):
        # str is immutable, so the text has to be supplied in __new__.
        # Build the instance from `cls` (not a hard-coded Token) so that
        # subclasses get an instance of their own type and __init__ runs.
        return str.__new__(cls, data)

    def __init__(self, data, kind):
        self.kind = kind

    def __repr__(self):
        text = super().__repr__()
        return 'Token(%s, %r)' % (text, self.kind)

    
def make_lexer(defs, ignore=(), keywords=()):
    """
    Build a lexer function from a dictionary of token definitions.

    Args:
        defs: Mapping of token kind -> regular-expression source. Each kind
            is used as a named group, and the patterns are joined into one
            alternation in the mapping's iteration order. The mapping is
            not modified. The kind 'error' is reserved: any entry with that
            name is replaced by the internal catch-all pattern.
        ignore: Collection of token kinds that are matched but left out of
            the result.
        keywords: Collection of lexemes that are emitted with kind
            'keyword', whatever pattern matched them (this check runs
            before the ``ignore`` check).

    Returns:
        A function ``lexer(code)`` returning the list of Token objects found
        in the string ``code``.

    Raises:
        ValueError: Raised by the returned lexer when it reaches a character
            that none of the patterns in ``defs`` matches.
    """
    # Work on a copy so the caller's dictionary is left untouched.
    patterns = {k: v for (k, v) in defs.items() if k != 'error'}
    # Catch-all, always last so it only fires when nothing else matches.
    # [\s\S] is used instead of '.' because '.' does not match a newline,
    # and finditer would then silently skip an unmatched '\n'.
    patterns['error'] = r'[\s\S]'
    named = [r'(?P<%s>%s)' % (k, v)
             for (k, v) in patterns.items()]
    regex = re.compile('|'.join(named))

    def lexer(code):
        tokens = []

        for m in regex.finditer(code):
            data = m.group()
            kind = m.lastgroup

            if data in keywords:
                tokens.append(Token(data, 'keyword'))
            elif kind == 'error':
                raise ValueError(f'invalido: {data!r}')
            elif kind not in ignore:
                tokens.append(Token(data, kind))

        return tokens

    return lexer

In [174]:
# Token kinds and the regular expression recognising each one.
# The insertion order is kept, so 'var' is tried first and 'space' last.
defs = dict(
    var=r'[a-zA-Z_]\w*',              # identifiers
    int=r'\d+',                       # runs of digits
    op=r'\+|\-|\*\*?|==|!=|<=?|>=?',  # + - * ** == != < <= > >=
    colon=r':',
    lpar=r'\(',
    rpar=r'\)',
    comment=r'\#[^\n]*',              # from '#' up to the end of the line
    space=r'\s+',                     # any whitespace, newlines included
)

In [172]:
# Sample Python snippet (a recursive factorial named `fat`).
# NOTE(review): presumably meant as input for the lexer; none of the cells
# visible here actually use it.
code = """
def fat(n):
    return 1 if n == 0 else n * fat(n - 1)
"""

In [177]:
# Token kinds that are recognised but dropped from the lexer's output.
ignored_kinds = {'space', 'comment'}
# Lexemes reported with kind 'keyword' instead of the kind that matched.
reserved_words = {'def', 'if', 'else', 'return'}

lex = make_lexer(defs, ignore=ignored_kinds, keywords=reserved_words)

# 'definicao' merely starts with 'def', so it must come out as a 'var'.
lex('definicao if 42 else 314')

[Token('definicao', 'var'),
 Token('if', 'keyword'),
 Token('42', 'int'),
 Token('else', 'keyword'),
 Token('314', 'int')]

In [156]:
# Plain str for reference: its repr is just the quoted text.
str('42')

'42'

In [78]:
# A Token is displayed through Token.__repr__, which shows the text and its kind.
Token('def', 'keyword')

Token('def', 'keyword')