In [0]:
import re
import io
import tokenize


# Token types produced by the standard `tokenize` module that py_tokens
# leaves out of its output.
BLACKLIST = {
    tokenize.ENCODING,
    tokenize.NEWLINE,
    tokenize.NL,
    tokenize.INDENT,
    tokenize.DEDENT,
    tokenize.ENDMARKER,
}

In [0]:
def make_lexer(defs, ignore=(), keywords=()):
    """
    Build a regex-based lexer from a mapping of token definitions.

    Args:
        defs: Mapping of token kind -> regular-expression source. The
            patterns are combined as alternatives in the mapping's iteration
            order, so earlier entries take precedence. The mapping itself is
            left unmodified.
        ignore: Collection of token kinds that are dropped from the output.
        keywords: Collection of lexemes that are emitted with kind
            ``'keyword'`` no matter which pattern matched them. This check
            runs before ``ignore`` is consulted.

    Returns:
        A function ``lexer(code)`` returning a list of ``(text, kind)``
        tuples.

    Raises:
        ValueError: raised by the returned lexer when a piece of input is
            matched only by the fallback ``'error'`` pattern.

    Note:
        The fallback pattern is compiled without ``re.DOTALL``, so a newline
        that no definition matches is skipped silently rather than reported.
    """
    # Work on a copy so the caller's dict does not gain an 'error' entry.
    patterns = dict(defs)
    patterns['error'] = r'.+?'
    named = [r'(?P<%s>%s)' % (kind, pattern)
             for (kind, pattern) in patterns.items()]
    regex = re.compile('|'.join(named))

    def lexer(code):
        tokens = []

        for m in regex.finditer(code):
            data = m.group()
            kind = m.lastgroup

            if data in keywords:
                # Same (text, kind) tuple shape as every other token.
                tokens.append((data, 'keyword'))
            elif kind == 'error':
                raise ValueError(f'invalido: {data!r}')
            elif kind not in ignore:
                tokens.append((data, kind))

        return tokens

    return lexer

In [0]:
def py_lex(code):
    """Return, as a list, every ``(text, kind)`` pair py_tokens yields for ``code``."""
    return list(py_tokens(code))

def py_tokens(code):
    """
    Yield ``(text, kind)`` pairs for ``code`` using Python's own tokenizer.

    ``kind`` is the token-type name looked up in ``tokenize.tok_name``.
    Tokens whose type is in the module-level BLACKLIST are skipped.
    """
    fd = io.BytesIO(code.encode('utf8'))
    # tokenize.tokenize is documented to take a readline-style callable that
    # returns one line of bytes per call.
    tks = tokenize.tokenize(fd.readline)
    next(tks)  # the first token is always the ENCODING marker; discard it
    for tk in tks:
        if tk.type not in BLACKLIST:
            yield (tk.string, tokenize.tok_name[tk.type])

In [0]:
# CREATE YOUR LEXER HERE
# Order matters: make_lexer joins the patterns as alternatives in dict order,
# so NAME has to come before the catch-all ANY.
my_lex = make_lexer({
    'NAME': r'[a-zA-Z_]\w*',
    'SPACE': r'\s+',
    # Catch-all: the lazy `.+?` matches a single non-newline character, which
    # is why multi-character lexemes such as `42` come out split.
    'ANY': r'.+?',
}, ignore={'SPACE', 'COMMENT'})

In [19]:
code = """
-42
"""

mine = my_lex(code)
expected = py_lex(code)

# zip() stops at the shorter list, so `cmp` only shows the aligned pairs.
# Equality is checked on the full lists so that a missing or extra token
# counts as a mismatch instead of being silently ignored.
cmp = list(zip(mine, expected))
print('todos iguais?', mine == expected)
cmp

todos iguais? False


[(('-', 'ANY'), ('-', 'OP')), (('4', 'ANY'), ('42', 'NUMBER'))]