In [1]:
import re
from re import match, search, findall

## Exemplos de busca

In [2]:
match('abc', 'abc')

<_sre.SRE_Match object; span=(0, 3), match='abc'>

In [3]:
search('abc', 'efbabc')

<_sre.SRE_Match object; span=(3, 6), match='abc'>

In [4]:
findall('abc', '123abc456abc')

['abc', 'abc']

## Metacaracteres

### Ponto
Representa qualquer caractere (exceto nova linha, a menos que a flag `re.DOTALL` seja usada)

In [5]:
match('.', 'abc')

<_sre.SRE_Match object; span=(0, 1), match='a'>

In [6]:
match('.', '012')

<_sre.SRE_Match object; span=(0, 1), match='0'>

In [7]:
match('.', '    ')

<_sre.SRE_Match object; span=(0, 1), match=' '>

In [8]:
match('.', '\t\t')

<_sre.SRE_Match object; span=(0, 1), match='\t'>

In [9]:
search('.', '\nabc')

<_sre.SRE_Match object; span=(1, 2), match='a'>

In [10]:
findall('.', 'abc')

['a', 'b', 'c']

### Âncoras ^ $

In [11]:
# ^ anchors the match at the start of the string. Without re.MULTILINE the
# positions right after each '\n' do not count, so only 'a' is found.
findall('^.', 'abc\ndef\nghi')

['a']

In [12]:
# With re.MULTILINE, ^ also matches right after every newline, so this finds
# the first character of each line.
findall('^.', 'abc\ndef\nghi', re.MULTILINE)

['a', 'd', 'g']

In [13]:
# $ (not ^) is the end anchor. Without re.MULTILINE it only matches at the end
# of the string, so only the last character 'i' is found.
findall('.$', 'abc\ndef\nghi')

['i']

In [14]:
# With re.MULTILINE, $ also matches just before every newline, so this finds
# the last character of each line.
findall('.$', 'abc\ndef\nghi', re.MULTILINE)

['c', 'f', 'i']

In [15]:
# Using both anchors together: the pattern has to span from the start to the
# end of the string, which here is a single character.
match('^.$', 'a')

<_sre.SRE_Match object; span=(0, 1), match='a'>

In [16]:
# Start immediately followed by end: this matches the empty string.
# NOTE: per the `re` documentation, $ also matches just before a newline at the
# end of the string, so the empty string is not strictly the only input that
# satisfies '^$'.
match('^$', '')

<_sre.SRE_Match object; span=(0, 0), match=''>

In [17]:
# In MULTILINE mode the single '\n' delimits two empty lines (one before and
# one after it), and '^$' matches each of them.
findall('^$', '\n', re.MULTILINE)

['', '']

## Classe de caracteres

In [18]:
findall('[aeiou]', 'eliézer bourchardt')

['e', 'i', 'e', 'o', 'u', 'a']

In [19]:
# A ^ as the first character inside the brackets negates the class: match any
# character that is NOT one of a, e, i, o, u. The accented 'é' and the space
# are not in the set, so they are returned too.
findall('[^aeiou]', 'eliézer bourchardt')

['l', 'é', 'z', 'r', ' ', 'b', 'r', 'c', 'h', 'r', 'd', 't']

In [20]:
# A hyphen inside the brackets defines a range: any lowercase letter from a to
# f. Ranges are case-sensitive, so the uppercase 'E' and 'B' are not matched.
findall('[a-f]', 'Eliézer Bourchardt')

['e', 'c', 'a', 'd']

In [21]:
findall('[a-fA-Z]', 'Eliézer Bourchardt')

['E', 'e', 'B', 'c', 'a', 'd']

In [22]:
findall('[a-zA-Z]', 'Eliézer Bourchardt 84')

['E',
 'l',
 'i',
 'z',
 'e',
 'r',
 'B',
 'o',
 'u',
 'r',
 'c',
 'h',
 'a',
 'r',
 'd',
 't']

## Sequências especiais

In [23]:
# Special sequences.
# \w matches any "word" character. For str patterns in Python 3 that means
# Unicode letters and digits plus the underscore, which is why 'é' is matched
# here. It is only equivalent to [a-zA-Z0-9_] when the re.ASCII flag is given:
#   findall('[a-zA-Z0-9_]', text) == findall(r'\w', text, re.ASCII)
# The pattern is written as a raw string so the backslash reaches the regex
# engine instead of being treated as an (invalid) string escape sequence.
findall(r'\w', 'Eliézer Bourchardt 84')


['E',
 'l',
 'i',
 'é',
 'z',
 'e',
 'r',
 'B',
 'o',
 'u',
 'r',
 'c',
 'h',
 'a',
 'r',
 'd',
 't',
 '8',
 '4']

In [24]:
# Other special sequences.
# The bracket classes below are exact equivalents only with the re.ASCII flag
# (or for bytes patterns). For str patterns Python 3 additionally matches the
# corresponding Unicode digits, whitespace and word characters by default.
# \d == [0-9]
# \D == [^0-9]
# \s == [ \t\n\r\f\v]    (the class starts with a space character)
# \S == [^ \t\n\r\f\v]
# \w == [a-zA-Z0-9_]
# \W == [^a-zA-Z0-9_]

## | (ou)

In [25]:
match('a|b', 'abc')

<_sre.SRE_Match object; span=(0, 1), match='a'>

In [26]:
match('a|b', 'bce')

<_sre.SRE_Match object; span=(0, 1), match='b'>

In [27]:
match('a|b', 'cde')

## Repetições com quantidades específicas

In [28]:
# {4} means exactly four repetitions of the preceding element.
match(r'\d{4}', '1234') 

<_sre.SRE_Match object; span=(0, 4), match='1234'>

In [29]:
match(r'\d{4}', '1234567')

<_sre.SRE_Match object; span=(0, 4), match='1234'>

In [30]:
match(r'\d{4}', '123')

In [31]:
# Minimum quantity: {2,} means at least two repetitions, with no upper bound.
match(r'\d{2,}', '12')

<_sre.SRE_Match object; span=(0, 2), match='12'>

In [32]:
# At least 2, greedy: the quantifier consumes as many digits as it can, so all
# eight digits are matched.
match(r'\d{2,}', '12345678')

<_sre.SRE_Match object; span=(0, 8), match='12345678'>

In [33]:
match(r'\d{2,}', '1')

In [34]:
# At least 2, lazy.
# A ? placed after a quantifier turns it from greedy into lazy, so it stops as
# soon as the minimum of two digits has been matched.
match(r'\d{2,}?', '12345678')

<_sre.SRE_Match object; span=(0, 2), match='12'>

In [35]:
# Minimum and maximum: {2,4} matches from two to four repetitions. Being
# greedy, it takes four of the five available digits.
match(r'\d{2,4}', '12345')

<_sre.SRE_Match object; span=(0, 4), match='1234'>

In [36]:
match(r'\d{2,4}', '1234')

<_sre.SRE_Match object; span=(0, 4), match='1234'>

In [37]:
match(r'\d{2,4}', '123')

<_sre.SRE_Match object; span=(0, 3), match='123'>

In [38]:
match(r'\d{2,4}', '12')

<_sre.SRE_Match object; span=(0, 2), match='12'>

In [39]:
match(r'\d{2,4}', '1')

In [40]:
match(r'\d{2,4}?', '12345')

<_sre.SRE_Match object; span=(0, 2), match='12'>

### 0 ou 1 ocorrência

In [41]:
match(r'\d{0,1}', '12345')

<_sre.SRE_Match object; span=(0, 1), match='1'>

In [42]:
# Omitting the lower bound makes it default to 0, so {,1} is equivalent to
# {0,1}.
match(r'\d{,1}', '12345')

<_sre.SRE_Match object; span=(0, 1), match='1'>

In [43]:
# Lazy version (this is not a negation): the trailing ? makes the quantifier
# prefer the fewest repetitions, which is zero, hence the empty match.
match(r'\d{,1}?', '12345')

<_sre.SRE_Match object; span=(0, 0), match=''>

In [44]:
# {0,1} can be shortened to the ? quantifier (zero or one occurrence).
match(r'\d?', '12345')

<_sre.SRE_Match object; span=(0, 1), match='1'>

In [45]:
# ?? is the lazy form of ?: it prefers zero occurrences, so the match is empty.
match(r'\d??', '12345')

<_sre.SRE_Match object; span=(0, 0), match=''>

### 0 ou mais vezes

In [46]:
match(r'\d{0,}', '12345')

<_sre.SRE_Match object; span=(0, 5), match='12345'>

In [47]:
# With both bounds omitted, {,} behaves like {0,}: zero or more repetitions.
match(r'\d{,}', '12345')

<_sre.SRE_Match object; span=(0, 5), match='12345'>

In [48]:
# {0,} can be shortened to the * quantifier (zero or more occurrences).
match(r'\d*', '12345')

<_sre.SRE_Match object; span=(0, 5), match='12345'>

In [49]:
# *? is the lazy form of *: it prefers zero repetitions, so the match is empty.
match(r'\d*?', '12345')

<_sre.SRE_Match object; span=(0, 0), match=''>

### 1 ou mais vezes

In [50]:
match(r'\d{1,}', '12345')

<_sre.SRE_Match object; span=(0, 5), match='12345'>

In [51]:
# {1,} is equivalent to the + shorthand (one or more occurrences).

In [52]:
match(r'\d+', '12345')

<_sre.SRE_Match object; span=(0, 5), match='12345'>

In [53]:
match(r'\d+', 'abc')

In [54]:
# +? is the lazy form of +: it stops after the single digit that is required.
match(r'\d+?', '12345')

<_sre.SRE_Match object; span=(0, 1), match='1'>

## Exemplo prático da importância da repetição preguiçosa

In [55]:
text = 'attr1="value1" attr2="value2"'

In [56]:
findall(r'".+"', text)

['"value1" attr2="value2"']

In [57]:
findall(r'".+?"', text)

['"value1"', '"value2"']

In [58]:
text = 'attr1="" attr2=""'

In [59]:
findall(r'".+?"', text)

['"" attr2="']

In [60]:
findall(r'".*?"', text)

['""', '""']

## Match Objects

In [61]:
# Keep the match object around so its methods can be inspected one by one.
# Compiling first and calling .match() on the compiled pattern yields the same
# match object as the module-level match() helper.
digits = re.compile(r'\d+')
m = digits.match('12345')

In [62]:
type(m)

_sre.SRE_Match

In [63]:
m.group()

'12345'

In [64]:
m.start()

0

In [65]:
m.end()

5

In [66]:
m.span()

(0, 5)

## Grupo de Captura

In [67]:
# Sample tag used by the capture-group examples.
html = '<input type="text" id="id_cpf" name="cpf">'
# Each pair of parentheses is a capture group: the tag name, then the values of
# the type, id and name attributes, in that fixed order.
# Attribute values use [^"]* instead of .+? so that a value always stops at its
# own closing quote and an empty value (e.g. type="") is still captured; with
# .+? an empty value makes the group run past the quote or the match fail.
pattern = r'<(.+?) type="([^"]*)" id="([^"]*)" name="([^"]*)"'

In [68]:
m = match(pattern, html)

In [69]:
m

<_sre.SRE_Match object; span=(0, 41), match='<input type="text" id="id_cpf" name="cpf"'>

In [70]:
m.groups()

('input', 'text', 'id_cpf', 'cpf')

In [71]:
m.group(0)

'<input type="text" id="id_cpf" name="cpf"'

In [72]:
m.group(1)

'input'

In [73]:
m.group(1, 2, 3)

('input', 'text', 'id_cpf')

### Generalização do pattern

In [74]:
# The same tag written twice with its attributes in a different order; a
# pattern that hard-codes the attribute order can only match the first one.
html1 = '<input type="text" id="id_cpf" name="cpf">'
html2 = '<input id="id_cpf" name="cpf" type="text">'

In [75]:
# Order-independent version. The outer non-capturing group (?:...) is repeated
# with *, and on each repetition the alternation accepts whichever of
# type / id / name comes next, followed by an optional space. Group numbers
# stay fixed (1=tag, 2=type, 3=id, 4=name) regardless of the order in the text.
# Attribute values use [^"]* instead of .+? so that a value stops at its own
# closing quote and empty values (type="") are captured as '' rather than
# swallowing the attribute that follows.
pattern = r'<(.+?) (?:(?:type="([^"]*)"|id="([^"]*)"|name="([^"]*)") ?)*'

In [76]:
m = match(pattern, html1)
m

<_sre.SRE_Match object; span=(0, 41), match='<input type="text" id="id_cpf" name="cpf"'>

In [77]:
m.groups()

('input', 'text', 'id_cpf', 'cpf')

In [78]:
m = match(pattern, html2)
m

<_sre.SRE_Match object; span=(0, 41), match='<input id="id_cpf" name="cpf" type="text"'>

In [79]:
m.groups()

('input', 'text', 'id_cpf', 'cpf')

### Nome para os grupos (?P é uma notação do Python)

In [80]:
# Same order-independent pattern, with every capture group given a name via
# the (?P<name>...) syntax so the values can be read with m.groupdict() or
# m.group('name') instead of by position.
# Attribute values use [^"]* instead of .+? so that a value stops at its own
# closing quote and empty values (type="") are captured as '' rather than
# swallowing the attribute that follows.
pattern = r'<(?P<tag>.+?) (?:(?:type="(?P<type>[^"]*)"|id="(?P<id>[^"]*)"|name="(?P<name>[^"]*)") ?)*'

In [81]:
m = match(pattern, html1)
m

<_sre.SRE_Match object; span=(0, 41), match='<input type="text" id="id_cpf" name="cpf"'>

In [82]:
m.groups()

('input', 'text', 'id_cpf', 'cpf')

In [83]:
m.groupdict()

{'id': 'id_cpf', 'name': 'cpf', 'tag': 'input', 'type': 'text'}