In [None]:
import re

## To use or not to use re.compile()

In [None]:
# Baseline: the pattern is passed as a plain string to re.match() on every timed call.
!python -m timeit -s "import re" "re.match('hello', 'hello world')"

In [None]:
# Precompiled variant: re.compile() runs once in the -s setup, so only h.match() is timed.
!python -m timeit -s "import re; h=re.compile('hello')" "h.match('hello world')"

In [None]:
# Module-level re.match() with a string pattern, repeated n=1000 times in one timed statement.
!python -m timeit -s "import re; n=1000" "[ re.match('hello', 'hello world') for i in range(n) ]"

In [None]:
# Here re.compile() is part of the timed statement (not the -s setup), followed by n=1000 h.match() calls.
!python -m timeit -s "import re; n=1000" "h=re.compile('hello'); [ h.match('hello world') for i in range(n) ]"

**Hint**: If you need to use a particular pattern multiple times, it is better to compile it once with `re.compile()` and reuse the compiled object.

## split()

In [None]:
# Sample sentence containing irregular runs of spaces between some of the words.
text = "Intact Financial Corporation is   the largest   provider of property and casualty   insurance in Canada"

In [None]:
# Built-in string split with no separator argument, shown for comparison with the regex split below.
text.split()

In [None]:
# Split on every run of one or more whitespace characters, calling split()
# directly on the compiled pattern object.
regex = re.compile(r'\s+')
regex.split(text)

## findall()

In [None]:
# Three newline-separated "label: value" lines; the values contain a decimal number,
# years in parentheses and a number with a thousands separator.
text = "Headquarters: Toronto\nRevenue: 8.747 billion CAD (2017)\nNumber of employees: Over 13,000 (2017)"
print(text)

In [None]:
# \d+ matches each run of consecutive digits, so a '.' or ',' inside a number
# splits it into separate results (e.g. "8.747" comes back as "8" and "747").
# The pattern is a raw string: '\d' in a normal string literal is an invalid
# string escape that current Python versions warn about.
regex = re.compile(r'\d+')
regex.findall(text)

#### Task: extract each number together with its decimal point or thousands separator. Expected result: ['8.747', '2017', '13,000', '2017']

In [None]:
# A number is a run of digits, optionally followed by ONE '.' or ',' and more digits.
# Making the whole "separator + digits" part optional (instead of requiring digits on
# both sides of an optional separator) means single-digit numbers are matched too.
# Raw string so that \d reaches the regex engine without a string-escape warning.
regex = re.compile(r'\d+(?:[.,]\d+)?')
regex.findall(text)

## search() vs. match()

Python offers two different primitive operations based on regular expressions: `re.match()` checks for a match only at the beginning of the string, while `re.search()` checks for a match anywhere in the string.

In [84]:
# Same sample text as in the findall() section, redefined at the start of this section.
text = "Headquarters: Toronto\nRevenue: 8.747 billion CAD (2017)\nNumber of employees: Over 13,000 (2017)"

In [85]:
print(text)

Headquarters: Toronto
Revenue: 8.747 billion CAD (2017)
Number of employees: Over 13,000 (2017)


In [95]:
# Number pattern used by the search()/match() cells below: digits, optionally followed
# by one '.' or ',' and more digits. The optional group lets single-digit numbers match,
# and the raw string avoids the invalid '\d' string-escape warning.
regex = re.compile(r'\d+(?:[.,]\d+)?')

Result 1

In [96]:
# search() scans the whole text and stops at the first number it finds.
search = regex.search(text)
start, end = search.span()
print('Starting Position: ', start)
print('Ending Position: ', end)
print(search.group())

Starting Position:  31
Ending Position:  36
8.747


In [97]:
# Begin scanning at index 32, i.e. inside "8.747": only the trailing digits
# after that position can still be matched.
search = regex.search(text, 32)
start, end = search.span()
print('Starting Position: ', start)
print('Ending Position: ', end)
print(search.group())

Starting Position:  33
Ending Position:  36
747


In [98]:
# Begin scanning at index 36, just past the first number, to reach the next one.
search = regex.search(text, 36)
start, end = search.span()
print('Starting Position: ', start)
print('Ending Position: ', end)
print(search.group())

Starting Position:  50
Ending Position:  54
2017


In [101]:
# match() only tries the pattern at index 0. The text begins with "Headquarters",
# not a digit, so no match object is produced and None is printed.
# Raw string avoids the invalid '\d' string escape; the optional "(separator + digits)"
# group also allows single-digit numbers.
regex = re.compile(r'\d+(?:[.,]\d+)?')
search = regex.match(text)
print(search)

None


In [104]:
# The text starts with "H", so match() succeeds at index 0 and a Match object is printed.
regex = re.compile('H')
search = regex.match(text)
print(search)

<re.Match object; span=(0, 1), match='H'>


## Groups

In [105]:
# Three rows, each with a numeric code, a three-letter abbreviation and a name,
# separated by a varying number of spaces.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""

In [106]:
print(text)

101   COM   Computers
205   MAT   Mathematics
189   ENG    English


In [111]:
# Three capture groups per row: the digits, exactly three uppercase letters, and a
# name of at least four letters, with optional whitespace between them.
# Because the pattern contains several groups, findall() returns one tuple of group
# values per match.
pattern = r'([0-9]+)\s*([A-Z]{3})\s*([A-Za-z]{4,})'
regex = re.compile(pattern)
regex.findall(text)

[('101', 'COM', 'Computers'),
 ('205', 'MAT', 'Mathematics'),
 ('189', 'ENG', 'English')]

In [153]:
# Space-separated file names, each with a different extension.
file_list = "one.txt two.zip three.doc"

In [162]:
# One capture group: a literal dot followed by one or more word characters,
# i.e. the extension of each file name (dot included).
pattern = r'(\.\w+)'
regex = re.compile(pattern)
regex.findall(file_list)

['.txt', '.zip', '.doc']

## Lookahead 

In [166]:
text = "100 dollars 200 coins"
# Lookahead placed after the match: the engine first matches the digits with \d+,
# then asserts that what immediately follows is a whitespace character and "dollars".
# The asserted text is checked but is not part of the returned match.
pattern = r"\d+(?=\sdollars)"
regex = re.compile(pattern)
regex.findall(text)

['100']

In [180]:
# Lookahead placed before the match: at the current position the engine first asserts
# that digits followed by a whitespace character and "dollars" lie ahead.
# If the assertion succeeds, the engine then matches the digits with \d+.
pattern = r"(?=\d+\sdollars)\d+"
regex = re.compile(pattern)
regex.findall(text)

['100']

## Negative Lookahead

In [181]:
# Negative lookahead placed after the match: once \d+ has matched, asserts that what
# immediately follows is neither a digit nor a whitespace character plus "dollars".
# The \d alternative stops the engine from backtracking to a shorter run such as "10"
# (which is followed by "0", not by " dollars") and reporting that instead.
pattern = r"\d+(?!\d|\sdollars)"
regex = re.compile(pattern)
regex.findall(text)

['200']

In [182]:
# Negative lookahead placed before the match: asserts that, at the current position,
# what follows is NOT digits and then the characters " dollars"; only if that holds
# does the engine go on to match the digits with \d+.
pattern = r"(?!\d+ dollars)\d+"
regex = re.compile(pattern)
regex.findall(text)

['200']

## Lookbehind

In [258]:
# Two labelled ten-digit numbers; the label in front of each one is what the
# lookbehind examples below test for.
text = "Claim number: 1234567890 Policy number: 0123456789"

In [240]:
# Lookbehind placed before the match: asserts that the text immediately preceding the
# current position is "Claim number:". The match itself is the optional whitespace
# plus ten digits, which is why the result keeps the leading space.
pattern = r"(?<=Claim\snumber\:)\s*\d{10}"
regex = re.compile(pattern)
regex.findall(text)

[' 1234567890']

In [259]:
# Lookbehind placed after the match: once ten digits have been matched, asserts that
# the text ending at the current position is "Claim number:", one whitespace
# character and those ten digits. Only the digits are returned.
pattern = r"\d{10}(?<=Claim\snumber\:\s\d{10})"
regex = re.compile(pattern)
regex.findall(text)

['1234567890']

## Negative Lookbehind

In [274]:
# Negative lookbehind placed before the match: asserts that the text immediately
# preceding the current position is NOT "Policy number:", then matches one
# whitespace character and ten digits (so the result keeps the leading space).
pattern = r"(?<!Policy\snumber\:)\s\d{10}"
regex = re.compile(pattern)
regex.findall(text)

[' 1234567890']

In [267]:
# Negative lookbehind placed after the match: once ten digits have been matched,
# asserts that the text ending at the current position is NOT "Policy number:",
# one whitespace character and those ten digits.
pattern = r"\d{10}(?<!Policy\snumber\:\s\d{10})"
regex = re.compile(pattern)
regex.findall(text)

['1234567890']

### Non-Capturing Groups

In [296]:
text = "Bob chloe Edgar"
# (?i:...) is a non-capturing group that turns on case-insensitive matching for the
# alternatives inside it, so the lowercase "chloe" in the text matches "Chloe".
pattern = "(?i:Bob|Chloe)"
regex = re.compile(pattern)
regex.findall(text)

['Bob', 'chloe']

In [281]:
text = "Bob says: GO"
# The outer (?:...) groups without capturing; only the inner (\w+) captures, so
# findall() returns just the word that follows "Bob says: ".
# Raw string: '\w' in a normal string literal is an invalid string escape that
# current Python versions warn about.
pattern = r"(?:Bob says: (\w+))"
regex = re.compile(pattern)
regex.findall(text)

['GO']

In [298]:
text = "Claim number: 1234567890 Policy number: 0123456789"
# The outer (?:...) is non-capturing; the inner (\w+) captures the word characters
# right after "Claim number: ", so only the claim number is returned.
# Raw string keeps \w as a regex escape instead of an invalid string escape.
pattern = r"(?:Claim number: (\w+))"
regex = re.compile(pattern)
regex.findall(text)

['1234567890']

In [303]:
text = "claim number: 1234567890 Policy number: 0123456789"
# (?-i:...) is a non-capturing group with the case-insensitive flag switched off inside
# it. The text starts with a lowercase "claim", which does not match "Claim", so
# findall() returns an empty list.
# Raw string keeps \w as a regex escape instead of an invalid string escape.
pattern = r"(?-i:Claim number: (\w+))"
regex = re.compile(pattern)
regex.findall(text)

[]

In [None]:
text = "A00 11B C22D"
# The built-in re module does not understand the branch-reset group "(?|...)";
# re.compile() rejects it with re.error. Emulate it with plain alternation instead:
# every alternative has its own capture group, exactly one of them participates in
# any given match, and Match.lastindex tells us which one.
# NOTE(review): the third-party "regex" package linked under References appears to be
# the engine this syntax was written for — confirm before switching to it.
pattern = r"A(\d+)|(\d+)B|C(\d+)D"
regex = re.compile(pattern)
[m.group(m.lastindex) for m in regex.finditer(text)]

### References:

[Regex](https://pypi.org/project/regex/)