In [1]:
#Import necessary modules
import re

## Raw strings
Strings prefixed with an *r*, which tells python not to handle back slices in any special way

In [2]:
print ('\tTab')

	Tab


In [3]:
print (r'\tTab')

\tTab


In [4]:
text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

Use `compile` to assign a pattern

Use `finditer` to find a pattern in a text. Returns an iterator which contains all of the matches

In [5]:
pattern = re.compile(r'abc')

print(pattern)

re.compile('abc')


In [6]:
matches = pattern.finditer(text_to_search)
print (matches)

<callable_iterator object at 0x7fd7a157fdc0>


In [7]:
for match in matches:
    print (match)
    print (match.group(0))

<re.Match object; span=(1, 4), match='abc'>
abc


`span` is the beginning and the end of the match

In [8]:
print (text_to_search[1:4])

abc


## Special characters in regular expression

These can be scaped using \ before the special characters

Character | Use
-----|-----
'.' | Any character except new line
'\d' | Any digit between (0,9)
'\D'| Anything that is NOT a digit
'\w'| word character (a-z,A-Z,0-9,_ )
'\W'| NOT a word character (a-z,A-Z,0-9,_ )
'\s'| Whitespace (space, tab, newline)
'\S'| NOT whitespace (space, tab, newline)
'\b'| Word boundary (start of the line, white space)
'\B'| NOT Word boundary (start of the line, white space)
'\B'| NOT Word boundary (start of the line, white space)
'^'| Beginning of a string
'$'| End of a string
'*'| 0 or more
'+'| 1 or more
'?'| 0 or one
{3}| Exactly 3
{3,4}| Between 3 and 4


In [9]:
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print (match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


## Character sets

Character sets are specified within [] and imply a set of possible characters. Within a character set, '.' is no longer a special character. Typical combinations include [a-zA-Z0-9]. Inside a character set, the symbol '^' NEGATES everything within a character set

In [10]:
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print (match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [11]:
text = '''
mat
cat
bat
'''

pattern = re.compile(r'[^b]at')
matches = pattern.finditer(text)
for match in matches:
    print (match)

<re.Match object; span=(1, 4), match='mat'>
<re.Match object; span=(5, 8), match='cat'>


## Quantifier special characters

In [12]:
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = pattern.finditer(text_to_search)
for match in matches:
    print (match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [14]:
pattern = re.compile(r'Mr\.?\s[A-Z]\w+')
matches = pattern.finditer(text_to_search)
for match in matches:
    print (match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>


## Groups

Groups are written within (). You can build a group with different elements like (a|b|c). You can make all elements within a group present or not (www\\.)?

In [15]:
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w+')
matches = pattern.finditer(text_to_search)
for match in matches:
    print (match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>


In [18]:
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z-]+\.(com|edu|net)')
matches = pattern.finditer(emails)
for match in matches:
    print (match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [19]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''
pattern = re.compile(r'https?://(www\.)?\w+.(com|gov)')
matches = pattern.finditer(urls)
for match in matches:
    print (match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


#### Groups are stored within a match and you can access them with the method `group`

In [20]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for match in matches:
    print (match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [22]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for match in matches:
    print (match.group(0)) # Group 0 corresponds to the full string
    print (match.group(1)) # Group 0 corresponds to the full string

https://www.google.com
www.
http://coreyms.com
None
https://youtube.com
None
https://www.nasa.gov
www.


## Substitutions: Back reference

These can be performed using the `sub` method combined with the groups

In [28]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
subbed_urls = pattern.sub(r'\2\3',urls) # Replace urls with groups 2 and 3 in pattern
print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



## Other methods

`findall` returns the matches as a list of strings. If it is matching groups it will only match the groups as a list of tuples

In [26]:
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s([A-Z]\w)+')
matches = pattern.findall(text_to_search)
print(matches)

[('Mr', 'Sc'), ('Mr', 'Sm'), ('Ms', 'Da'), ('Mrs', 'Ro')]


In [27]:
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = pattern.findall(text_to_search)
print (matches)

['321-555-4321', '123.555.1234', '123*555*1234', '800-555-1234', '900-555-1234']


`match` and `search` return only the first match

## Flags

There are several flags. One example is in the re.IGNORECASE, or re.I, which ignores capital or non capital letters

In [33]:
sentence = 'Start a sentence and then bring it to an end'
pattern = re.compile(r'^start[\s\w]+(end)$', re.I)
matches = pattern.search(sentence)
print(matches)

<re.Match object; span=(0, 44), match='Start a sentence and then bring it to an end'>


## More on groups

Each sub-pattern inside a parenthesis will be captured as a group. We can specify as many groups as we wish within a parenthesis

In [37]:
target_string = "The price of PINEAPPLE ice cream is 20"
pattern = re.compile(r'(\b[A-Z]+\b).+(\b\d+)$')
matches = pattern.finditer(target_string)
for match in matches:
    print (match)
    print (match.group(1))
    print (match.group(2))

<re.Match object; span=(13, 38), match='PINEAPPLE ice cream is 20'>
PINEAPPLE
20


In [43]:
target_string = "The price of ice-creams PINEAPPLE 20 MANGO 30 CHOCOLATE 40"
pattern = re.compile(r'(\b[A-Z]+\b).(\b\d+\b)')
matches = pattern.finditer(target_string)
for match in matches:
    print (match)
    print (match.group(1,2)) # This returns a tuple with the first and the second group for each match

<re.Match object; span=(24, 36), match='PINEAPPLE 20'>
('PINEAPPLE', '20')
<re.Match object; span=(37, 45), match='MANGO 30'>
('MANGO', '30')
<re.Match object; span=(46, 58), match='CHOCOLATE 40'>
('CHOCOLATE', '40')


#### Nested groups

Nested groups refer to multiple layers of information. The results of the captured groups are in the order in which they are defined (in order by open parenthesis)

In [45]:
text = '''
Jan 1987
May 1969
Aug 2011
'''
pattern = re.compile(r'([A-Z]\w\w\s(\d{4}))')
matches = pattern.finditer(text)
for match in matches:
    print (match.group(1))
    print (match.group(2))

Jan 1987
1987
May 1969
1969
Aug 2011
2011
