## Regular Expressions

In [2]:
# Documentation:: https://docs.python.org/3/library/re.html
import re

In [3]:
# re.search scans the string from left to right and returns a match object
# for the first place where the pattern matches.
pattern = r'is my number'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Search anywhere in the string for a match
match  # The repr shows the span (start, end) and the matched text

<_sre.SRE_Match object; span=(5, 17), match='is my number'>

In [6]:
pattern = r'^is my number'  # ^ matches the beginning of the string
string = 'This is my number 111-456-7890... Gimme a ring.'
# The string begins with 'This', not 'is my number', so search finds nothing
match = re.search(pattern, string)
bool(match)  # No match was found, so this is False

False

In [8]:
# $ matches the end of the string. The dot is escaped (\.) so that it matches a
# literal '.'; an unescaped . matches any character, e.g. the 'o' in 'ringo'.
pattern = r'ring\.$'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
bool(match)

True

In [9]:
pattern = r'elephant'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
print(match)  # None is returned if no match is found

None


In [11]:
# * matches zero or more repetitions of the preceding character (here the A),
# so 'GC', 'GCA', 'GCAA', ... all satisfy this pattern.
pattern = r'GCA*'
string = 'ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
match = re.search(pattern, string)  # NB. search returns a match object for the first (leftmost) match only
bool(match)

True

In [None]:
# + matches one or more repetitions of the preceding character (here the A),
# so at least one A must follow 'GC'.
pattern = r'GCA+'
string = 'ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
match = re.search(pattern, string)
match

In [15]:
# {n} matches exactly n repetitions of the preceding character (here three As)
pattern = r'GCA{3}'
string = 'ATGCGCATTTTGCAAAGATTTCCAAGAGAGTTT'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(11, 16), match='GCAAA'>

In [14]:
# [] matches any single character from the set of characters inside the brackets.
pattern = r'[a-zA-Z ]+'  # A letter or a space; the trailing + matches a run of them at a time
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
bool(match)

True

In [20]:
# [0-9]+ matches one run of consecutive digits; the run ends at the first
# non-digit, so the first match stops before the '-' in the phone number.
pattern = r'[0-9]+'
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(18, 21), match='111'>

In [21]:
pattern = r'[0-9]*'  
# NB. * allows zero repetitions, so this pattern can match the empty string.
# re.search stops at the first position where the pattern matches; the string
# starts with a non-digit, so the match at position 0 is the empty string.
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 0), match=''>

In [22]:
pattern = r'[a-zA-Z ]*' 
# NB. * allows zero repetitions, so this pattern can match the empty string.
# Here the string starts with a digit, so zero letters/spaces match at
# position 0 and re.search returns the empty string.
string = '111-456-7890 This is my number... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 0), match=''>

In [28]:
# NB. For str patterns in Python 3, \w also matches non-ASCII word characters
# unless the re.ASCII flag is used.
pattern = r'\w+'  # \w is shorthand for a word character: [a-zA-Z0-9_] for ASCII text
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(0, 4), match='This'>

In [24]:
pattern = r'\W+'  # \W is shorthand for the inverse of \w: [^a-zA-Z0-9_]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # The first match is the space after 'This'
match  

<_sre.SRE_Match object; span=(4, 5), match=' '>

In [50]:
# NB. Inside [] the + is a literal plus sign, not a quantifier, so this set
# accepts digits, '+' and '-'. Only the + after the closing ] is a quantifier.
# NOTE(review): if the literal '+' is not intended, the set should be [\d-].
pattern = r'[\d+-]+'  # \d is shorthand for a digit: [0-9] for ASCII text
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match

<_sre.SRE_Match object; span=(18, 30), match='111-456-7890'>

In [26]:
pattern = r'\D+'  # \D is shorthand for the inverse of \d: [^0-9]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # Matches everything up to the first digit
match

<_sre.SRE_Match object; span=(0, 18), match='This is my number '>

In [32]:
pattern = r'\s+'  # \s is shorthand for a whitespace character: [ \t\n\r\f\v]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # The first match is the space after 'This'
match

<_sre.SRE_Match object; span=(4, 5), match=' '>

In [35]:
pattern = r'\S+'  # \S is shorthand for the inverse of \s: [^ \t\n\r\f\v]
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)  # The first match is the first word, up to the first space
match

<_sre.SRE_Match object; span=(0, 4), match='This'>

In [42]:
# Find all tokens in string...
pattern = r'\S+'  # A token is a run of one or more non-whitespace characters
string = 'This is my number 111-456-7890... Gimme a ring.'
re.findall(pattern, string)  # findall returns every non-overlapping match as a list of strings

['This', 'is', 'my', 'number', '111-456-7890...', 'Gimme', 'a', 'ring.']

In [38]:
# In a normal (non-raw) string literal each backslash must itself be escaped,
# so the twelve backslashes typed here print as six. Raw strings (r'...')
# avoid this doubling, which is why the patterns above are written as raw strings.
print('\\\\\\\\\\\\')

\\\\\\


##### Exercises

1. Write a regex to find all groups of numbers in the following string:
    - 'This is my number 111-456-7890... Gimme a ring.'

    
2. Write an expression to count the number of times an A is immediately followed by a B in the following string:
    - 'ABBBABABBBABBABABAAABABABAAAABABABABABAABABABABAB'

In [55]:
# Exercise 1: \d+ matches each run of consecutive digits, and findall
# returns all of the runs as a list.
pattern = r'\d+'
string = 'This is my number 111-456-7890... Gimme a ring.'
re.findall(pattern, string)

['111', '456', '7890']

In [57]:
# Exercise 2: every occurrence of 'AB' is an A immediately followed by a B,
# so the answer is the number of matches that findall returns.
pattern = r'AB'
string = 'ABBBABABBBABBABABAAABABABAAAABABABABABAABABABABAB'
len(re.findall(pattern, string))

19

In [62]:
pattern = r'(\d{3})-(\d{3})-(\d{4})'  # Each () creates a capturing group
string = 'This is my number 111-456-7890... Gimme a ring.'
match = re.search(pattern, string)
match
# match.groups() would return all captured groups as a tuple: ('111', '456', '7890')

<_sre.SRE_Match object; span=(18, 30), match='111-456-7890'>

In [63]:
# Uses the `match` object created by the previous cell.
match.group(1)  # Groups are numbered from 1, left to right; group(0) is the whole match

'111'

In [64]:
# How will we parse out phone numbers in several different formats?
# One sample number per line; the separators vary ('-', '.', '..', parentheses, or none).
PHONE_NUMBERS = """
111-456-7890
2228901234
333.456.3847
333..456.3847
(444) 456-7890
"""

In [65]:
def find_numbers(pattern, numbers):
    """Print, for each line of ``numbers``, the groups captured by ``pattern``.

    pattern: a compiled regular expression (its ``match`` method is called).
    numbers: a newline-separated block of text. Surrounding whitespace is
        stripped from the block and from each individual line.

    Each line is printed as ``<line> => <groups tuple>``, or
    ``<line> => No match`` when the pattern does not match it.
    """
    candidates = (raw.strip() for raw in numbers.strip().split('\n'))
    for candidate in candidates:
        # .match behaves like .search but only matches at the beginning of the string
        found = pattern.match(candidate)
        outcome = found.groups() if found else 'No match'
        print(candidate, '=>', outcome)

In [66]:
# We can compile a pattern once and reuse it, which is more efficient when the
# same pattern is applied many times.
# A compiled pattern has its own search and match methods, which we call instead of re.search / re.match.
pattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})$') 
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => No match
333.456.3847 => No match
333..456.3847 => No match
(444) 456-7890 => No match


In [67]:
# ? matches 0 or 1 occurrences of the preceding item, so each '-' separator is now optional.
pattern = re.compile(r'^(\d{3})-?(\d{3})-?(\d{4})$')  
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => ('222', '890', '1234')
333.456.3847 => No match
333..456.3847 => No match
(444) 456-7890 => No match


In [68]:
# \D matches any non-digit character. * matches 0 or more occurrences of the preceding item,
# so \D* accepts any run of separator characters (or none at all) between the digit groups.
pattern = re.compile(r'^(\d{3})\D*(\d{3})\D*(\d{4})$') 
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => ('222', '890', '1234')
333.456.3847 => ('333', '456', '3847')
333..456.3847 => ('333', '456', '3847')
(444) 456-7890 => No match


In [69]:
# Putting it all together.
# The extra \D* right after ^ allows leading non-digits, such as the opening '(' of '(444)'.
pattern = re.compile(r'^\D*(\d{3})\D*(\d{3})\D*(\d{4})$')
find_numbers(pattern, PHONE_NUMBERS)

111-456-7890 => ('111', '456', '7890')
2228901234 => ('222', '890', '1234')
333.456.3847 => ('333', '456', '3847')
333..456.3847 => ('333', '456', '3847')
(444) 456-7890 => ('444', '456', '7890')


In [70]:
# Vanity numbers mix digits and letters, so \w is used instead of \d.
# (1)? is an optional group for a leading 1; when it is absent, its entry in groups() is None.
pattern = r'^(1)?-?(\w{3})-(\w{3})-(\w{4})$'
string = '1-800-kid-CARS'
match = re.match(pattern, string)  # re.match only matches at the beginning of the string
match.groups()

('1', '800', 'kid', 'CARS')

In [None]:
# Match a mid-west name
# The character set allows letters and hyphens only, so 'Mary-Jo' matches as a whole.
pattern = r'^([A-Za-z-]+)$'
string = 'Mary-Jo'

match = re.match(pattern, string)
# Fall back to an empty tuple when nothing matched, so `data` is always a tuple
data = match.groups() if match else ()

print(data)

In [71]:
# Named groups
# (?P<name>...) creates a group that can be looked up by name as well as by number.
pattern = r'^(?P<first_name>[A-Za-z-]+) (?P<last_name>[A-Za-z-]+)$'
string = 'Mary-Jo Lue-ellen'
match = re.match(pattern, string)
print(match.groups())  # Named groups still appear in groups(), in order
print(match.group('first_name'))
print(match.group('last_name'))

('Mary-Jo', 'Lue-ellen')
Mary-Jo
Lue-ellen


In [None]:
# Flags
# Flags change how a pattern is interpreted; several flags are combined with
# the bitwise OR operator |.
pattern = r'''
    ^                        # match beginning of string
    (?P<first_name>[A-Z-]+)  # match first name
    \s+                      # match space between names
    (?P<last_name>[A-Z-]+)   # match last name
    $                        # match end of string
'''

string = 'Mary-Jo Lue-ellen'
flags = (
    re.IGNORECASE |  # Case-insensitive matching: [A-Z] also matches lowercase letters
    re.VERBOSE  # Ignore whitespace in the pattern and allow # comments inside it
)
match = re.match(pattern, string, flags=flags)

print(match.groups())
print(match.group('first_name'))
print(match.group('last_name'))