# Chapter 7

## Pattern Matching with Regular Expressions


__Problem.__ Write a function that checks whether a string is a telephone number (returns True or False)

In [2]:
def isPhoneNumber(text):
    """Report whether text is exactly a phone number of the form DDD-DDD-DDDD.

    The string must be 12 characters long, with '-' at indexes 3 and 7 and a
    decimal digit (as judged by str.isdecimal) at every other index.

    Returns True when all of those conditions hold, otherwise False.
    """
    if len(text) != 12:
        return False
    for position, character in enumerate(text):
        if position in (3, 7):
            # The two separator slots must hold a literal dash.
            if character != '-':
                return False
        elif not character.isdecimal():
            # Every remaining slot must be a decimal digit.
            return False
    return True

In [3]:
# Positive case: a string laid out exactly as DDD-DDD-DDDD is accepted.
print('415-555-4242 is a phone number:')
print(isPhoneNumber('415-555-4242'))

415-555-4242 is a phone number:
True


In [4]:
# Negative case: 'Moshi moshi' is only 11 characters long, so the length check rejects it.
print('Moshi moshi is a phone number:')
print(isPhoneNumber('Moshi moshi'))

Moshi moshi is a phone number:
False



__Problem.__ Find the telephone numbers in a text

In [5]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
# Slide a 12-character window over the message and test every window.
# A full window only fits at start offsets 0 .. len(message) - 12, so the scan
# stops there instead of also testing the shorter trailing slices, which
# isPhoneNumber() always rejects on length. The printed output is unchanged.
for i in range(len(message) - 11):
    chunk = message[i:i + 12]
    if isPhoneNumber(chunk):
        print('The phone number was found: ' + chunk)
print('Done')

The phone number was found: 415-555-1011
The phone number was found: 415-555-9999
Done




## Regexes


### Regex Symbols

The ? matches zero or one of the preceding group.

The * matches zero or more of the preceding group.

The + matches one or more of the preceding group.

The {n} matches exactly n of the preceding group.

The {n,} matches n or more of the preceding group.

The {,m} matches 0 to m of the preceding group.

The {n,m} matches at least n and at most m of the preceding group.

{n,m}? or *? or +? performs a nongreedy match of the preceding group.

^spam means the string must begin with spam.

spam$ means the string must end with spam.

The . matches any character, except newline characters.

\d, \w, and \s match a digit, word, or space character, respectively.

\D, \W, and \S match anything except a digit, word, or space character, respectively.

[abc] matches any character between the brackets (such as a, b, or c).

[^abc] matches any character that isn’t between the brackets.


In [6]:
import re

# Compile the pattern: three digits, a dash, three digits, a dash, four digits.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# search() scans the text and returns a Match object for the first match,
# or None when nothing matches. `message` is the sample string assigned in
# the earlier window-scan cell.
matchObject = phoneNumRegex.search(message)
# group() with no argument returns the full matched text.
print(matchObject.group())

415-555-1011


In [7]:
# Parentheses create groups: group 1 is the area code, group 2 is the rest of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
matchObject = phoneNumRegex.search(message)
# group(n) returns only the text captured by the n-th group (numbering starts at 1).
print(matchObject.group(1))
print(matchObject.group(2))

415
555-1011


In [8]:
# groups() returns all captured groups at once, as a tuple of strings.
print(matchObject.groups())

('415', '555-1011')


In [9]:
# The tuple returned by groups() can be unpacked directly into separate variables.
areaCode, mainNumber = matchObject.groups()
print(areaCode + '\n' + mainNumber)

415
555-1011


In [10]:
# \( and \) match literal parentheses in the text; the unescaped ( ) still delimit groups.
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My phone number is (415) 555-4242.')
print(mo.groups())

('(415)', '555-4242')


In [11]:
# | = OR: the pattern matches either alternative.
# When both occur in the text, search() returns whichever one appears first.
heroRegex = re.compile (r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey.')
mo1.group()

'Batman'

In [12]:
# Same pattern, reversed text: 'Tina Fey' now occurs first, so it is the match.
mo2 = heroRegex.search('Tina Fey and Batman.')
mo2.group()

'Tina Fey'

In [13]:
# Alternatives inside a group share the common 'Bat' prefix.
# group() returns the whole match; group(1) would return only the alternative that matched.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
mo.group()

'Batmobile'

In [14]:
# (wo)? makes the group optional: it matches zero or one occurrence of 'wo'.

batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [15]:
# With one 'wo' present, the same optional-group pattern still matches.
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [16]:
# Optional area code: the leading (\d\d\d-)? group may be present or absent.

phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo = phoneRegex.search('My number is 415-555-4242')
mo.group()

'415-555-4242'

In [17]:
# No area code in the text: the optional group matches nothing and the rest still matches.
mo = phoneRegex.search('My number is 555-4242')
mo.group()

'555-4242'

In [18]:
# (wo)* matches the group zero or more times; this text has zero repetitions.

batRegex = re.compile(r'Bat(wo)*man')
mo = batRegex.search('The Adventures of Batman')
mo.group()

'Batman'

In [19]:
# Four repetitions of 'wo' are matched by the same (wo)* pattern.
mo = batRegex.search('The Adventures of Batwowowowoman')
mo.group()

'Batwowowowoman'

In [20]:
# (wo)+ matches the group one or more times, so plain 'Batman' (zero 'wo') does not match.

batRegex = re.compile(r'Bat(wo)+man')
mo = batRegex.search('The Adventures of Batman')
# search() returns None when nothing matches; test for None by identity (`is`), not equality.
mo is None

True

In [21]:
# One 'wo' satisfies (wo)+, so this text matches.
mo = batRegex.search('The Adventures of Batwoman')
mo.group()

'Batwoman'

In [22]:
# (Ha){3} matches exactly three consecutive repetitions of the group.

haRegex = re.compile(r'(Ha){3}')
mo = haRegex.search('HaHaHa')
mo.group()

'HaHaHa'

In [23]:
# A single 'Ha' is fewer than the three repetitions (Ha){3} requires, so search() returns None.
mo = haRegex.search('Ha')
# Test for None by identity (`is`), not equality.
mo is None

True

In [24]:
# Greedy vs. non-greedy matching.
# {3,5} is greedy by default: it takes as many repetitions as it can, up to 5.

greedyHaRegex = re.compile(r'(Ha){3,5}')
mo = greedyHaRegex.search('HaHaHaHaHa')
mo.group()

'HaHaHaHaHa'

In [25]:
# A ? after the quantifier makes it non-greedy: it stops at the minimum of 3 repetitions.
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
mo = nongreedyHaRegex.search('HaHaHaHaHa')
mo.group()

'HaHaHa'

In [26]:
# findall() returns a list of every match in the text, whereas search() returns
# a Match object for the first match only.
# With no groups in the pattern, each list item is the full matched string.

phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [None]:
# When the pattern has groups, findall() returns a list of tuples: one tuple per
# match, holding one string per group, e.g. ('415', '555', '9999') for the first number here.

phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')


### Shorthand Codes for Common Character Classes

\d Any numeric digit from 0 to 9.

\D Any character that is not a numeric digit from 0 to 9.

\w Any letter, numeric digit, or the underscore character. (Think of this as matching “word” characters.)

\W Any character that is not a letter, numeric digit, or the underscore character.

\s Any space, tab, or newline character. (Think of this as matching “space” characters.)

\S Any character that is not a space, tab, or newline.

[0-5] Any digit from 0 to 5; equivalent to (0|1|2|3|4|5).


In [28]:
# \d+ = one or more digits, \s = one whitespace character, \w+ = one or more word characters:
# together they match each "<number> <word>" pair in the lyric.
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

### Making your own character class

In [30]:
# Square brackets define a custom character class; this one matches any single vowel, lower- or uppercase.
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('Making Your Own Character Classes')

['a', 'i', 'o', 'u', 'O', 'a', 'a', 'e', 'a', 'e']

In [33]:
# A caret right after the opening bracket negates the class: it matches any single
# character that is NOT listed (here every non-vowel, spaces included).
vowelRegex = re.compile(r'[^aeiouAEIOU]')
vowelRegex.findall('Making Your Own Character Classes')

['M',
 'k',
 'n',
 'g',
 ' ',
 'Y',
 'r',
 ' ',
 'w',
 'n',
 ' ',
 'C',
 'h',
 'r',
 'c',
 't',
 'r',
 ' ',
 'C',
 'l',
 's',
 's',
 's']

In [35]:
# A caret ^ at the start of the pattern anchors the match to the beginning of the string.

beginsWithHello = re.compile(r'^Hello')
mo = beginsWithHello.search('Hello world!')
mo.group()

'Hello'

In [36]:
# The text does not begin with 'Hello', so search() returns None.
# Test for None by identity (`is`), not equality.
beginsWithHello.search('He said hello.') is None

True

In [38]:
# A dollar sign $ at the end of the pattern anchors the match to the end of the string;
# \d$ therefore matches only a final digit.
endsWithNumber = re.compile(r'\d$')
mo = endsWithNumber.search('Your number is 42')
mo.group()

'2'

In [39]:
# The text ends with '.', not a digit, so search() returns None.
# Test for None by identity (`is`), not equality.
endsWithNumber.search('Your number is forty two.') is None

True

__Problem.__ Match a string that consists entirely of numeric characters, from its first character to its last

In [45]:
# ^\d+$ is anchored at both ends, so the entire string must consist of one or more digits.
wholeStringIsNum = re.compile(r'^\d+$')
mo = wholeStringIsNum.search('1234567890')
mo.group()

'1234567890'

In [46]:
# Letters in the middle break the all-digits requirement, so search() returns None.
# Test for None by identity (`is`), not equality.
wholeStringIsNum.search('12345xyz67890') is None

True

In [47]:
# A single space is not a digit either, so search() again returns None.
# Test for None by identity (`is`), not equality.
wholeStringIsNum.search('12 34567890') is None

True

In [48]:
# . = wildcard: matches any single character except a newline.
# It matches exactly one character, which is why 'flat' yields 'lat'.

atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

In [None]:
# .* = any run of characters (zero or more of anything except a newline).
# Group 1 captures the text between 'First Name: ' and ' Last Name: ' ('Al' here);
# group 2 captures whatever follows 'Last Name: '.

nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
mo.group(1)

In [52]:
# Group 2 holds whatever followed 'Last Name: ' in the searched text.
mo.group(2)

'Sweigart'

In [53]:
# .*? is the non-greedy form: it matches as few characters as possible,
# so the match ends at the first '>'.

nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [54]:
# Greedy .* matches as many characters as possible, so the match runs to the last '>'.
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

In [55]:
# Case-insensitive matching: pass re.I (short for re.IGNORECASE) as the second argument to compile().

robocop = re.compile(r'robocop', re.I)
robocop.search('Robocop is part man, part machine, all cop.').group()

'Robocop'

In [56]:
# The same case-insensitive pattern also matches the all-uppercase spelling.
robocop.search('ROBOCOP protects the innocent.').group()

'ROBOCOP'

In [59]:
# sub(replacement, text) returns a copy of text with every match replaced;
# here 'Agent ' plus the word that follows becomes 'CENSORED'.

namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

__Problem.__ Censor the names of the secret agents by showing just the first letters of their names

In [65]:
# (\w) captures the first letter of the name; \w* consumes the remaining letters.
# In the replacement string, \1 inserts the text captured by group 1.
# Matching is case-sensitive, so the lowercase 'agent' in 'double agent' is left alone.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'