### Advanced Regex

In [5]:
import re

#### Repetition: searching for basic patterns

In [62]:
# Sample sentence used as the search target in the basic examples.
text = "That dog is very fluffy."

In [7]:
pattern = "f"
# findall() gathers every non-overlapping match of the pattern into a list.
re.compile(pattern).findall(text)


['f', 'f', 'f']

In [8]:
# search() scans the text and returns a Match object for the first occurrence
# only (or None when there is no occurrence at all).
re.compile(pattern).search(text)


<re.Match object; span=(17, 18), match='f'>

In [22]:
# match() only succeeds when the pattern occurs at the very beginning of the text.
# When the text does not start with the pattern, the call returns None and the
# cell displays no output.
re.compile(pattern).match(text)


In [20]:
# An uppercase character class combined with match() tells us whether the
# text begins with a capital letter.
re.compile("[A-Z]").match(text)

<re.Match object; span=(0, 1), match='T'>

In [26]:
# A multi-character pattern must appear in the text as that exact sequence.
pattern = "uf"
re.compile(pattern).findall(text)

['uf']

In [25]:
# Square brackets define a set: any single character listed in it is a match.
pattern = "[uf]"
re.compile(pattern).findall(text)

['f', 'u', 'f', 'f']

### [ ] Match set of characters

In [39]:
# Exactly one vowel is allowed between "gr" and "y", so "gr2y" and "graey"
# are not matched.
text2 = "Is it spelled gray, gr2y, grey, graey or gruy?"
pattern2 = "gr[aeiou]y"

re.compile(pattern2).findall(text2)

['gray', 'grey', 'gruy']

In [43]:
# A range inside a set ([A-C]) matches any single character from A through C.
text3 = "This is an A and B conversation, so C your way out of it you D!"

re.compile("[A-C]").findall(text3)

['A', 'B', 'C']

### . Match any character except newline (\n)

In [58]:
# A bare dot matches any single character except a newline, so the two "\n"
# characters in the text never show up in the result.
pattern = "."
text4 = "I'm so excited, \nAnd I can't get started! \nI got 370 bills I gotta pay"
print(text4)
print(re.compile(pattern).findall(text4))

I'm so excited, 
And I can't get started! 
I got 370 bills I gotta pay
['I', "'", 'm', ' ', 's', 'o', ' ', 'e', 'x', 'c', 'i', 't', 'e', 'd', ',', ' ', 'A', 'n', 'd', ' ', 'I', ' ', 'c', 'a', 'n', "'", 't', ' ', 'g', 'e', 't', ' ', 's', 't', 'a', 'r', 't', 'e', 'd', '!', ' ', 'I', ' ', 'g', 'o', 't', ' ', '3', '7', '0', ' ', 'b', 'i', 'l', 'l', 's', ' ', 'I', ' ', 'g', 'o', 't', 't', 'a', ' ', 'p', 'a', 'y']


### ^ Match any character NOT listed when used inside a set ([^...]), OR match the beginning of the line when used outside a set

In [60]:
# A leading ^ inside a set negates it: match every character that is NOT in d-z.
pattern = "[^d-z]"
print(re.compile(pattern).findall(text4))

['I', "'", ' ', ' ', 'c', ',', ' ', '\n', 'A', ' ', 'I', ' ', 'c', 'a', "'", ' ', ' ', 'a', '!', ' ', '\n', 'I', ' ', ' ', '3', '7', '0', ' ', 'b', ' ', 'I', ' ', 'a', ' ', 'a']


In [61]:
# Negated set: keep whatever is not a lowercase letter, a digit, "!" or a space.
pattern = "[^a-z0-9! ]"
print(re.compile(pattern).findall(text4))

['I', "'", ',', '\n', 'A', 'I', "'", '\n', 'I', 'I']


### $ Match end of line (by default only the end of the whole string; use re.MULTILINE to match at the end of every line)

In [77]:
# $ pins the match to the end of the text, so only the final "That dog" is found.
text6 = "That dog is very fluffy. That dog"
pattern = "That dog$"

print(re.compile(pattern).findall(text6))

['That dog']


In [76]:
# ^ (outside a set) pins the match to the start of the text; "fluffy." is not
# at the start, so the result is empty.
text7 = "That dog is very fluffy.\n That dog"
pattern = "^fluffy."

print(re.compile(pattern).findall(text7))

[]


In [75]:
# Without the re.MULTILINE flag, $ only matches at the end of the whole text,
# not before the newline in the middle, so the result is empty.
text8 = "That dog is very fluffy.\nThat dog"
pattern = "fluffy.$"

print(re.compile(pattern).findall(text8))

[]


In [78]:
# | is alternation: match either "fluffy" or "dog"; results come back in the
# order they appear in the text.
text9 = "That dog is very fluffy.\nThat dog"
pattern = "fluffy|dog"

print(re.compile(pattern).findall(text9))

['dog', 'fluffy', 'dog']


### * Matches previous character 0 or more times

In [85]:
# * allows zero or more repeats of the preceding character, so "d*o*g" also
# matches a lone "g" (zero "d"s and zero "o"s).
dogtext = "dog doog doooooooog ddddddogggg dddddooooogggg dddoogg deeeaggg diiig dooogggg dug"
pattern = "d*o*g"
print(re.compile(pattern).findall(dogtext))


['dog', 'doog', 'doooooooog', 'ddddddog', 'g', 'g', 'g', 'dddddooooog', 'g', 'g', 'g', 'dddoog', 'g', 'g', 'g', 'g', 'g', 'dooog', 'g', 'g', 'g', 'g']


### + Matches previous character 1 or more times

In [90]:
# + demands at least one repeat, so at least one "d" and one "o" must precede "g".
pattern = "d+o+g"
print(re.compile(pattern).findall(dogtext))

['dog', 'doog', 'doooooooog', 'ddddddog', 'dddddooooog', 'dddoog', 'dooog']


### ? Matches previous character 0 or 1 times (optional)

In [92]:
# ? makes the preceding "d" optional: "og" matches with or without a leading "d".
pattern = "d?og"
print(re.compile(pattern).findall(dogtext))

['dog', 'og', 'og', 'dog', 'og', 'og', 'og']


### {num1,num2} Matches previous character between num1 and num2 times (inclusive)

In [107]:
# {2,3} requires the preceding "d" to repeat two or three times before "og".
pattern = "d{2,3}og"
print(re.compile(pattern).findall(dogtext))

['dddog']


### Challenge: Extract all the words from the following sentence:

In [104]:
cattext = 'Every cat has seven lives.'
# Use a raw string so the backslash reaches the regex engine untouched: in a
# plain string literal '\w' is an invalid escape sequence that Python warns about.
# \w matches one word character (a letter, a digit, or an underscore); the
# surrounding [ ] was redundant because \w is already a character class.
pattern = r'\w+'
re.findall(pattern, cattext)

['Every', 'cat', 'has', 'seven', 'lives']

### Now extract only words that have at least 4 letters

In [109]:
# Raw string avoids the invalid '\w' string escape. {4,} keeps only runs of
# four or more word characters (letters, digits, or underscore).
pattern = r'\w{4,}'
re.findall(pattern, cattext)

['Every', 'seven', 'lives']

### Extract all the phone numbers from the following text

In [112]:
# Sample directory text: one "<carrier name> <phone number>" entry per line,
# with a leading and a trailing newline.
phonebook = "\n".join([
    "",
    "Aeromexico 806-011-1102",
    "Air Canada 888-247-2262",
    "Air Canada Rouge 888-247-2262",
    "Air Creebec 800-567-6567",
    "Air Inuit 800-361-2965",
    "Air North 800-061-0407",
    "",
])


In [117]:
# Raw string avoids the invalid '\d' string escape. \d is already a character
# class, so the surrounding [ ] is unnecessary.
# Matches three hyphen-separated groups of digits.
pattern = r'\d+-\d+-\d+'
re.findall(pattern, phonebook)

['806-011-1102',
 '888-247-2262',
 '888-247-2262',
 '800-567-6567',
 '800-361-2965',
 '800-061-0407']