## Pattern Matching

In [6]:
# Sample sentence that the substring and regex searches below are run against.
text = "The agent's phone number is 408-555-1234. Call soon!"

In [7]:
# Plain substring test with the `in` operator; no regex is involved.
'phone' in text

True

#### Single string match

In [3]:
import re

In [4]:
pattern = 'phone'

In [5]:
# re.search scans the string and returns a Match object for the first occurrence.
re.search(pattern,text)

<re.Match object; span=(3, 8), match='phone'>

In [6]:
pattern = 'NOT IN TEXT'

In [7]:
# The pattern does not occur in text, so re.search returns None and the cell shows no output.
re.search(pattern,text)

In [8]:
pattern = 'phone'

In [9]:
match = re.search(pattern,text)

In [10]:
match

<re.Match object; span=(3, 8), match='phone'>

In [11]:
# span() gives the (start, end) indices of the match; the end index is exclusive.
match.span()

(3, 8)

In [12]:
match.start()

3

In [13]:
match.end()

8

#### Multiple string matches

In [14]:
# re.search stops at the first occurrence, even when the pattern appears several times
text = 'my phone once, my phone twice'

In [15]:
match = re.search('phone',text)

In [16]:
match

<re.Match object; span=(3, 8), match='phone'>

In [17]:
# re.findall returns a list of every matched substring (use len() on it to count the matches)
matches = re.findall('phone',text)

In [18]:
matches

['phone', 'phone']

In [19]:
# re.finditer yields one Match object per occurrence, giving each match's position as well as its text
for match in re.finditer('phone',text):
    print(match)
    print(match.group())     # the matched text itself

<re.Match object; span=(3, 8), match='phone'>
phone
<re.Match object; span=(18, 23), match='phone'>
phone


#### Searching patterns using Identifiers & Quantifiers

In [20]:
text = 'My phone number is 408-555-1234'

In [21]:
# \d matches a single digit, so the phone number is spelled out digit by digit
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)

In [22]:
phone 

<re.Match object; span=(19, 31), match='408-555-1234'>

In [23]:
phone.group()

'408-555-1234'

In [25]:
# Using quantifiers: \d{3} matches exactly three digits in a row

In [26]:
phone =  re.search(r'\d{3}-\d{3}-\d{4}',text)

In [27]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [33]:
# Parentheses split the pattern into capture groups, so each part of the number can be
# retrieved on its own with group(n); re.compile turns the pattern into a reusable pattern object.
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [30]:
# A compiled pattern has its own search() method, so call it directly instead of
# passing the compiled object back through the module-level re.search.
results = phone_pattern.search(text)

In [31]:
results.group()

'408-555-1234'

In [32]:
results.group(1)

'408'

In [35]:
results.group(3)

'1234'

### Additional Regex syntax

In [37]:
# The | operator matches either alternative (here 'cat' or 'dog'); more alternatives can be chained
re.search(r'cat|dog','The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [38]:
# . is the wildcard: each dot matches any single character (except a newline by default),
# so '...at' grabs the three characters that precede 'at'
re.findall(r'...at','The cat in the hat went splat.')

['e cat', 'e hat', 'splat']

In [48]:
# ^ anchors the pattern to the start of the string: this matches only if the string begins with a digit
re.findall(r'^\d','1 is a number')

['1']

In [50]:
# $ anchors the pattern to the end of the string: this matches only if the string ends with a digit
re.findall(r'\d$','1 is a number and 2')

['2']

In [62]:
# Exclusion syntax: [^...] matches any character that is NOT listed inside the brackets

In [51]:
phrase = 'there are 3 numbers 3 inside 5 this sentence'

In [52]:
# [^\d] matches any single character that is not a digit
pattern = r'[^\d]'

In [53]:
re.findall(pattern,phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [55]:
# Adding + matches whole runs of non-digit characters instead of one character at a time
pattern = r'[^\d]+'

In [56]:
re.findall(pattern,phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [58]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [60]:
# Collect runs of characters that are not '!', '.', '?' or a space, i.e. the words without punctuation
clean = re.findall(r'[^!.? ]+',test_phrase)

In [61]:
' '.join(clean)

'This is a string But it has punctuation How can we remove it'

In [67]:
# Inclusion syntax: [...] matches any one of the characters listed inside the brackets

In [64]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are.'

In [65]:
# One or more word characters, a hyphen, then one or more word characters.
# \w is already a character class, so wrapping it in [] is redundant.
pattern = r'\w+-\w+'

In [66]:
re.findall(pattern,text)

['hypen-words', 'long-ish']

In [71]:
text = 'Hello, would you like some catfish?'
text2 = 'Hello, would you like to take  a catnap?'
text3 = 'Hello, have you seen this caterpillar?'

In [74]:
# Parentheses group the alternatives, so the pattern is 'cat' followed by 'fish', 'nap' or 'erpillar'
re.search(r'cat(fish|nap|erpillar)',text2)

<re.Match object; span=(33, 39), match='catnap'>