# Regular Expression

In [1]:
text = "The phone number of the agent is 408-555-1234. Call Soon!"

In [2]:
"phone" in text

True

In [3]:
"408-555-1234" in text

True

In [4]:
import re

In [5]:
pattern = "phone"

In [6]:
re.search(pattern,text)

<re.Match object; span=(4, 9), match='phone'>

In [7]:
my_match = re.search(pattern, text)

In [8]:
my_match

<re.Match object; span=(4, 9), match='phone'>

In [9]:
my_match.span()

(4, 9)

In [10]:
my_match.start()

4

In [11]:
my_match.end()

9

In [21]:
text = "my phone is a 777-555-1234 phone"

In [22]:
match = re.search(pattern, text)

In [23]:
match.span()

(3, 8)

In [24]:
all_matches = re.findall("phone", text)

In [25]:
all_matches

['phone', 'phone']

In [26]:
len(all_matches)

2

In [27]:
# re.finditer yields one Match object per occurrence, so the position of
# every "phone" in the text can be reported, not just the first one.
for match in re.finditer("phone", text):
    print(match.span())

(3, 8)
(27, 32)


## Identifiers for Characters in Patterns

Kinds of characters, such as a digit or a whitespace character, have special codes that represent them. You can use these to build up a pattern string. Notice how these make heavy use of the backslash \ . Because of this, when defining a pattern string for a regular expression we use the format:

    r'mypattern'
    
Placing the r in front of the string makes it a raw string, which lets Python understand that the \ characters in the pattern string are not meant to be escape slashes.

Below you can find a table of all the possible identifiers:

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric (including underscore)</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [28]:
text

'my phone is a 777-555-1234 phone'

In [32]:
# Equivalent, more compact form using quantifiers: r'\d{3}-\d{3}-\d{4}'
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'


In [33]:
phone_number = re.search(pattern,text)

In [34]:
phone_number

<re.Match object; span=(14, 26), match='777-555-1234'>

In [35]:
phone_number.group()

'777-555-1234'

## Quantifiers

Now that we know the special character designations, we can use them along with quantifiers to define how many we expect.

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>	Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [62]:
# Each pair of parentheses is a capture group, so the three digit runs can be
# retrieved individually with group(1), group(2) and group(3); group() with no
# argument still returns the whole match.
phone_pattern = r"(\d{3})-(\d{3})-(\d{4})"

In [63]:
phone_result = re.search(phone_pattern,text)

In [64]:
phone_result.group()

'777-555-1234'

In [67]:
phone_result.group(3)

'1234'

In [68]:
# The | operator is alternation: the pattern matches either "man" or "woman".
re.search(r"man|woman", "This man was here")

<re.Match object; span=(5, 8), match='man'>

In [69]:
# Wildcard character: a . in a pattern matches any single character (except a newline by default)

In [70]:
re.findall(r".at", "The cat in the hat aplat")

['cat', 'hat', 'lat']

In [71]:
# Each . consumes exactly one character, so two dots also capture the
# character before the letter that precedes "at" (a space or a letter here).
re.findall(r"..at", "The cat in the hat aplat")

[' cat', ' hat', 'plat']

- ^ -> starts with
- $ -> ends with

In [72]:
# $ anchors the pattern to the end of the string: find a digit that is the
# last character.
re.findall(r"\d$", "This ends with a number 2")

['2']

In [73]:
# ^ anchors the pattern to the start of the string: find a digit that is the
# first character.
re.findall(r"^\d", "1 is the loneliest number")

['1']

In [74]:
phrase = "There are 3 numbers 34 inside 5 this sentence"

In [76]:
# A ^ placed first inside square brackets negates the character class:
# [^\d] matches any single character that is NOT a digit.
re.findall(r'[^\d]',phrase)


['T',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [77]:
# To get runs of consecutive non-digit characters together instead of one
# character at a time, add + after the character class.
re.findall(r'[^\d]+',phrase)


['There are ', ' numbers ', ' inside ', ' this sentence']

### Remove punctuation from sentence

In [78]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [79]:
# Match runs of characters that are not '!', '.', '?' or a space, which
# leaves just the words with the punctuation stripped.
myList = re.findall(r"[^!.? ]+", test_phrase)

In [80]:
myList

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [81]:
' '.join(myList)

'This is a string But it has punctuation How can we remove it'

Brackets for grouping

In [83]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

In [84]:
# One or more word characters, a literal hyphen, then one or more word
# characters: this finds the hyphenated words whatever their length.
re.findall(r"[\w]+-[\w]+", text)

['hypen-words', 'long-ish']