## Regular Expressions

In [6]:
import re

In [17]:
# Sample sentence searched throughout the notebook; it contains the word "phone" twice.
text = "my phone number is 444-90-1234. phone number is 9 digits long"

In [4]:
# Plain substring membership test -- no regular expression involved.
'phone' in text

True

In [5]:
# A pattern with no special characters matches that exact text.
pattern = 'phone'

In [12]:
# re.search scans the string and returns a Match object for the first
# occurrence of the pattern (or None when nothing matches).
match = re.search(pattern,text)

In [13]:
# (start, end) index pair of the match; the end index is exclusive.
match.span()

(3, 8)

In [14]:
# Slicing the string with the span bounds recovers the matched text.
text[3:8]

'phone'

In [15]:
# Index where the match begins (first element of span()).
match.start()

3

In [16]:
# Index just past the last matched character (second element of span()).
match.end()

8

In [20]:
# re.findall returns every non-overlapping match as a list of strings.
matches = re.findall(pattern,text)

In [21]:
# Display the list of matched strings.
matches

['phone', 'phone']

In [22]:
# Number of occurrences of the pattern in the text.
len(matches)

2

In [23]:
# re.finditer returns an iterator yielding one Match object per occurrence,
# so the position of every hit is available (findall only gives the strings).
# Note: the loop variable rebinds the notebook-level name `match`; once the
# loop finishes it refers to the last occurrence.
for match in re.finditer("phone",text):
    print(match.span())

(3, 8)
(32, 37)


In [24]:
# `match` is the last Match object bound by the finditer loop;
# group() with no argument returns the matched text.
match.group()

'phone'

## Patterns

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [25]:
# Same sentence as `text`. NOTE(review): the searches that follow in this
# notebook are run against `text`, not `text2` -- confirm whether this copy is needed.
text2 = "my phone number is 444-90-1234. phone number is 9 digits long"

In [26]:
# Not recommended: spelling out one \d per digit works, but it is verbose and
# easy to miscount. The Quantifiers section shows the compact {n} form.
phone = re.search(r'\d\d\d-\d\d-\d\d\d\d',text)

In [28]:
# group() with no argument returns the entire matched substring.
phone.group()

'444-90-1234'

## Quantifiers

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [29]:
# Recommended: {n} quantifiers state the digit counts explicitly
# (3 digits, hyphen, 2 digits, hyphen, 4 digits).
phone = re.search(r'\d{3}-\d{2}-\d{4}',text)

In [30]:
# The Match repr shows both the span and the matched text.
phone

<re.Match object; span=(19, 30), match='444-90-1234'>

## Groups

In [31]:
# Parentheses create capture groups: group 1 is the three digits, group 2 the
# two digits, group 3 the four digits. Each is retrievable with phone.group(n).
phone = re.search(r'(\d{3})-(\d{2})-(\d{4})',text)

In [32]:
# Adding groups does not change the overall match: same span and text as before.
phone

<re.Match object; span=(19, 30), match='444-90-1234'>

In [33]:
# group(1) returns the text captured by the first parenthesised sub-pattern,
# i.e. the leading three digits of the number.
first_group = phone.group(1)

# Validate the captured prefix; nothing is printed when it is the expected value.
if first_group != '444':
    print("Please enter a valid country code")

# End the cell with the captured value so it is displayed as the cell's output
# (an `if` statement on its own produces no Out[] value).
first_group

'444'

## OR Operator

In [34]:
# "|" is alternation: the pattern matches either "man" or "woman".
re.search(r"man|woman","This is a man.")

<re.Match object; span=(10, 13), match='man'>

In [35]:
# The leftmost match wins: "woman" starts at index 10, ahead of the "man"
# contained inside it at index 12, so the whole word is returned.
re.search(r"man|woman","This is a woman.")

<re.Match object; span=(10, 15), match='woman'>

## Starts With and Ends With

In [36]:
# "^" anchors the pattern to the start of the string, so a digit is found
# only when the string begins with one.
re.findall(r'^\d',"1 is the smallest number")

['1']

In [39]:
# "$" anchors the pattern to the end of the string, so a digit is found
# only when the string finishes with one.
re.findall(r'\d$',"Three is 3")

['3']

## Exclusion

In [40]:
# Sample that both starts and ends with a digit, used for the exclusion example.
text3 = "1 is the smallest number. Three is 3"

In [42]:
# Inside square brackets a leading "^" negates the set: [^\d]+ matches runs of
# one or more non-digit characters.
re.findall(r'[^\d]+',text3)

[' is the smallest number. Three is ']

In [43]:
# Sentence containing "!", "." and "?" that the next cells strip out.
punctuation = "This is a string! But it has puntuation. How can we remove it?"

In [44]:
# Runs of characters that are none of "!", "." or "?" -- in effect the string
# split at its punctuation marks.
re.findall(r'[^!.?]+',punctuation)

['This is a string', ' But it has puntuation', ' How can we remove it']

In [45]:
# Joining the pieces back together gives the original text minus the
# excluded punctuation characters.
clean = ''.join(re.findall(r'[^!.?]+',punctuation))

In [46]:
# Display the punctuation-free string.
clean

'This is a string But it has puntuation How can we remove it'

In [49]:
# Sample containing two hyphen-joined tokens: one of letters, one of digits.
text4 = "this is a hypen-text, 1234-000"

In [52]:
# One or more word characters, a hyphen, then one or more word characters.
# The brackets around a lone \w are optional: [\w]+ is equivalent to \w+.
re.findall(r'[\w]+-[\w]+',text4)

['hypen-text', '1234-000']