<font size = 8> Python Regular Expressions Part Three

In [1]:
import re

***

In [2]:
# Searching for multiple patterns using the OR operator

In [4]:
re.search(r'cat', 'The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [3]:
text = "This is my phone number in India 650-506-6062"

In [5]:
re.search(r'phone', text)

<re.Match object; span=(11, 16), match='phone'>

In [6]:
re.search(r'tele', text)

In [7]:
# note that the word tele did not return a result

***

## OR

In [9]:
re.search(r'phone | tele', text)

<re.Match object; span=(11, 17), match='phone '>

***

## Wildcard

In [16]:
text2 = "The cat in the hat sat there splat"

In [17]:
re.findall(r'at', text2)

['at', 'at', 'at', 'at']

In [12]:
# 3 mataches of at

In [13]:
# what if you wanted the whole word

In [14]:
# Use the wildcard operator (.)

In [18]:
re.findall(r'.at', text2)

['cat', 'hat', 'sat', 'lat']

In [19]:
# Add more wildcards as it is truncated to 2

In [20]:
re.findall(r'..at', text2)

[' cat', ' hat', ' sat', 'plat']

In [21]:
re.findall(r'....at', text2)

['he cat', 'he hat', ' splat']

In [22]:
# wildcard counts space as a character

***

## Starts with and Ends with

In [23]:
# STARTS WITH

re.findall(r'^\d', "1 is a number")

# ^\d indicates return a string starting iwth a number

['1']

In [24]:
# However this will not work for

re.findall(r'^\d', "This 2 is also a number")

[]

***

In [25]:
# ENDS WITH

In [26]:
re.findall(r'\d$', "here are dollars 2")

['2']

In [27]:
# NOTE THE $ symbol

In [28]:
# However, this will not work for

re.findall(r'\d$', "here are 2 dollars")

[]

***

## Excluding characters

In [29]:
# Use the ^ iwth brackets

In [32]:
phrase = 'there are 3 numbers 34 inside 5 this sentence'

# jusrt random numbers placed inside this sentence

# we need to exclude numbers

In [35]:
# First define the pattern (d for integers)

pattern1 = r'[^\d]'

In [36]:
re.findall(pattern1, phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [37]:
# this only return non numbers

In [39]:
# getting the words back together with the + identifier

![image.png](attachment:image.png)

In [40]:
# add the +

pattern2 = r'[^\d]+'

In [41]:
re.findall(pattern2, phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [42]:
# this is a useful way to get rid of punctuation from a sentence

***

In [44]:
phrase3 = 'This is a string! But this has punctuation. How can we remove it?'

In [47]:
# notice the new pattern

pattern3 = r'[^!.?]+'

re.findall(pattern3, phrase3)

# re.findall(r'[^!.?]+', phrase3)

['This is a string', ' But this has punctuation', ' How can we remove it']

In [48]:
# remove spaces

In [49]:
# adding space to the pattern for removal

pattern4 = r'[^!.? ]+'

re.findall(pattern4, phrase3)

['This',
 'is',
 'a',
 'string',
 'But',
 'this',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [50]:
# now we get a list of all the words

In [51]:
clean = re.findall(pattern4, phrase3)

In [52]:
clean

['This',
 'is',
 'a',
 'string',
 'But',
 'this',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

***

## Joining words back together

In [53]:
' '.join(clean)

'This is a string But this has punctuation How can we remove it'

***

## Grouping things with [ ]

In [55]:
text5 = 'Only find the hyphen-words in this sentence. \
But you do not know how long-ish they are'

In [56]:
# find words that have a hyphen in the middle of it

In [57]:
pattern5 = r'[\w]+'

# this indicates a group of alphanumeric
# this excludes teh hyphen itself

![image.png](attachment:image.png)

In [58]:
re.findall(pattern5, text5)

['Only',
 'find',
 'the',
 'hyphen',
 'words',
 'in',
 'this',
 'sentence',
 'But',
 'you',
 'do',
 'not',
 'know',
 'how',
 'long',
 'ish',
 'they',
 'are']

***

In [64]:
# adding two groups together separated by a hyphen

pattern6 = r'[\w]+-[\w]+'

# pattern6 = r'[\w]+ - [\w]+'

In [65]:
re.findall(pattern6,text5)

['hyphen-words', 'long-ish']

***

In [66]:
# You would get the same result if you remove the braces

In [67]:
pattern7 = r'\w+-\w+'

In [68]:
re.findall(pattern7,text5)

['hyphen-words', 'long-ish']

In [69]:
# braces are added primarily to improve readability

***

## Using paranthesis for multiple options

In [71]:
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [72]:
# all start with cat

In [73]:
re.search(r'cat(fish|nap|claw)', text)

<re.Match object; span=(27, 34), match='catfish'>

In [74]:
re.search(r'cat(fish|nap|claw)', texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [75]:
re.search(r'cat(fish|nap|claw)', textthree)

In [76]:
# did not find anything

In [77]:
re.search(r'cat(fish|nap|erpillar)', textthree)

<re.Match object; span=(26, 37), match='caterpillar'>