# Regular Expressions


**Match A Symbol**


In [1]:
# Load regex package
import re

In [2]:
# Create a variable containing a text string
text = "I have $5 and i will invest $3"

In [3]:
## Find all instances of the exact match '$'
re.findall(r'\$',text)

['$', '$']

**Match A Unicode Character**

In [4]:
# Load regex package
import re

In [5]:
# Create a variable containing a text string
text = 'Microsoft™.'


In [6]:
# Find the Unicode trademark character (U+2122)
re.findall(r'\u2122',text)

['™']

**Match A Word**

In [7]:
# Load regex package
import re

In [8]:
# Create a variable containing a text string
text = 'The quick brown fox jumped over the lazy is on us brown bear.'

In [9]:
# Find any word of exactly three letters.
# \w is used instead of '.', because '.' also matches spaces and would
# return fragments such as ' is' (a space plus a two-letter word).
re.findall(r'\b\w{3}\b', text)

['The', 'fox', 'the']

In [10]:
# Find any word of exactly two letters.
# \w is used instead of '.', so a space or punctuation mark next to a
# one-letter word can never be counted as part of a match.
re.findall(r'\b\w{2}\b', text)

['is', 'on', 'us']

**Match Any Character**

In [11]:
# Load regex package
import re

In [12]:
# Create a variable containing a text string
text = 'The quick brown fox jumped over the lazy brown bear.'

In [13]:
# Find anything with a 'T' and then the next two characters
re.findall(r'T..',text)

['The']

In [14]:
# Find anything with an 'f' and then the next two characters
re.findall(r'f..',text)

['fox']

In [15]:
# Find anything with a 'b' and then the next four characters
# ('.' matches punctuation too, which is why 'bear.' includes the period)
re.findall(r'b....',text)

['brown', 'brown', 'bear.']

**Match Any Of A List Of Characters**


In [16]:
# Load regex package
import re

In [17]:
# Create a variable containing a text string
text = 'The quick brown fox jumped over the lazy brown bear.'

In [18]:
# Find all instances of any vowel
re.findall(r'[aeiou]',text)

['e', 'u', 'i', 'o', 'o', 'u', 'e', 'o', 'e', 'e', 'a', 'o', 'e', 'a']

**Match Any Of A Series Of Options**


In [19]:
# Load regex package
import re

In [20]:
# Create a variable containing a text string
text = 'The quick brown fox jumped over the lazy brown bear.'

In [21]:
# Find any of fox, snake, or bear
re.findall(r'fox|snake|bear',text)

['fox', 'bear']

**Match Any Of A Series Of Words**


In [22]:
# Load regex package
import re

In [23]:
# Create a variable containing a text string
text = 'The quick brown fox jumped over the lazy brown bear.'

In [24]:
# Find fox, snake, or bear only where they appear as whole words
# (\b is a word boundary; the capturing group makes findall return just the word)
re.findall(r'\b(fox|snake|bear)\b',text)

['fox', 'bear']

**Match Dates**


In [25]:
# Load regex package
import re

In [26]:
# Create a variable containing a text string
text = 'My birthday is 09/15/1983. My brother\'s birthday is 01/01/01. My other two brothers have birthdays of 9/3/2001 and 09/1/83.'


In [27]:
# Find dates written as three '/'-separated numeric fields: the first two are
# one or two digits each (a leading digit, if present, must be 0-3) and the
# last is a two- or four-digit year.
# The pattern checks shape only, not calendar validity (e.g. 39/39/99 would match).
re.findall(r'\b[0-3]?[0-9]/[0-3]?[0-9]/(?:[0-9]{2})?[0-9]{2}\b', text)

['09/15/1983', '01/01/01', '9/3/2001', '09/1/83']

**Match Email Addresses**


In [28]:
# Load regex package
import re

In [29]:
# Create a variable containing a text string
text =  'My email is ganesh@hotmail.com, thanks! No, I am at bob@data.ninja.'

In [30]:
# Find all email addresses
re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9]+', text)

# Explanation:
# This regex has three parts
# [a-zA-Z0-9_.+-]+  Matches the username: one or more letters, digits, '_', '.', '+' or '-'
# @[a-zA-Z0-9-]+    Matches '@' and the domain name: one or more letters, digits or '-'
# \.[a-zA-Z0-9]+    Matches a dot and the TLD: one or more letters or digits
#                   (no '.' allowed here, so a sentence-ending period is not captured)

['ganesh@hotmail.com', 'bob@data.ninja']

**Match Exact Text**


In [31]:
# Load regex package
import re

In [32]:
# Create a variable containing a text string
text = 'The quick brown fox jumped over the lazy brown bear.'

In [33]:
# Find all instances of the exact match 'The'
re.findall(r'The', text)

['The']

**Match Integers Of Any Length**



In [34]:
# Load regex package
import re

In [35]:
# Create a variable containing a text string
text = '21 scouts and 3 tanks fought against 4,003 protestors.'

In [36]:
# Find any number: an integer of any length (with optional thousands
# separators) or a decimal.
# The explanation lives inside the pattern itself via re.VERBOSE, so the
# findall call stays the last expression of the cell and its matches are
# what gets displayed.
re.findall(
    r'''
    [1-9](?:\d{0,2})   # A sequence of 1-3 numerals not starting with 0
    (?:,\d{3})*        # Any number of three-digit groups, each preceded by a comma
    (?:\.\d*[1-9])?    # Optionally, a decimal point followed by any number of digits not ending in 0
    |                  # OR...
    0?\.\d*[1-9]       # Only the decimal portion, optionally preceded by a 0
    |                  # OR...
    0                  # Zero.
    ''',
    text,
    flags=re.VERBOSE,
)

['21', '3', '4,003']

**Match Text Between HTML Tags**


In [37]:
# Load regex package
import re

In [38]:
# Create a variable containing a text string
text = '<p>The quick brown fox.</p><p>The lazy brown bear.</p>'


In [39]:
# Find any text between '<p>' and '</p>'
re.findall(r'<p>(.*?)</p>',text)

['The quick brown fox.', 'The lazy brown bear.']

**Match Times**


In [40]:
# Load regex package
import re

In [41]:
# Create a variable containing a text string
text = 'Ganesh: 12:34am. Kasturi: 16:30'

In [42]:
# Find HH:MM times from 00:00 to 23:59 and capture just the digits.
# An optional AM/PM suffix in any letter case is consumed but not captured.
# The hour is (?:[01]\d|2[0-3]) so that 20:00-23:59 match as well; a bare
# [0-1]\d stops at 19. The leading \b keeps the pattern from starting in the
# middle of a longer run of digits.
re.findall(r'\b((?:[01]\d|2[0-3]):[0-5]\d)\s*(?:[AaPp][Mm])?', text)

['12:34', '16:30']

**Match URLs**


In [43]:
# Load regex package
import re

In [44]:
# Create a variable containing a text string
text = 'My blog is http://www.ganeshkasturi.com and not http://ganeshkasturi.com'

In [45]:
# Find any http, ftp, or https URL; each match is a tuple of
# (scheme, host, trailing path/query part - empty string when absent)
# NOTE(review): the '&amp;' inside the two character classes looks like a leftover
# HTML entity for '&'; as written it also admits ';' - confirm before changing.
re.findall(r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?', text)


[('http', 'www.ganeshkasturi.com', ''), ('http', 'ganeshkasturi.com', '')]

**Match US Phone Numbers**


In [46]:
# Load regex package
import re

In [47]:
# Create a variable containing a text string
text = 'My phone number is 415-333-3922. His phone number is 4239389283'

In [48]:
# Find US phone numbers; each match is a tuple of the three captured digit
# groups (3, 3 and 4 digits). The first group may be wrapped in parentheses,
# and the groups may be separated by '-', '.', '●', or nothing at all.
re.findall(r'\(?([2-9][0-8][0-9])\)?[-.●]?([2-9][0-9]{2})[-.●]?([0-9]{4})', text)

[('415', '333', '3922'), ('423', '938', '9283')]

**Match US and UK Spellings**


In [49]:
# Load regex package
import re

In [50]:
# Create a variable containing a text string
# (double quotes avoid escaping the apostrophe; '\s' is not a valid string
# escape and would leave a literal backslash in the text)
text = "It's center and not centre."

In [51]:
# Find either spelling of the word: US 'center' or UK 'centre'
re.findall(r'\bcent(?:er|re)\b', text)

['center', 'centre']

**Match Words With A Certain Ending**


In [52]:
# Load regex package
import re

In [53]:
# Create a variable containing a text string
text = 'Capitalism, Communism, Neorealism, Liberalism'

In [54]:
# Find any word ending in 'ism'
re.findall(r'\b\w*ism\b', text)

# Specifics:
# \b     - start of the word
# \w*    - any number of word characters (the part of the word before 'ism')
# ism\b  - 'ism' at the end of the word

['Capitalism', 'Communism', 'Neorealism', 'Liberalism']

**Match ZIP Codes**


In [55]:
# Load regex package
import re

In [56]:
# Create a variable containing a text string
text = '3829 South Ave Street, Pheonix, AZ 34923'

In [57]:
# Find any US ZIP code: five digits, optionally followed by a ZIP+4
# extension ('-' and four more digits).
# The \b anchors stop the pattern from matching five digits taken from the
# middle of a longer run of digits.
re.findall(r'\b[0-9]{5}(?:-[0-9]{4})?\b', text)

['34923']