## Regular Expressions Tutorial by Corey Schafer
- https://www.youtube.com/watch?v=K8L6KVGG-7o&t=638s

In [1]:
import re

In [73]:
# Sample text that the patterns in this notebook are run against.
# Declared as a raw string (r''') so the lone backslash in the
# metacharacter list is a literal backslash instead of an invalid
# '\ ' escape sequence; the resulting string value is unchanged.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Ha HaHa

Metacharacters to be escaped:
. ^ $ * + { } [ ] \ | ( )

coreyms.com

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

under_score

cat
mat
pat
bat
'''



### Demonstrate raw string 'r'

In [10]:
# The r prefix marks a raw string: backslashes are kept literally
# instead of starting an escape sequence.

print('\tTab')   # '\t' is interpreted as a tab character
print(r'\tTab')  # raw string: the backslash and the 't' are printed as-is

	Tab
\tTab


### Literal Search with re.compile method

In [12]:
## Compile the pattern once with re.compile so the compiled object
##   can be reused for multiple searches.

pattern = re.compile(r'abc')  # literal search: 'a', 'b', 'c' in exactly that order

matches = pattern.finditer(text_to_search)

## Each hit is printed as one re.Match object per line.
for found in matches:
    print(found)

## The span of a match object is the (start, end) index pair of the match
##   within the searched string; the single 'abc' hit here has span (1, 4).
##   Those indexes can be reused later, e.g. to slice the original text:

print(text_to_search[1:4])

<re.Match object; span=(1, 4), match='abc'>
abc


In [14]:
## A literal pattern such as 'abc' must appear in the text in that order,
##    so searching for 'cba' finds nothing and the loop prints nothing.

pattern2 = re.compile(r'cba')
matches2 = pattern2.finditer(text_to_search)

for found in matches2:
    print(found)

In [20]:
## The period is a special character that acts as a wildcard.

pattern = re.compile(r'.')  # any single character except a newline

matches = pattern.finditer(text_to_search)

## Printing is left disabled; uncomment to list every match.
#for match in matches:
#    print(match)

In [16]:
## To search for an actual period character,
##   the period has to be escaped with a backslash.

pattern = re.compile(r'\.')  # a literal '.', not the wildcard

matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

## The output lists every period that appears in text_to_search.

<re.Match object; span=(106, 107), match='.'>
<re.Match object; span=(140, 141), match='.'>
<re.Match object; span=(162, 163), match='.'>
<re.Match object; span=(166, 167), match='.'>
<re.Match object; span=(175, 176), match='.'>
<re.Match object; span=(206, 207), match='.'>
<re.Match object; span=(219, 220), match='.'>


In [17]:
## Example: escaping the period inside a longer literal such as a URL

pattern = re.compile(r'coreyms\.com')  # literal 'coreyms.com'; '\.' matches only a real period

matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

<re.Match object; span=(133, 144), match='coreyms.com'>


### Pattern Searching

In [54]:
## Quick reference for the special sequences:
## '\d'  # a digit (0-9)
## '\D'  # anything that is NOT a digit
## '\w'  # a word character (a-z, A-Z, 0-9, _): any letter, digit or underscore
## '\W'  # anything that is NOT a word character
## '\s'  # whitespace (space, tab, newline)
## '\S'  # anything that is NOT whitespace
## '\b'  # a word boundary (the position between a word character and a non-word character)
## '\B'  # a position that is NOT a word boundary

pattern = re.compile(r'\W')  # anything that is not a word character

matches = pattern.finditer(text_to_search)

## Printing is left disabled; uncomment to list every match.
#for match in matches:
#    print(match)

### Word Boundary example

In [40]:
## '\bHa' : 'Ha' that starts at a word boundary
pattern = re.compile(r'\bHa')
matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

##  Sample line:  Ha HaHa
##  The first Ha matches because it sits at the start of a line, and
##    the second Ha matches because a space separates it from the first.
##    The third Ha is skipped: it directly follows another word
##    character, so there is no word boundary in front of it.

##  With pattern = re.compile(r'\bHa\b') only the first Ha would match,
##   because that pattern requires a word boundary on both sides.

<re.Match object; span=(67, 69), match='Ha'>
<re.Match object; span=(72, 74), match='Ha'>


In [39]:
## '\BHa' : 'Ha' that does NOT start at a word boundary
pattern = re.compile(r'\BHa')
matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

##  Sample line:  Ha HaHa
##  Only the third Ha matches: it is the one Ha that is not preceded
##    by a word boundary.

<re.Match object; span=(72, 74), match='Ha'>


### Start of a String ^

In [43]:
sentence = 'Start a sentence and then bring it to an end'

# ^ anchors the pattern: 'Start' only matches at the very beginning of the string
pattern = re.compile(r'^Start')
matches = pattern.finditer(sentence)

for found in matches:
    print(found)

## By contrast, the pattern below would produce nothing, because the
## string does not begin with 'a':
#pattern = re.compile(r'^a')  # 'a' has to be at beginning of string 

<re.Match object; span=(0, 5), match='Start'>


### End of the String $

In [45]:
## Note the order: the $ is written AFTER the text of interest.

sentence = 'Start a sentence and then bring it to an end'

# $ anchors the pattern: 'end' only matches at the very end of the string
pattern = re.compile(r'end$')
matches = pattern.finditer(sentence)

for found in matches:
    print(found)

## By contrast, the pattern below would produce nothing, because the
## string does not end with 'a':
#pattern = re.compile(r'a$')  # 'a' has to be at end of string

<re.Match object; span=(41, 44), match='end'>


### Match Number Pattern

In [57]:
## Phone numbers in text_to_search

# \d\d\d        -> a run of three digits
# .             -> any single character as separator (catches -, . and *)
# \d\d\d\d      -> the final run of four digits
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')  
matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

<re.Match object; span=(146, 158), match='321-555-4321'>
<re.Match object; span=(159, 171), match='123.555.1234'>
<re.Match object; span=(172, 184), match='123*555*1234'>


In [53]:
## Example: apply the current `pattern` to an external text file of names.

# Raw string so the backslashes in the Windows path are taken literally;
# in a normal string '\O', '\M' and '\D' are invalid escape sequences.
# NOTE(review): this is a machine-specific absolute path -- point it at
# wherever NamePhoneAddressEmail.txt lives on your system.
path = r'D:\OneDrive - QJA\My Files\DataScience\DataSets'
filename = 'NamePhoneAddressEmail.txt'

with open(path + '\\' + filename, 'r') as f:
    contents = f.read()

    # `pattern` is whichever pattern was compiled most recently in an
    # earlier cell.
    matches = pattern.finditer(contents)
    # Printing is left disabled; uncomment to list every match.
    #for match in matches:
    #    print(match)

In [58]:
## Square brackets [] define a character set: any ONE of the listed characters.
## (Phone numbers in text_to_search.)

# \d\d\d  -> a run of three digits
# [-.]    -> the separator must be either a '-' or a '.'
#            (inside [] these characters do not need to be escaped)
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')
matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

## '123*555*1234' is not picked up, since * is not in the character set.

<re.Match object; span=(146, 158), match='321-555-4321'>
<re.Match object; span=(159, 171), match='123.555.1234'>


### Match pattern of specific digits

In [61]:
## Only pick up numbers that start with 800 or 900.

# [89]00 -> an 8 or a 9 followed by '00'; inside [] nothing needs escaping here
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')
matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

<re.Match object; span=(185, 197), match='800-555-1234'>
<re.Match object; span=(198, 210), match='900-555-1234'>


In [63]:
## Same 800/900 pattern, this time run against the external txt file.

# [89]00 -> an 8 or a 9 followed by '00'; inside [] nothing needs escaping here
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')

# `path` and `filename` are the values set in the earlier file-reading cell.
with open(path + '\\' + filename, 'r') as f:
    contents = f.read()

    matches = pattern.finditer(contents)
    for found in matches:
        print(found)

<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(1093, 1105), match='900-555-3205'>
<re.Match object; span=(1443, 1455), match='800-555-6089'>
<re.Match object; span=(1794, 1806), match='800-555-7100'>
<re.Match object; span=(2055, 2067), match='900-555-5118'>
<re.Match object; span=(2830, 2842), match='900-555-5428'>
<re.Match object; span=(3290, 3302), match='800-555-8810'>
<re.Match object; span=(3977, 3989), match='900-555-9598'>
<re.Match object; span=(4951, 4963), match='800-555-2420'>
<re.Match object; span=(5572, 5584), match='900-555-3567'>
<re.Match object; span=(6195, 6207), match='800-555-3216'>
<re.Match object; span=(6897, 6909), match='900-555-7755'>
<re.Match object; span=(7872, 7884), match='800-555-1372'>
<re.Match object; span=(8751, 8763), match='900-555-6426'>


### Match Characters in a Range

In [None]:
## A dash inside [] defines a range: here any single digit from 1 to 5.

pattern = re.compile(r'[1-5]')

matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

In [66]:
## Ranges work for letters too.

pattern = re.compile(r'[a-z]') # any single lower-case letter
#pattern = re.compile(r'[a-zA-Z]') # upper and lower
matches = pattern.finditer(text_to_search)

## Printing is left disabled; uncomment to list every match.
#for match in matches:
#    print(match)

### Negate

In [69]:
## Outside a character set ^ means "at the beginning"; as the first
##   character inside a set it negates the set (match everything but ...).
pattern = re.compile(r'[^a-zA-Z]') # any character that is NOT an upper- or lower-case letter
matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

<re.Match object; span=(0, 1), match='\n'>


In [74]:
## cat, mat, pat, bat (from text_to_search)
## Match 'at' preceded by any single character except 'b'.

pattern = re.compile(r'[^b]at') # one character that is not 'b', then 'at'
matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

<re.Match object; span=(276, 279), match='cat'>
<re.Match object; span=(280, 283), match='mat'>
<re.Match object; span=(284, 287), match='pat'>


### Quantifier to Match Multiple Characters at 1 Time

In [79]:
## Quantifier {n}: use it when the exact number of repeats is known.

## It replaces the long-hand phone pattern used in the example above:
##   pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')

## Reads as: three digits, any one separator character, three digits,
##   any one separator character, then four digits.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

<re.Match object; span=(146, 158), match='321-555-4321'>
<re.Match object; span=(159, 171), match='123.555.1234'>
<re.Match object; span=(172, 184), match='123*555*1234'>
<re.Match object; span=(185, 197), match='800-555-1234'>
<re.Match object; span=(198, 210), match='900-555-1234'>


In [86]:
##  Quantifiers for when the exact number of characters is NOT known.
## Names in text_to_search:
## Mr. Schafer
## Mr Smith
## Ms Davis
## Mrs. Robinson
## Mr. T

## The pattern is built up step by step; each assignment replaces the
## previous one, so only the last pattern is actually searched with.

# Step 1: 'Mr' followed by a literal period (the \ escapes the period)
pattern = re.compile(r'Mr\.')

# Step 2: ? makes the period optional (zero or one period);
# the \ keeps the period from acting as the match-anything wildcard
pattern = re.compile(r'Mr\.?') 

# Step 3: \s is whitespace, [A-Z] is one upper-case letter, so this
# matches up to the first letter of the last name
pattern = re.compile(r'Mr\.?\s[A-Z]') 

# Step 4: \w* adds zero or more word characters after that first upper-case letter
pattern = re.compile(r'Mr\.?\s[A-Z]\w*')

# Step 5: a group with the "or" pipe (|) lists the alternatives after
# the M, so that Mr, Ms and Mrs are all captured
pattern = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*')

matches = pattern.finditer(text_to_search)

for found in matches:
    print(found)

<re.Match object; span=(212, 223), match='Mr. Schafer'>
<re.Match object; span=(224, 232), match='Mr Smith'>
<re.Match object; span=(233, 241), match='Ms Davis'>
<re.Match object; span=(242, 255), match='Mrs. Robinson'>
<re.Match object; span=(256, 261), match='Mr. T'>


### Combine Concepts from Above

In [99]:
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''


# The pattern is refined in three steps; each assignment replaces the
# previous one, so only the final pattern is used for the search.

# Step 1: [a-zA-Z]+ is one or more upper/lower-case letters up to the @,
# then one or more letters up to a literal '.com'.
# This version would yield only the first email.
pattern = re.compile(r'[a-zA-Z]+@[a-zA-Z]+\.com')

# Step 2: the second email has a period before the @ and ends in .edu,
# so '.' joins the first character set and 'edu' becomes an alternative.
pattern = re.compile(r'[a-zA-Z.]+@[a-zA-Z]+\.(com|edu)')

# Step 3: the third email contains digits and hyphens and ends in .net,
# so 0-9 and '-' are added before the @, '-' after it, and 'net' is
# added to the alternatives.
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z-]+\.(com|edu|net)')

matches = pattern.finditer(emails) 

for found in matches:
    print(found)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


## Select characters from a String

In [122]:
urls = '''
https://www.google.com
http://coryms.com
https://youtube.com
https://www.nasa.gov
'''

# First version: match a whole URL.
#   s?        -> the ? makes the 's' of https optional
#   (www\.)?  -> putting 'www.' in a group lets ? apply to the whole
#                prefix, since some URLs have no www
#   \w+       -> one or more word characters up to the period
#   \.\w+     -> a literal period, then one or more word characters
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')

# Since we are interested in the domain and the top-level domain,
#    parentheses are used to create 4 different groups:
#      1: http(s)://   2: www.   3: domain name   4: top-level domain (.com, .gov)
#      group 0 is the entire url
# This assignment replaces the first version above.
pattern = re.compile(r'(https?://)(www\.)?(\w+)(\.\w+)')

# Search the `urls` string defined at the top of this cell.
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(1))  # try group 0, 1, 2

https://
http://
https://
https://


In [124]:
# Back-references to the groups defined above select which parts of
#   each matched URL to keep: \3 is the domain name and \4 is the
#   top-level domain, so everything else in the match is dropped.

subbed_urls = pattern.sub(r'\3\4', urls)
print(subbed_urls)


google.com
coryms.com
youtube.com
nasa.gov



### Findall Method

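`finditer` returns an iterator of match objects, which carry the matched text plus extra information such as the span.
`findall` returns just the matched strings; if the pattern contains a group, it returns the text of the group instead of the whole match.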

In [129]:
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')

## finditer: yields one match object per hit
matches = pattern.finditer(text_to_search)
for found in matches:
    print(found)

## findall: returns plain strings; because this pattern contains a
## group, only the text captured by the group is returned
matches = pattern.findall(text_to_search)
for found in matches:
    print(found)

<re.Match object; span=(212, 223), match='Mr. Schafer'>
<re.Match object; span=(224, 232), match='Mr Smith'>
<re.Match object; span=(233, 241), match='Ms Davis'>
<re.Match object; span=(242, 255), match='Mrs. Robinson'>
<re.Match object; span=(256, 261), match='Mr. T'>
Mr
Mr
Ms
Mrs
Mr


<re.Match object; span=(212, 223), match='Mr. Schafer'>
<re.Match object; span=(224, 232), match='Mr Smith'>
<re.Match object; span=(233, 241), match='Ms Davis'>
<re.Match object; span=(242, 255), match='Mrs. Robinson'>
<re.Match object; span=(256, 261), match='Mr. T'>


### Match Method

Determine whether a pattern matches at the beginning of a string.

In [134]:
## match() checks whether the pattern matches the string,
##   but only at the very beginning of the string.

sentence = 'Start a sentence and then bring it to an end'

pattern = re.compile(r'Start')

## match() returns a single match object (or None when there is no
## match) rather than an iterator, so no for loop is needed.
matches = pattern.match(sentence)

print(matches)

<re.Match object; span=(0, 5), match='Start'>


### Search Method
Searches for a pattern anywhere within a string.
Returns only the first match it finds.

In [136]:
sentence = 'Start a sentence and then bring it to an end'

pattern = re.compile(r'bring')

## search() returns a single match object for the first hit rather than
## an iterator, so no for loop is needed.
matches = pattern.search(sentence)

print(matches)

## Searching for a string that isn't present produces 'None'.

<re.Match object; span=(26, 31), match='bring'>


### Flags

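Flags are optional arguments that change how a pattern is matched (for example, ignoring case), which can cut down on the syntax needed in the pattern itself.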

In [139]:
sentence = 'Start a sentence and then bring it to an end'

## re.IGNORECASE makes the search case-insensitive, so the lower-case
## pattern 'start' still finds the capitalised 'Start' in the sentence.
pattern = re.compile(r'start', re.IGNORECASE)

matches = pattern.search(sentence)
print(matches)

<re.Match object; span=(0, 5), match='Start'>
