### re module in Python to deal with Regular Expressions (also called RegEx)

In [2]:
import re

In [None]:
'''Rikkis email id is rikki@gmail.com
raheela@hotmail.com is Raheelas email
In my contact book ashish@yahoo.com is saved as email for Ashish
Taniya goes by at least two email IDs taniya@rediff.com and taaaaaaaniya@outlook.com'''

{<alphanum>+@<alphanumeric>[.]com}

In [None]:
# We build patterns using:

# Sequences
# Quantifiers
# Metacharacters

# re module also has useful functions/methods to help us in our task of extracting patterns from text. 

In [None]:
#1. Regex pattern matches character by character
#2. It performs a greedy match

In [None]:
'Baaataaaaataaaaa'

# {a+}

# aaa
# aaaaa
# aaaaa

In [None]:
# We use r (raw string) before a pattern definition in regex to avoid having Python treat the \ in regex sequences as 
# escape characters.

In [8]:
str1 = 'She sells sea shells on the sea shore'

pattern = r'\bs\w+'

output = re.findall(pattern, str1)

print(output)

['sells', 'sea', 'shells', 'sea', 'shore']


In [None]:
# \b = look at beginning of word
# s = alphabet s
# \w = alphanumeric character
# + = 1 or more occurrences of the character preceding it. 

#### Common regex methods

In [21]:
# findall method - outputs all the matches of the pattern inside a list. 

str1 = 'She sells sea shells on the sea shore'

pattern = r'\bs\w+'

output = re.findall(pattern, str1)

print(output)

['sells', 'sea', 'shells', 'sea', 'shore']


In [22]:
# search method - finds the first occurrence of the pattern and outputs a match object

In [10]:
output = re.search(pattern,str1)

print(output)

<re.Match object; span=(4, 9), match='sells'>


In [23]:
# match method - checks if the pattern we want to match is at the beginning of the string.

In [24]:
pattern = r'S\w+'

output = re.match(pattern,str1)

print(output)

<re.Match object; span=(0, 3), match='She'>


In [25]:
pattern = r's\w+'

output = re.match(pattern,str1)

print(output)

None


In [None]:
# finditer method - finds ALL the occurrences of the pattern in the input string but, unlike findall, returns an iterator
# object that yields one match object per match. The iterator can be consumed only once.

In [None]:
iterator object
[match object
match object
match object]

In [29]:
str1 = 'She sells sea shells on the sea shore'

pattern = r'\bs\w+'

output = re.finditer(pattern, str1)

print(output)

<callable_iterator object at 0x000001A1B4A35430>


In [27]:
for match in output:
    print(match)

<re.Match object; span=(4, 9), match='sells'>
<re.Match object; span=(10, 13), match='sea'>
<re.Match object; span=(14, 20), match='shells'>
<re.Match object; span=(28, 31), match='sea'>
<re.Match object; span=(32, 37), match='shore'>


In [30]:
# Iterate over a fresh iterator: the one stored in `output` is exhausted once it has been
# looped over, and looping over an exhausted iterator prints nothing.
for match in re.finditer(pattern, str1):
    # span() is the (start, end) index pair of the match, group() is the matched text.
    print(match.span(), match.group(), sep='--->')
    

(4, 9)--->sells
(10, 13)--->sea
(14, 20)--->shells
(28, 31)--->sea
(32, 37)--->shore


In [31]:
# compile method - compiles the given pattern into a reusable pattern object (re.Pattern). The pattern object has its own
# findall, search, match, finditer ... methods, and it can also be passed to the module-level re functions.



str1 = 'She sells sea shells on the sea shore'

pattern = re.compile(r'\bs\w+')

output = re.findall(pattern, str1)

print(pattern)
print(type(pattern))
print(output)

re.compile('\\bs\\w+')
<class 're.Pattern'>
['sells', 'sea', 'shells', 'sea', 'shore']


In [32]:
output2 = pattern.findall(str1)

print(output2)

['sells', 'sea', 'shells', 'sea', 'shore']


In [None]:
# match objects have certain methods and attributes. Generally used ones are:

In [33]:
# group() - to output the string that matched our pattern
# span() - start,end index of the match in the form of a tuple
# start() - start index of the match
# end() - end index of the match
# string - <attribute> - The input string where the match(es) was(were) found.

In [15]:
# `output` was last rebound to the list returned by findall, and a list has no group().
# Run search again so that this cell and the following ones operate on a match object.
output = re.search(pattern, str1)
output.group()

'sells'

In [16]:
output.span()

(4, 9)

In [17]:
output.start()

4

In [18]:
output.end()

9

In [20]:
output.string

'She sells sea shells on the sea shore'

In [14]:
print(dir(output))

['__class__', '__class_getitem__', '__copy__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'end', 'endpos', 'expand', 'group', 'groupdict', 'groups', 'lastgroup', 'lastindex', 'pos', 're', 'regs', 'span', 'start', 'string']


In [None]:
# pattern = 'boot*'

# str1 = 'boottttttttttt'

In [None]:
# + = 1 or more occurrences of the character/sequence preceding it
# * = 0 or more occurrences of the character/sequence preceding it.

#### Sequences in Regex

In [None]:
# Special Sequence             Description
# \A                           Matches if the string begins with the given pattern

# \b                           Matches if the word begins or ends with the given character.(\b before pattern to check if it
#                              begins with the pattern and \b after pattern to see if it ends with the specified pattern).
# \B                           It is the opposite of the \b i.e. the word should not start or end with the given regex.
# \d                           Matches any decimal digit, this is equivalent to the set class [0-9]
# \D                           Matches any non-digit character, this is equivalent to the set class [^0-9]
# \s                           Matches any whitespace character.
# \S                           Matches any non-whitespace character
# \w                           Matches any alphanumeric character, this is equivalent to the class [a-zA-Z0-9_].
# \W                           Matches any non-alphanumeric character.
# \Z                           Matches if the string ends with the given regex

In [None]:
# \A - Matches pattern at beginning of string. Almost exactly as metacharacter ^. 
# \Z - Matches pattern at end of string. Almost exactly as metacharacter $.

In [36]:
str1 = 'Peter Piper picked a peck of pickled peppers'

pattern = r'\APe\w+'

# \A anchors the match at the very beginning of the string
# The match must then start with the literal characters 'Pe'
# followed by at least 1 alphanumeric character (\w+)
# + is greedy, so matching continues for as long as alphanumeric characters follow.

output = re.findall(pattern, str1)

print(output)

['Peter']


In [39]:
str1 = 'Peter Piper picked a peck of pickled peppers'

pattern = r'^Pe\w+'

output = re.findall(pattern, str1)

print(output)

['Peter']


In [41]:
str1 = 'Peter Piper picked a peck of pickled peppers'

pattern = r'\w+rs\Z'

output = re.findall(pattern, str1)

print(output)

['peppers']


In [43]:
str1 = 'Peter Piper picked a peck of pickled peppers'

pattern = r'\w+d\Z'

output = re.findall(pattern, str1)

print(output)

[]


In [44]:
str1 = 'Peter Piper picked a peck of pickled peppers'

pattern = r'\w+rs$'

output = re.findall(pattern, str1)

print(output)

['peppers']


In [45]:
# \b - Looks for the pattern at the beginning (or end - depending on placement) of the word boundary for the pattern.

str1 = 'Peter Piper picked a peck of pickled peppers'

pattern = r'\bp\w+'

output = re.findall(pattern, str1)

print(output)

['picked', 'peck', 'pickled', 'peppers']


In [46]:
str1 = 'Peter Piper picked a peck of pickled peppers'

pattern = r'\w+d\b'

output = re.findall(pattern, str1)

print(output)

['picked', 'pickled']


In [None]:
# \b (at the beginning of pattern) - \A
# \b (at the end of pattern) - \Z

# Difference?

#\b - looks at beginning or end of word depending on placement while \A(^) and \Z($) look at beginning or end of string.

In [47]:
# \B - Look for pattern but the pattern CANNOT occur at beginning (or end - depending on placement) of word. 

# \B - Match any word that does not begin with the given pattern. - THIS IS COMPLETELY INCORRECT!

str1 = 'Peter Piper picked a peck of pickled peppers'

pattern = r'\Bp\w+'

output = re.findall(pattern, str1)

print(output)

['per', 'ppers']


In [48]:
# \d matches any digits - set class [0 - 9].


str1 = 'Pe4er P1per p1cked a peck of p1ckled peppers'

pattern = r'\d\w+'

output = re.findall(pattern, str1)

print(output)

['4er', '1per', '1cked', '1ckled']


In [49]:
#\D matches any NON DIGIT characters. 

# Alphabets (a to z, A to Z)
# Special Characters - @, %, ?, > ....  INCLUDING SPACE!
# Special sequences \n

str1 = 'Peter P%per picked @ peck of p1ckled pepper5'

pattern = r'\D+'

output = re.findall(pattern, str1)

print(output)

['Peter P%per picked @ peck of p', 'ckled pepper']


In [None]:
small a to small z
capital a to capital z

In [63]:
# \s matches any whitespace character (space, tab, newline, ...)

str1 = 'Taniya Pal 100 Raheela Shabreen 100 Rajat Bhat 100 Rashmika Saravanan 100'

pattern = r'[a-zA-Z]\w+\s\w+'

output = re.findall(pattern, str1)

print(output)

['Taniya Pal', 'Raheela Shabreen', 'Rajat Bhat', 'Rashmika Saravanan']


In [3]:
str1 = 'Taniya Pal 100 Marks Raheela Shabreen 100 Marks Rajat Bhat 100 Marks Rashmika Saravanan 100 Marks'

# Keep the purely alphabetic words but skip the label 'Marks' with a negative lookahead (?!...).
# No lookahead is needed for '100': [A-Za-z]+ can never match digits, so numbers are skipped anyway.
pattern = r'\b(?!Marks\b)[A-Za-z]+\b'

output = re.findall(pattern, str1)

print(output)

['Taniya', 'Pal', 'Raheela', 'Shabreen', 'Rajat', 'Bhat', 'Rashmika', 'Saravanan']


In [65]:
# Define the input in this cell so the result does not depend on which earlier cell last assigned str1.
str1 = 'Taniya Pal 100 Raheela Shabreen 100 Rajat Bhat 100 Rashmika Saravanan 100'

# \D matches the single non-digit character (here a space) between two runs of \w characters,
# so consecutive words are paired up regardless of whether they are names or numbers.
pattern = r'\w+\D\w+'

output = re.findall(pattern, str1)
print(output)

['Taniya Pal', '100 Raheela', 'Shabreen 100', 'Rajat Bhat', '100 Rashmika', 'Saravanan 100']


In [68]:
# \S matches any character that is NOT whitespace character

# Includes:
# Alphabets
# Numbers
# Symbols

str1 = 'Taniya Pal !00 Raheela Shabreen !00 Rajat_Bhat 100 Rashmika Saravanan 100'

pattern = r'\S+'

output = re.findall(pattern, str1)

print(output)

['Taniya', 'Pal', '!00', 'Raheela', 'Shabreen', '!00', 'Rajat_Bhat', '100', 'Rashmika', 'Saravanan', '100']


In [70]:
# \w matches any alphanumeric character and underscore i.e. A to Z, a to z, 0 to 9 and _

str1 = 'Taniya Pal !00 Raheela Shabreen !00 Rajat_Bhat 100 Rashmika Saravanan 100'

pattern = r'\w+'

output = re.findall(pattern, str1)

print(output)

['Taniya', 'Pal', '00', 'Raheela', 'Shabreen', '00', 'Rajat_Bhat', '100', 'Rashmika', 'Saravanan', '100']


In [71]:
# \W matches any character that is NOT alphanumeric and NOT an underscore (the opposite of \w)

str1 = 'Taniya Pal !00 Raheela Shabreen !00 Rajat_Bhat 100 Rashmika Saravanan 100'

pattern = r'\W+'

output = re.findall(pattern, str1)

print(output)

[' ', ' !', ' ', ' ', ' !', ' ', ' ', ' ', ' ', ' ']


In [None]:
# There are also Metacharacters.

# MetaCharacters               Description
# \                            Used to drop the special meaning of character following it
# []                           Represent a character class
# ^                            Matches the beginning of string = \A
# $                            Matches the end of string = \Z
# .                            Matches any character except newline
# |                            Means OR (matches any one of the alternative patterns separated by it).

# And Quantifiers

# ?                            Matches zero or one occurrence - It signifies optional character.
# *                            Any number of occurrences (including 0 occurrences)
# +                            One or more occurrences
# {}                           Indicate the number of occurrences of a preceding regex to match.
# ()                           Enclose a group of Regex


In [None]:
# [apr] - will search for any one of the enclosed characters
# [^apr] - will NOT include the enclosed characters as part of search pattern.
# [0-5] - will include range of digits between 0 to 5
# [a-p] - will include range of alphabets between a to p
# [a-zA-Z] - Any alphabets between small a to small z and capital A to capital Z
# [1-5][3-7] - Any numbers that are between 1 to 5 as first digit and 3 to 7 as second digit
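# [+*] - Most metacharacters lose their special meaning inside square brackets, so + means we will look for the
# + character, * means we will look for the * character and so on. The exceptions are ^ (negates the class when it
# comes first), - (forms a range between two characters), ] (closes the class) and \ (escape).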

In [10]:
import re

str1 = 'Hoot Boot Coot Loot Soot Foot Moot Root Toot SHoot GRoot'

pattern = r'\b[HLSR]oot'

result = re.findall(pattern, str1)

print(result)

['Hoot', 'Loot', 'Soot', 'Root']


In [11]:
str1 = 'Hoot Boot Coot Loot Soot Foot Moot Root Toot SHoot GRoot'

pattern = r'\b[^HLSR]oot'

result = re.findall(pattern, str1)

print(result)

['Boot', 'Coot', 'Foot', 'Moot', 'Toot']


In [12]:
str1 = 'Hoot Boot Coot Loot Soot Foot Moot Root Toot SHoot GRoot'

pattern = r'\b[B-L]oot'

result = re.findall(pattern, str1)

print(result)

['Hoot', 'Boot', 'Coot', 'Loot', 'Foot']


In [13]:
str1 = 'hoot Boot Coot loot Soot foot Moot Root Toot SHoot GRoot'

pattern = r'\b[B-Lb-l]oot'

result = re.findall(pattern, str1)

print(result)

['hoot', 'Boot', 'Coot', 'loot', 'foot']


In [14]:
str1 = 'hoot Boot Coot loot Soot foot Moot Root Toot SHoot GRoot'

pattern = r'[HLSR]oot'

result = re.findall(pattern, str1)

print(result)

['Soot', 'Root', 'Hoot', 'Root']


In [17]:
str1 = '56 79 32 41 12 63 88'

pattern = r'[5-8][2-6]'

result = re.findall(pattern, str1)

print(result)

['56', '63']


In [18]:
str1 = 'I am a shooting * and light up the sky. You + me makes a symphony'

pattern = r'[t*+.]\s\w+'

result = re.findall(pattern, str1)

print(result)

['* and', 't up', '. You', '+ me']


In [20]:
str1 = 'Jack and Jill went up the hill \nTo Fetch a pail of water'

pattern = r'.+'

result = re.findall(pattern, str1)

print(result)

['Jack and Jill went up the hill ', 'To Fetch a pail of water']


In [25]:
str1 = 'Monkey Donkey Ponkey'

pattern = r'Monkey|Donkey'

result = re.findall(pattern, str1)

print(result)

['Monkey', 'Donkey']


In [26]:
emails = 'MJ mj@gmail.com Kobe kb24@hotmail.com LeBron lbj@rediff.com'

# | separates two complete alternative patterns: a gmail address OR a hotmail address.
# The dot is written as [.] so that it matches a literal '.'; a bare . matches any character
# and would also accept text such as 'mj@gmailXcom'.
pattern = r'\w+@gmail[.]com|\w+@hotmail[.]com'

result = re.findall(pattern, emails)

print(result)

['mj@gmail.com', 'kb24@hotmail.com']


In [None]:
# * means 0 or more occurrences of character preceding it - 0 to infinity
# + means 1 or more occurrences of character preceding it - 1 to infinity
# ? means 0 or 1 occurrences of character preceding it. 

In [27]:
str1 = 'colour color colouur'

pattern = r'colou?r'

result = re.findall(pattern, str1)

print(result)

['colour', 'color']


In [28]:
str1 = 'colouuur colouur colour colouuuuuuuuuuur colouuuur'

pattern = r'colou{3}r'

result = re.findall(pattern, str1)

print(result)

['colouuur']


In [30]:
str1 = 'colouuur colouur colour colouuuuuuuuuuur colouuuur'

pattern = r'colou{2,4}r'

result = re.findall(pattern, str1)

print(result)

['colouuur', 'colouur', 'colouuuur']


In [37]:
# groups

str1 = 'I love basketball * You love football * She loves volleyball * They love throwball * We love baseball'

strlst = str1.split('*')

print(strlst)

['I love basketball ', ' You love football ', ' She loves volleyball ', ' They love throwball ', ' We love baseball']


In [41]:
for item in strlst:
    match = re.search(r'\w+ love (basket|base)ball', item)
    if match:
        print(match)
        print(match.group())

<re.Match object; span=(0, 17), match='I love basketball'>
I love basketball
<re.Match object; span=(1, 17), match='We love baseball'>
We love baseball


In [54]:
emails = 'Michael Jordan email mj@gmail.com   Kobe Bryant email kb24@hotmail.com LeBron James email lbj@outlook.com'

pattern = r'(\w+) (\w+) email (\w+@\w+[.]com)'

result = re.finditer(pattern, emails)

print(result)

<callable_iterator object at 0x0000020BF3EE7130>


In [55]:
for item in result:
    print(item)
    print(item.groups())
    print(f'Name : {item.group(1)}')
    print(f'Last Name : {item.group(2)}')
    print(f'Email : {item.group(3)}')

<re.Match object; span=(0, 33), match='Michael Jordan email mj@gmail.com'>
('Michael', 'Jordan', 'mj@gmail.com')
Name : Michael
Last Name : Jordan
Email : mj@gmail.com
<re.Match object; span=(36, 70), match='Kobe Bryant email kb24@hotmail.com'>
('Kobe', 'Bryant', 'kb24@hotmail.com')
Name : Kobe
Last Name : Bryant
Email : kb24@hotmail.com
<re.Match object; span=(71, 105), match='LeBron James email lbj@outlook.com'>
('LeBron', 'James', 'lbj@outlook.com')
Name : LeBron
Last Name : James
Email : lbj@outlook.com


In [56]:
#split method in re takes 4 parameters

#re.split(pattern, string, maxsplit=0, flags=0)

#1. The regex pattern - mandatory
#2. The string to be checked - mandatory
#3. Maxsplit - count of how many maximum splits we want
#4. flags - optional modifiers such as re.I, re.M and re.S (covered further below).


In [57]:
emails = 'Michael Jordan email mj@gmail.com   Kobe Bryant email kb24@hotmail.com LeBron James email lbj@outlook.com'

pattern = r' email \w+@\w+[.]com'

result = re.split(pattern,emails)

print(result)

['Michael Jordan', '   Kobe Bryant', ' LeBron James', '']


In [58]:
emails = 'Michael Jordan email mj@gmail.com   Kobe Bryant email kb24@hotmail.com LeBron James email lbj@outlook.com'

pattern = r' email \w+@\w+[.]com'

result = re.split(pattern,emails, maxsplit = 2)

print(result)

['Michael Jordan', '   Kobe Bryant', ' LeBron James email lbj@outlook.com']


In [59]:
# re.sub(pattern, repl, string, count=0, flags=0) accepts up to five arguments:
#   pattern - the regex to look for (mandatory)
#   repl    - the replacement text (mandatory)
#   string  - the text to be scanned (mandatory)
#   count   - the maximum number of replacements to perform (optional)
#   flags   - regex flags (optional)

input_str = 'ayz abc xyz Abc xyz aBc'
sub_str = r'a\w+'
repl_str = r'PQR'

# Same call spelled with keyword arguments so that each value is tied to its parameter name.
result = re.sub(pattern=sub_str, repl=repl_str, string=input_str)
print(result)

PQR PQR xyz Abc xyz PQR


In [60]:
input_str = 'ayz abc xyz Abc xyz aBc'

sub_str = r'a\w+'
repl_str = r'PQR'

result = re.sub(sub_str, repl_str, input_str, count = 2)
print(result)

PQR PQR xyz Abc xyz aBc


In [61]:
#Subn method is the same as the sub method except it provides the replacement count along with the replaced string as a 
# tuple

result = re.subn(sub_str, repl_str, input_str)

print(result)

('PQR PQR xyz Abc xyz PQR', 3)


In [63]:
result = re.subn(sub_str, repl_str, input_str, count = 10)

print(result)

('PQR PQR xyz Abc xyz PQR', 3)


In [65]:
# Short Name          Long Name         Effect
# re.I                re.IGNORECASE     Makes matching of alphabetic characters case-insensitive
# re.M                re.MULTILINE      Causes start-of-string and end-of-string anchors to match embedded newlines
# re.S                re.DOTALL         Causes the dot metacharacter to match a newline

In [66]:
# re.IGNORECASE - makes the pattern matching case insensitive. Short code re.I

str1 = 'Betty bought some butter but the butter was bitter so Betty bought some better butter to make the bitter butter better'

pattern = r'\bb\w+'

result = re.findall(pattern, str1)

print(result)

['bought', 'butter', 'but', 'butter', 'bitter', 'bought', 'better', 'butter', 'bitter', 'butter', 'better']


In [67]:
result = re.findall(pattern, str1, flags = re.I)

print(result)


['Betty', 'bought', 'butter', 'but', 'butter', 'bitter', 'Betty', 'bought', 'better', 'butter', 'bitter', 'butter', 'better']


In [87]:
# re.MULTILINE makes ^ and $ match at the start and end of every line, not just of the whole string. Short code re.M

str1 = """Monkey sees banana\nMonkey dances the salsa\nMonkey runs to banana\nMonkey picks up banana\nMonkey sees hungry baby Monkey cry
Monkey gives baby Monkey banana"""

print(str1)

Monkey sees banana
Monkey dances the salsa
Monkey runs to banana
Monkey picks up banana
Monkey sees hungry baby Monkey cry
Monkey gives baby Monkey banana


In [88]:
pattern = r'^Monkey \w+'

result = re.findall(pattern,str1, flags = re.M)

print(result)

['Monkey sees', 'Monkey dances', 'Monkey runs', 'Monkey picks', 'Monkey sees', 'Monkey gives']


In [90]:
str1 = """Monkey sees banana\nMonkey dances the salsa\nMonkey runs to banana\nMonkey picks up banana\nMonkey sees hungry baby Monkey cry
Monkey gives baby Monkey banana"""

print(str1)

pattern = r'\AMonkey \w+'

result = re.findall(pattern,str1, flags = re.M)

print(result)

Monkey sees banana
Monkey dances the salsa
Monkey runs to banana
Monkey picks up banana
Monkey sees hungry baby Monkey cry
Monkey gives baby Monkey banana
['Monkey sees']


In [95]:
# re.DOTALL makes the . metacharacter match the newline character as well. Short code re.S

pattern = r'\b\w+\b.\b\w+\b'

result = re.findall(pattern, str1, flags = re.S)

print(result)

['Monkey sees', 'banana\nMonkey', 'dances the', 'salsa\nMonkey', 'runs to', 'banana\nMonkey', 'picks up', 'banana\nMonkey', 'sees hungry', 'baby Monkey', 'cry\nMonkey', 'gives baby', 'Monkey banana']


In [None]:
# Find all the email addresses in the following text (no extra spaces or characters allowed. Include all the different domain
# names such as gmail.com, yahoo.co.id etc..)

str_email = '''boleh di kirim ke email saya ekoprasetyo.crb@outlook.com tks...
boleh minta kirim ke db.maulana@gmail.com. 
dee.wien@yahoo.com. .
deninainggolan@yahoo.co.id Senior Quantity Surveyor
Fajar.rohita@hotmail.com, terimakasih bu Cindy Hartanto
firmansyah1404@gmail.com saya mau dong bu cindy
fransiscajw@gmail.com
Hi Cindy ...pls share the Salary guide to donny_tri_wardono@yahoo.co.id thank a'''

In [None]:
# Find all the phone numbers. No extra spaces or characters allowed. 

str_phone = '''<p><strong>Kuala Lumpur</strong><strong>:</strong> +60 (0)3 2723 7900</p>
        <p><strong>Mutiara Damansara:</strong> +60 (0)3 2723 7900</p>
        <p><strong>Penang:</strong> + 60 (0)4 255 9000</p>
        <h2>Where we are </h2>
        <strong>&nbsp;Call us on:</strong>&nbsp;+6 (03) 8924 8686
        </p></div><div class="sys_two">
    <h3 class="parentSchool">General enquiries</h3><p style="FONT-SIZE: 11px">
     <strong>&nbsp;Call us on:</strong>&nbsp;+6 (03) 8924 8000
+ 60 (7) 268-6200 <br />
 Fax:<br /> 
 +60 (7) 228-6202<br /> 
Phone:</strong><strong style="color: #f00"> +601-4228-8055</strong>'''