In [1]:
import re
from collections.abc import Iterable


### Finding raw-string patterns


In [2]:
test_string = "123abc45678abc@"
pattern = re.compile(r"abc")

In [3]:
#finditer() returns match object with span and match 
finditer_matches = re.finditer(pattern, test_string)

In [4]:
#findall() returns only match
findall_matches = re.findall(pattern, test_string)

In [5]:
#search() returns first match object
search_matches = re.search(pattern, test_string)

In [6]:
#returns match object if pattern is found at the begining of the string, else None
match_matches = re.match(pattern, test_string)

In [7]:
def print_match(matches):
    """
    takes in match object, 
    returns the match objects that match the pattern specified in the argument
    
    """
    if isinstance(matches, Iterable):
        for match in matches:
            print(match)
    elif matches == None:
        return("There was no match")
    else:
        print(matches)

In [8]:
print_match(finditer_matches)

<re.Match object; span=(3, 6), match='abc'>
<re.Match object; span=(11, 14), match='abc'>


In [9]:
print_match(findall_matches)

abc
abc


In [10]:
print_match(search_matches)

<re.Match object; span=(3, 6), match='abc'>


In [11]:
print_match(match_matches)

'There was no match'

##### Match object methods - span(), start(), end(), group()

In [12]:
pattern = re.compile(r"123")
matches = re.finditer(pattern, test_string)
for match in matches:
    print(" Span  :",match.span(),
          "\n","Start :",match.start(),
          "\n","End   :",match.end(),
          "\n","Group :", match.group())

 Span  : (0, 3) 
 Start : 0 
 End   : 3 
 Group : 123


### Meta-characters 
##### #. , ^ , * , \$ , + , ? , { , } , [ , ] , \ , | , ( , ) 
 
 
``. - Any character except new line``                                                                                     
``^ - Starts with specified pattern e.g "^this is a"``                                                                 
``$ - Ends with specified pattern e.g "notebook\$"``                                                                 
``* - Zero or more occurences``                                                                                       
``+ - One or more occurences``                                                                                         
``{} - Exactly the specified number of occurences e.g 2 occurences as {2}``                                           
``[] - Set of characters e.g [0-9], [a-m]``                                                                           
``\ - Special sequence (or escape special characters) e.g "\d"``                                                       
``| - Either``                                                                                                         
``() - Capture and group``

Reference - [Python Engineer](https://www.youtube.com/watch?v=AEE9ecgLgdQ) 

In [13]:
def execute(pattern, test_string):
    """
    Returns the match object that matches specified 'pattern' in 'test string'
    """
    pattern = re.compile(pattern)
    matches = re.finditer(pattern, test_string)
    return(print_match(matches))

In [14]:
execute(r".", "This@is")

<re.Match object; span=(0, 1), match='T'>
<re.Match object; span=(1, 2), match='h'>
<re.Match object; span=(2, 3), match='i'>
<re.Match object; span=(3, 4), match='s'>
<re.Match object; span=(4, 5), match='@'>
<re.Match object; span=(5, 6), match='i'>
<re.Match object; span=(6, 7), match='s'>


In [15]:
execute(r"^Hey", "Hey boy")

<re.Match object; span=(0, 3), match='Hey'>


In [16]:
execute(r"^Hey", "Hello there!")

In [17]:
execute(r"end$", "The end")

<re.Match object; span=(4, 7), match='end'>


##### Special Characters 
``\d - Matches any decimal digit;[0-9]``                                                                              
``\D - Matches any non-decimal digit``                                                                                
``\s - Matches any white-space character (space " ", tab "\t", newline "\n"``                                         
``\S - Matches any non-whitespace character``                                                                         
``\w - Matches any alpha-numeric (word) character; [a-zA-Z0-9]``                                                       
``\W - Matches non-alphanumeric charcters``                                                                           
``\b - Matches where the specified characters are at the begining or end of a word``                                 
``\B - Matches where the specified characters are present but are not at the begining or end of a word``              

Reference - [Python Engineer](https://www.youtube.com/watch?v=AEE9ecgLgdQ) 

In [18]:
execute(r"\d", "How2@")

<re.Match object; span=(3, 4), match='2'>


In [19]:
execute(r"\D", "How2@")

<re.Match object; span=(0, 1), match='H'>
<re.Match object; span=(1, 2), match='o'>
<re.Match object; span=(2, 3), match='w'>
<re.Match object; span=(4, 5), match='@'>


In [20]:
execute(r"\s", "How are you")

<re.Match object; span=(3, 4), match=' '>
<re.Match object; span=(7, 8), match=' '>


In [21]:
execute(r"\S", "How are you")

<re.Match object; span=(0, 1), match='H'>
<re.Match object; span=(1, 2), match='o'>
<re.Match object; span=(2, 3), match='w'>
<re.Match object; span=(4, 5), match='a'>
<re.Match object; span=(5, 6), match='r'>
<re.Match object; span=(6, 7), match='e'>
<re.Match object; span=(8, 9), match='y'>
<re.Match object; span=(9, 10), match='o'>
<re.Match object; span=(10, 11), match='u'>


In [22]:
execute(r"\w", "How are you.")

<re.Match object; span=(0, 1), match='H'>
<re.Match object; span=(1, 2), match='o'>
<re.Match object; span=(2, 3), match='w'>
<re.Match object; span=(4, 5), match='a'>
<re.Match object; span=(5, 6), match='r'>
<re.Match object; span=(6, 7), match='e'>
<re.Match object; span=(8, 9), match='y'>
<re.Match object; span=(9, 10), match='o'>
<re.Match object; span=(10, 11), match='u'>


In [23]:
execute(r"\W", "How are you.")

<re.Match object; span=(3, 4), match=' '>
<re.Match object; span=(7, 8), match=' '>
<re.Match object; span=(11, 12), match='.'>


In [24]:
execute(r"\bhey", "first hey, second hey, third _hey")

<re.Match object; span=(6, 9), match='hey'>
<re.Match object; span=(18, 21), match='hey'>


In [25]:
execute(r"\Bhey", "first hey, second hey, third _hey")

<re.Match object; span=(30, 33), match='hey'>


In [26]:
execute(r"\d", "How12") == execute(r"[0-9]", "How12")

<re.Match object; span=(3, 4), match='1'>
<re.Match object; span=(4, 5), match='2'>
<re.Match object; span=(3, 4), match='1'>
<re.Match object; span=(4, 5), match='2'>


True

In [27]:
execute(r"[0-9h]","Howis123hey")

<re.Match object; span=(5, 6), match='1'>
<re.Match object; span=(6, 7), match='2'>
<re.Match object; span=(7, 8), match='3'>
<re.Match object; span=(8, 9), match='h'>


##### Using Sets

In [28]:
execute(r"[a-z]", "HEYhey")

<re.Match object; span=(3, 4), match='h'>
<re.Match object; span=(4, 5), match='e'>
<re.Match object; span=(5, 6), match='y'>


In [29]:
execute(r"[A-Z]", "HEYhey")

<re.Match object; span=(0, 1), match='H'>
<re.Match object; span=(1, 2), match='E'>
<re.Match object; span=(2, 3), match='Y'>


In [30]:
execute(r"[a-zA-Z]", "HEYhey")

<re.Match object; span=(0, 1), match='H'>
<re.Match object; span=(1, 2), match='E'>
<re.Match object; span=(2, 3), match='Y'>
<re.Match object; span=(3, 4), match='h'>
<re.Match object; span=(4, 5), match='e'>
<re.Match object; span=(5, 6), match='y'>


In [31]:
execute(r"[a-zA-Z0-9]", "1HEYhey")

<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(1, 2), match='H'>
<re.Match object; span=(2, 3), match='E'>
<re.Match object; span=(3, 4), match='Y'>
<re.Match object; span=(4, 5), match='h'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='y'>


#### Quantifiers

`` *     - 0 or more characters``                                                                                               
`` +     - 1 or more characters``                                                                                                                                      
`` ?     - 0 or 1 (optional character)``   
``{x}   - x no of characters``                                                                                           
``{x,y} - Between x and y no of characters``                                                                                                                                                      

In [32]:
execute(r"\d*", "Boys")

<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(1, 1), match=''>
<re.Match object; span=(2, 2), match=''>
<re.Match object; span=(3, 3), match=''>
<re.Match object; span=(4, 4), match=''>


In [33]:
execute(r"\d+", "Boys1")

<re.Match object; span=(4, 5), match='1'>


In [34]:
def execute_list(pattern, object_list):
    """returns match objects in 'object_list' that match the 'pattern' 
    """
    object_list = list(object_list)
    for _ in object_list:
        execute(pattern,_)

In [35]:
# return the last digit(s) in a list of ID's
ID = ['Ab0213',
      'AB654', 
      'AB002',
      'AB862']
id_pattern = r"\d+$"
execute_list(id_pattern, ID)

<re.Match object; span=(2, 6), match='0213'>
<re.Match object; span=(2, 5), match='654'>
<re.Match object; span=(2, 5), match='002'>
<re.Match object; span=(2, 5), match='862'>


In [36]:
#check for valid '.com' email
emails = ['debo@yahoo.com',
          'debo@yahoo.ng', 
          'sam@gmail.com', 
          'sam@hotlink.com', 
          'henry@gmail.uk']
dot_com_email_pattern = r"[a-zA-z]+\w+@\w+\.com"
execute_list(dot_com_email_pattern, emails)

<re.Match object; span=(0, 14), match='debo@yahoo.com'>
<re.Match object; span=(0, 13), match='sam@gmail.com'>
<re.Match object; span=(0, 15), match='sam@hotlink.com'>


In [37]:
date_list = ['Hey 2020/06/06',
             '2567', 
             '08-09-1984', 
             'error', 
             '2005-08-29']
date_pattern = r"(\d{4}([-/|]\d{2}){2})"  #yyyy-mm-dd or yyyy/mm/dd 
execute_list(date_pattern, date_list)

<re.Match object; span=(4, 14), match='2020/06/06'>
<re.Match object; span=(0, 10), match='2005-08-29'>


#### Using ``.group()``

- A regular expression can be divided into groups using brackets.
- Using the method  ``.group()`` or `` .group(0)`` returns the entire match object.
- Using the method  ``.group(i)`` returns the object in the $i^{th}$ group of the match object

In [38]:
#extracting coutry ID and order no from valid order id's
order_id =['Lag356',
           'UK234',
           'SA563',
           'LA478',
           'Null',
           'GH789',
           'Lag357']
pattern = re.compile(r"([a-zA-Z]+)(\d+)")
for i in order_id:
    matches = re.finditer(pattern,i)
    for match in matches:
        print("Country ID: %s , Order No: %s" 
              %(match.group(1),match.group(2)))

Country ID: Lag , Order No: 356
Country ID: UK , Order No: 234
Country ID: SA , Order No: 563
Country ID: LA , Order No: 478
Country ID: GH , Order No: 789
Country ID: Lag , Order No: 357


#### Using ``.split()``

In [39]:
#extracting a list containing, first, last, and optional other name
order_id =['Michael Jon',
           'Tobi Mike James',
           'Ebun Josh',
           'Danny Walter James']
pattern = re.compile(r" ")
for i in order_id:
    splitted_names = re.split(pattern,i)
    print(splitted_names)

['Michael', 'Jon']
['Tobi', 'Mike', 'James']
['Ebun', 'Josh']
['Danny', 'Walter', 'James']


#### Using ``.sub()``

In [40]:
def replace_with(item,pattern_to_replace,substitute):
    """replaces match object that matches 'pattern_to_replace' with 'substitute'
    """
    new_item = []
    item = list(item)
    pattern = re.compile(pattern_to_replace)
    for i in item:
        item_without_space = re.sub(pattern,substitute,i)
        new_item.append(item_without_space)
    return(new_item)

In [41]:
#replace k/K with '000', million/Million with '000000' and billion/Billion with '000000000'
prices = ['20 k', '10Billion','40billion','10million', '15 million', '35Million','50trillion']

In [42]:
price_pattern_dict = {r' ':'',
                r'k':'000',
                r'million':'000000',
                r'Million':'000000',
                r'billion':'000000000',
                r'Billion':'000000000',
                r'trillion':'000000000000',
                r'Trillion':'000000000000'}

for pattern, replace_value in price_pattern_dict.items():
    prices = replace_with(prices, pattern, replace_value)

In [43]:
prices

['20000',
 '10000000000',
 '40000000000',
 '10000000',
 '15000000',
 '35000000',
 '50000000000000']