In [1]:
###### Regular Expression Introduction #####

In [2]:
# use https://pythex.org/ for any regex validation
# also check regular expression cheat sheet

In [3]:
"""
\d  digit[0-9]
\w  letter, digit and underscore
\s whitespace character
\D not a digit
\W not a word character
\S not a whitespace character
.  any character except line break
"""

'\n\\d  digit[0-9]\n\\w  letter, digit and underscore\n\\s whitespace character\n\\D not a digit\n\\W not a word character\n\\S not a whitespace character\n.  any character except line break\n'

In [4]:
# Regex quantifiers
"""
+ one or more
{3} Exactly 3 times
{3,5} Three to Five times
{4,} Four or more times
* Zero or more times
? Once or none(optional)
"""

'\n+ one or more\n{3} Exactly 3 times\n{3,5} Three to Five times\n{4,} Four or more times\n* Zero or more times\n? Once or none(optional)\n'

In [None]:
# character classes
"""
[aeiou] Matches any single letter from the set a, e, i, o, u
[aeiou]{2} 2 continuous vowels
[a-f] Only letters between a and f
[0-9] any digit from 0 to 9
[^k] anything that is not k
[^0-9aeiou] anything that is not in these list of characters
"""

In [None]:
#Anchors and Boundaries
"""
^ Start of string or line
$ End of string or line
\b word boundary
"""

In [1]:
# Logical OR (pipe character)
"""
(\(\d{3}\)|\d{3}) \d{3}-\d{4}  - either this or that
(Mr\.|Mrs\.) ([A-Za-z]+ [A-Za-z]+) - Also grouping with ()
"""

'\n(\\(\\d{3}\\)|\\d{3}) \\d{3}-\\d{4}  - either this or that\n(Mr\\.|Mrs\\.) ([A-Za-z]+ [A-Za-z]+) - Also grouping with ()\n'

In [3]:
import re

# Compile the pattern once so the same object can be reused for several
# searches. The r'' (raw string) prefix keeps backslashes such as \d intact
# for the regex engine.
pattern = re.compile(r'\d{3} \d{3}-\d{4}')

# search() scans the string and returns a Match object for the first hit,
# or None when the pattern does not occur anywhere.
res = pattern.search('My phone number is 995 211-9993!')
res1 = pattern.search("547875454 rdffgfg")

# One print call, one value per line: the Match object first, then None.
print(res, res1, sep="\n")

<re.Match object; span=(19, 31), match='995 211-9993'>
None


In [4]:
res.group() # returns the full text matched by the pattern, taken from the Match object

'995 211-9993'

In [5]:
# findall() collects every non-overlapping match in the string; because this
# pattern has no capture groups, each item is the matched text itself.
res2 = pattern.findall('My phone number is 995 211-9993! and also 960 027-1975#')

In [6]:
res2 # list of all matched phone-number strings, in order of appearance

['995 211-9993', '960 027-1975']

In [8]:
# The module-level re.search() can be called directly, without an explicit compile() step.
re.search(r'\d{3} \d{3}-\d{4}', 'My phone number is 995 211-9993!').group() # no reusable pattern object is kept here; NOTE(review): the re module caches recently compiled patterns internally, so the pattern is not necessarily recompiled on every call

'995 211-9993'

In [9]:
#another demo

In [19]:
import re

def extract_phone(input):
    """Return the first phone number found in ``input``, or None.

    A phone number has the form ``ddd ddd-dddd``. A word boundary is required
    on BOTH sides, so a number that is only a fragment of a longer run of
    digits/letters (e.g. ``1234995 211-9993`` or ``995 211-9993993``) is
    rejected instead of being returned as a partial match.

    Args:
        input: The text to search.

    Returns:
        The matched phone number as a string, or None when there is no match.
    """
    # The leading \b mirrors the trailing one: without it the match could
    # start in the middle of a longer digit sequence.
    phone_regex = re.compile(r'\b\d{3} \d{3}-\d{4}\b')
    match = phone_regex.search(input)
    if match:
        return match.group()
    return None

def extract_all_phones(input):
    """Return every phone number found in ``input``.

    A phone number has the form ``ddd ddd-dddd`` and must be delimited by a
    word boundary on both sides, so fragments of longer digit/letter runs are
    not reported.

    Args:
        input: The text to search.

    Returns:
        A list of matched phone-number strings in order of appearance, or
        None (not an empty list) when nothing matches.
    """
    # Leading \b added for symmetry with the trailing one; otherwise
    # "77995 211-9993" would yield the partial number "995 211-9993".
    phone_regex = re.compile(r'\b\d{3} \d{3}-\d{4}\b')
    match = phone_regex.findall(input)
    if match:
        return match
    return None

def is_valid_phone(input):
    """Return True when ``input`` is exactly one phone number (``ddd ddd-dddd``).

    The pattern is anchored at both ends so that any extra text before or
    after the number makes the input invalid.

    Args:
        input: The candidate string.

    Returns:
        True if the whole string is a phone number, otherwise False.
    """
    # ^ anchors the start. \Z is used for the end instead of $, because $ also
    # matches just before a trailing newline and would accept "995 211-9993\n".
    phone_regex = re.compile(r'^\d{3} \d{3}-\d{4}\Z')
    match = phone_regex.search(input)
    if match:
        return True
    return False

def is_valid_phone_version2(input):
    """Tell whether ``input`` consists of a single ``ddd ddd-dddd`` phone number.

    This variant relies on ``fullmatch()``, which only succeeds when the
    pattern covers the entire string, so no ``^``/``$`` anchors are needed
    in the pattern itself.

    Args:
        input: The candidate string.

    Returns:
        True if the entire string matches the phone pattern, otherwise False.
    """
    phone_regex = re.compile(r'\d{3} \d{3}-\d{4}')
    return phone_regex.fullmatch(input) is not None

# extract_phone: prints the first number found, or None when there is no match.
for sample in (
    "My number is 995 211-9993",      # works
    "My number is 995 211-9993993",   # None
    "995 211-9993 dfgfgfd",
):
    print(extract_phone(sample))

# extract_all_phones: prints the list of every number in the text.
print(extract_all_phones("995 211-9993 dfgfgfd 995 211-9993"))

# Both validators are run on the same three inputs: True, False, False each.
for validator in (is_valid_phone, is_valid_phone_version2):
    for sample in ("995 211-9993", "995 211-9993 dfdf", "ssds 995 211-9993 dfdf "):
        print(validator(sample))

995 211-9993
None
995 211-9993
['995 211-9993', '995 211-9993']
True
False
False
True
False
False


In [None]:
#### Parsing URLs with python ####

In [27]:
import re

# Three capture groups:
#   1. protocol            -> http or https
#   2. domain              -> www.<name>.<tld>; the name may contain letters,
#                             digits and hyphens (digits were previously
#                             rejected, so e.g. www.web3.org did not match)
#   3. everything after it -> path, query string, fragment (may be empty)
url_regex = re.compile(r'(https?)://(www\.[A-Za-z0-9-]{2,256}\.[a-z]{2,6})([-a-zA-Z0-9@:%_\+.~#?&/=]*)')

match = url_regex.search("https://www.google.com")

print(match.group())  # the entire matched text
print(match.group(0)) # group 0 is the entire match, identical to group()
print(match.group(1)) # first capture group
print(match.group(2)) # second capture group
print(match.group(3)) # third capture group (empty string here: nothing follows the domain)
print(match.groups()) # note: groups(), plural, returns a tuple of all capture groups

print(f"Protocol: {match.group(1)}")
print(f"Domain: {match.group(2)}")
print(f"Everything else in the given URL: {match.group(3)}")

https://www.google.com
https://www.google.com
https
www.google.com

('https', 'www.google.com', '')
Protocol: https
Domain: www.google.com
Everything else in the given URL: 


In [31]:
#another demo - Symbolic Group Names

import re

def parse_name(input):
    """Print the first and last name from a titled full name.

    The expected form is ``<title> <first> <last>`` where the title is one of
    ``Mr.``, ``Mrs.``, ``Ms.`` or ``Mdme.`` and both names consist of ASCII
    letters only. The first name is printed on one line and the last name on
    the next. When the input does not have this form, a short notice is
    printed instead of failing with an AttributeError on the missing match.

    Args:
        input: The full name string to parse.

    Returns:
        None. The result is printed.
    """
    # (?P<first>...) and (?P<last>...) are named groups; the names can be
    # passed to match.group() instead of numeric indexes.
    name_regex = re.compile(r'^(Mr\.|Mrs\.|Ms\.|Mdme\.) (?P<first>[A-Za-z]+) (?P<last>[A-Za-z]+)$')
    matches = name_regex.search(input)
    if matches is None:
        # search() returns None when nothing matches; guard before .group().
        print(f"No name found in: {input!r}")
        return
    print(matches.group('first'))
    print(matches.group('last'))
    
# Demo call: the title "Ms." is matched but not printed; only the two named groups are.
parse_name("Ms. Kanu Hasini")

Kanu
Hasini


In [32]:
##### Regex Compilation Flags #####

In [36]:
import re

# Compact single-line form of the e-mail pattern, kept for comparison with
# the verbose form below.
pat = re.compile(r'^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$')

# The same pattern in verbose mode: whitespace inside the pattern is ignored
# and everything after a # is a comment. re.X is the short alias of
# re.VERBOSE and re.I the short alias of re.IGNORECASE.
pattern = re.compile(r"""
    ^([a-z0-9_\.-]+)     #first part of email
    @                    #single @ sign
    ([\da-z\.-]+)        #email provider
    \.                   #single period
    ([a-z\.]{2,6})$      #com, org, net, etc.
""", flags=re.X | re.I)

# Lower-case address: whole match on one line, tuple of groups on the next.
match = pattern.search("kanuhasinid@gmail.com")
print(match.group(), match.groups(), sep="\n")

# Upper-case address still matches because of the IGNORECASE flag.
match = pattern.search("KANUHSAINID@GMAIL.COM")
print(match.group(), match.groups(), sep="\n")

kanuhasinid@gmail.com
('kanuhasinid', 'gmail', 'com')
KANUHSAINID@GMAIL.COM
('KANUHSAINID', 'GMAIL', 'COM')


In [37]:
#####  Regex Substitution Basic #### using sub() method

In [47]:
import re
text = "Last night Mrs. Bru and Mr. Dhinesh played with Ms. Kanu"

# Group 1 is the title, group 2 the first letter of the name that follows.
# The dots are escaped so that only a literal "." is accepted after the
# title; an unescaped "." would match ANY character (e.g. "Mrx Smith").
pattern = re.compile(r'(Mr\.|Mrs\.|Ms\.) ([a-z])[a-z]+', re.I)

# For reference: with two capture groups, findall() returns a list of
# (title, initial) tuples, and search(text).group() gives 'Mrs. Bru'.

# sub() replaces each match. \g<n> in the replacement refers to capture
# group n; raw strings are used so that "\g" is not treated as an (invalid)
# string escape sequence by Python itself.
result = pattern.sub("####", text)
result1 = pattern.sub(r"\g<1> Good", text)
result2 = pattern.sub(r"\g<1> \g<2>", text)
print(result)
print(result1)
print(result2)

Last night #### and #### played with ####
Last night Mrs. Good and Mr. Good played with Ms. Good
Last night Mrs. B and Mr. D played with Ms. K


In [48]:
### Swapping File Names ###

In [53]:
import re

titles = [
    "Significant Others (1987)",
    "Tales of the City (1978)",
    "The Days of Anna Madrigal (2014)",
    "Mary Ann in Autumn (2010)",
    "Further Tales of the City (1982)",
    "Babycakes (1984)",
    "More Tales of the City (1980)",
    "Sure of You (1989)",
    "Michael Tolliver Lives (2007)"
]
fixed_book_titles = []
# Group 1: the title text (word characters and spaces, from the start of the
# string). Group 2: the four-digit year inside the parentheses.
pattern = re.compile(r'(^[\w ]+) \((\d{4})\)')
for book in titles:
    # Rewrite "Title (YYYY)" as "YYYY - Title". The replacement is a raw
    # string so that "\g" reaches the regex engine untouched instead of being
    # flagged by Python as an invalid string escape sequence.
    result = pattern.sub(r"\g<2> - \g<1>", book)
    fixed_book_titles.append(result)
# With the year first, a plain lexicographic sort orders the books chronologically.
fixed_book_titles.sort()
print(fixed_book_titles)


['1978 - Tales of the City', '1980 - More Tales of the City', '1982 - Further Tales of the City', '1984 - Babycakes', '1987 - Significant Others', '1989 - Sure of You', '2007 - Michael Tolliver Lives', '2010 - Mary Ann in Autumn', '2014 - The Days of Anna Madrigal']
