# Regular Expressions 
can be used to search, edit and manipulate text. This opens up a vast variety of applications in all of the sub-domains under Python.

# Brief: 

A regex is a special sequence of characters that defines a pattern for complex string-matching functionality.

Rather than searching for a fixed substring like '123', suppose you wanted to determine whether a string contains any three consecutive decimal digit characters, as in the strings 'foo123bar', 'foo456bar', '234baz', and 'qux678'.

Strict character comparisons won't cut it here. This is where regexes in Python come to the rescue.

findall	:-------->Returns a list containing all matches
search	:-------->Returns a Match object if there is a match anywhere in the string
split	:-------->Returns a list where the string has been split at each match
sub	:-------->Replaces one or many matches with a string

# Syntax for re.search() function: re.search(<regex>, <string>,<flags>)
   
   If a match is found, then re.search() returns a match object. Otherwise, it returns None.

In [6]:
import re
s = 'foo123bar'

# Search once and reuse the result: re.search() returns a Match object
# (truthy) when the pattern is found and None otherwise.
match = re.search('123', s)
if match:
    print('Found a match.', match)
else:
    print('No match.')

Found a match. <_sre.SRE_Match object; span=(3, 6), match='123'>


In [93]:
#Generating Iterator

 
Str = "we need to inform him with the latest information"

# re.finditer() yields one Match object per non-overlapping hit; span() is the
# (start, end) index pair of that hit inside Str.
for hit in re.finditer("inform.", Str):
    print(hit.span())

(11, 18)
(38, 45)


# Python Regex Metacharacters
The real power of regex matching in Python emerges when a regex contains special characters called metacharacters. These have a unique meaning to the regex matching engine and vastly enhance the capability of the search.

In [7]:
s = 'foo123bar'
# Each [0-9] matches one decimal digit, so this looks for three digits in a row.
re.search('[0-9][0-9][0-9]', s)

<_sre.SRE_Match object; span=(3, 6), match='123'>

In [8]:
# '12' and '34' are separated by 'foo', so there are never three digits in a row.
print(re.search('[0-9][0-9][0-9]', '12foo34'))

None


In [10]:
# The dot (.) metacharacter acts as a wildcard: it matches any single
# character except a newline.
s1 = 'foo123bar'
s2 = 'f1er2sw4'
# '1.3' matches '123' in s1; '1.4' finds nothing in s2 because '1' and '4'
# are more than one character apart there.
print(re.search('1.3',s1),"SECOND OUTPUT:",re.search('1.4',s2))

<_sre.SRE_Match object; span=(3, 6), match='123'> SECOND OUTPUT: None


Metacharacters supported:  
[]	A set of characters	"[a-m]"	
\	Signals a special sequence (can also be used to escape special characters)	"\d"	
.	Any character (except newline character)	"he..o"	
^	Starts with	"^hello"	
$	Ends with	"world$"	
*	Zero or more occurrences	"aix*"	
+	One or more occurrences	"aix+"	
{}	Exactly the specified number of occurrences	"al{2}"	
|	Either or	"falls|stays"	
()	Capture and group

In [17]:
# [ ] --> Specifies a set of characters; any single one of them matches.
# Example 1
print(re.search('ba[artz]', 'foobarqux'))
print(re.search('foo[twear]','ballfoot'))
# Ranges of characters
print(re.search('[a-z][a-z]','FoOtbaLL')) # first place where two consecutive lowercase characters occur
print(re.search('[0-9][a-z]','FoOt9baLL')) # first place where a digit is immediately followed by a lowercase character


<_sre.SRE_Match object; span=(3, 6), match='bar'>
<_sre.SRE_Match object; span=(4, 8), match='foot'>
<_sre.SRE_Match object; span=(3, 5), match='tb'>
<_sre.SRE_Match object; span=(4, 6), match='9b'>


In [21]:
# Inside [], a leading ^ negates the set: [^0-9] matches the first character that is not a digit.
print("where 1st object isnt NUMBER returned>>>",re.search('[^0-9]', '12345foo'))

where 1st object isnt NUMBER returned>>> <_sre.SRE_Match object; span=(5, 6), match='f'>


In [22]:
# A hyphen placed last inside [] is a literal '-', not a range operator.
print(re.search('[abc-]', '123-456'),"where - found is returned")

<_sre.SRE_Match object; span=(3, 4), match='-'>


In [25]:
# DOT: matches any single character except a newline
print(re.search('foo.ball', 'fooxball'))
print(re.search('foo.ball', 'foo\nball'))  # no match: '.' does not match '\n' by default

<_sre.SRE_Match object; span=(0, 8), match='fooxball'>
None


In [28]:
# \w matches a single word character: a letter, a digit or an underscore.
# Raw strings keep the backslash intact for the regex engine.
print(re.search(r'\w', '#(.a$@&'))
# \W is the opposite: it matches a single NON-word character.
print(re.search(r'\W', '#(.a$@&'))

<_sre.SRE_Match object; span=(3, 4), match='a'>
<_sre.SRE_Match object; span=(0, 1), match='#'>


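\d -> matches any decimal digit
\D -> matches any character that is NOT a decimal digit
\s -> matches any whitespace character (including newline)
\S -> matches any character that is NOT whitespace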

In [29]:
# Several shorthand classes combined in one set: a digit, a word character
# or whitespace. Raw string so the backslashes reach the regex engine as-is.
print(re.search(r'[\d\w\s]', '---3---'))

<_sre.SRE_Match object; span=(3, 4), match='3'>


In [31]:
# Escaping metacharacters: a backslash makes a metacharacter match itself.
print(re.search('.','foo.bar'))    # unescaped dot: matches the very first character
print(re.search(r'\.','foo.bar'))  # escaped dot (raw string): matches only a literal '.'

<_sre.SRE_Match object; span=(0, 1), match='f'>
<_sre.SRE_Match object; span=(3, 4), match='.'>


In [33]:
# To match a literal backslash inside a string
s = r'foo\bar'
print(s)
print(re.search(r'\\', s)) # raw string: the regex \\ matches one literal backslash

foo\bar
<_sre.SRE_Match object; span=(3, 4), match='\\'>


Anchors
An anchor dictates a particular location in the search string where a match must occur.
FOR START:
^
\A
FOR END:
$
\Z

In [94]:
#CARET inside a set: [^...] negates the set (this is not the ^ start anchor)

import re
 
Str = "sat, hat, mat, pat"

# [^h-m]at: any character outside the range h..m, followed by "at".
someStr = re.findall("[^h-m]at", Str)

# One match per line.
for word in someStr:
    print(word)

sat
pat


In [118]:
test_list = ['coro\nna', 'i\ns', 'da\nngerous']
print(test_list)

# Remove every newline from each entry. A comprehension replaces the manual
# append loop, and the loop variable no longer reads like re.sub.
res = [re.sub('\n', '', item) for item in test_list]
print(res)

['coro\nna', 'i\ns', 'da\nngerous']
['corona', 'is', 'dangerous']


In [36]:
# ^ and \A both pin the match to the start of the string, so 'foo' is found
# in 'foobar' but not in 'barfoo'. \A is written in a raw string because it
# is not a valid Python string escape.
print("WILL WORK---",re.search('^foo', 'foobar'),"WONT WORK:---",re.search('^foo', 'barfoo'))
print("WILL WORK---",re.search(r'\Afoo', 'foobar'),"WONT WORK:--",re.search(r'\Afoo', 'barfoo'))

WILL WORK--- <_sre.SRE_Match object; span=(0, 3), match='foo'> WONT WORK:--- None
WILL WORK--- <_sre.SRE_Match object; span=(0, 3), match='foo'> WONT WORK:-- None


In [38]:
# $ and \Z both pin the match to the end of the string, so 'foo' is found
# in 'barfoo' but not in 'foobar'. \Z is written in a raw string because it
# is not a valid Python string escape.
print("WONT WORK---",re.search('foo$', 'foobar'),"WILL WORK:---",re.search('foo$', 'barfoo'))
print("WONT WORK---",re.search(r'foo\Z', 'foobar'),"WILL WORK:--",re.search(r'foo\Z', 'barfoo'))

WONT WORK--- None WILL WORK:--- <_sre.SRE_Match object; span=(3, 6), match='foo'>
WONT WORK--- None WILL WORK:-- <_sre.SRE_Match object; span=(3, 6), match='foo'>


In [60]:
# a{2,3} matches a run of two or three consecutive 'a' characters, so a
# single 'a' (as in 'dat') is not enough but 'aa' (as in 'daat') is.
pattern='a{2,3}'
print("WONT WORK-----",re.search(pattern, 'abc dat'),"WILL WORK-----",re.search(pattern,'abc daat'))

WONT WORK----- None WILL WORK----- <_sre.SRE_Match object; span=(5, 7), match='aa'>


In [95]:
# Replacement

# Compile the pattern once, then substitute every "rat" with "food".
regex = re.compile("[r]at")
Food = regex.sub("food", "hat rat mat pat")
print(Food)

hat food mat pat


In [40]:
txt = "The rain in Spain"
print(re.search("^The.*Spain$", txt)) # must start with "The" and end with "Spain"; .* spans everything in between

<_sre.SRE_Match object; span=(0, 17), match='The rain in Spain'>


Sets
A set is a set of characters inside a pair of square brackets [] with a special meaning:

[arn]	Returns a match where one of the specified characters (a, r, or n) is present	
[a-n]	Returns a match for any lower case character, alphabetically between a and n	
[^arn]	Returns a match for any character EXCEPT a, r, and n	
[0123]	Returns a match where any of the specified digits (0, 1, 2, or 3) are present	
[0-9]	Returns a match for any digit between 0 and 9	
[0-5][0-9]	Returns a match for any two-digit number from 00 to 59	
[a-zA-Z]	Returns a match for any character alphabetically between a and z, lower case OR upper case	
[+]	In sets, +, *, ., |, (), $, {} have no special meaning, so [+] means: return a match for any + character in the string

In [41]:
txt = "The rain in Spain"
# findall returns every non-overlapping occurrence of "ai" as a list of strings.
x = re.findall("ai", txt)
print(x)

['ai', 'ai']


In [61]:
string = 'hello 12 hi 89. Howdy 34'
# \d+ matches each run of one or more digits. Raw string so that \d is not
# treated as an (invalid) Python string escape.
pattern = r'\d+'

result = re.findall(pattern, string)
print(result)

['12', '89', '34']


In [42]:
x = re.findall("Portugal", txt)
print(x) # findall returns an empty list when nothing matches

[]


In [43]:
x = re.search("Portugal", txt)
print(x) # search returns None when nothing matches

None


In [44]:
# Split at every single whitespace character. Raw string so that \s is not
# treated as an (invalid) Python string escape.
x = re.split(r"\s", txt)
print(x)

['The', 'rain', 'in', 'Spain']


In [62]:
string = 'Twelve:12 Eighty nine:89.'
# Split wherever a run of digits occurs; the digits themselves are dropped.
# Raw string so that \d is not treated as an (invalid) Python string escape.
pattern = r'\d+'

result = re.split(pattern, string)
print(result)

['Twelve:', ' Eighty nine:', '.']


In [45]:
# Split at whitespace, but only once: maxsplit=1 leaves the rest of the string
# intact. maxsplit is passed by keyword (positional use is deprecated) and the
# pattern is a raw string so \s reaches the regex engine unchanged.
x = re.split(r"\s", txt, maxsplit=1)
print(x)

['The', 'rain in Spain']


In [47]:
# Replace only the first two whitespace characters with "------". count is
# passed by keyword (positional use is deprecated) and the pattern is a raw
# string so \s reaches the regex engine unchanged.
x = re.sub(r"\s", "------", txt, count=2)
print(x)

The------rain------in Spain


In [52]:
txt = "The rain in Spain"
x = re.search(r"\bS\w+", txt)
print(x.span()) # (start, end) indices of the first word that begins with an upper-case "S"

(12, 17)


In [53]:
x = re.search(r"\bS\w+", txt)
print(x.string) # the full input string that was passed to re.search()

The rain in Spain


In [54]:
x = re.search(r"\bS\w+", txt)
print(x.group()) # the part of the string that actually matched

Spain


In [63]:
#To remove whitespaces
# Program to remove all whitespaces
import re

# The trailing backslash continues the literal on the next source line without
# inserting a newline; the only newline in the value is the explicit \n.
string = 'abc 12\
de 23 \n f45 6'

# Matches every run of one or more whitespace characters. Raw string so that
# \s is not treated as an (invalid) Python string escape.
pattern = r'\s+'

# empty string
replace = ''

new_string = re.sub(pattern, replace, string)
print(new_string)

abc12de23f456


In [66]:

import re

# The trailing backslash continues the literal on the next source line without
# inserting a newline; the only newline in the value is the explicit \n.
string = 'abc 12\
de 23 \n f45 6'

# Matches every run of one or more whitespace characters (raw string).
pattern = r'\s+'
replace = ''

# re.subn() works like re.sub() but returns a tuple:
# (new string, number of substitutions made). The pattern variable is now
# actually used instead of repeating the regex inline.
new_string = re.subn(pattern, replace, string)
print(new_string)

('abc12de23f456', 4)


In [67]:
import re

string = '39801 356, 2102 1111'

# Three digits, one space, then two digits; each digit run is a capture
# group. Raw string so that \d is not treated as an (invalid) string escape.
pattern = r'(\d{3}) (\d{2})'

# match is a Match object on success and None when nothing is found.
match = re.search(pattern, string)

if match:
    print(match.group())
else:
    print("pattern not found")

801 35


In [68]:
import re

string = '\n and \r are escape sequences.'

# Collect every newline / carriage-return character found in the string.
line_breaks = re.compile(r'[\n\r]')
result = line_breaks.findall(string)
print(result)

# Output: ['\n', '\r']

['\n', '\r']


In [69]:
s = 'foo\nbar\nbaz'
# With re.MULTILINE, ^ also matches right after each newline, so 'bar' at the
# start of the second line is found.
print(re.search('^bar', s, re.MULTILINE))

<_sre.SRE_Match object; span=(4, 7), match='bar'>


In [70]:
def f(match_obj):
    """Replacement callback for re.sub().

    Returns the matched text multiplied by 10 (as a string) when it consists
    only of digits, and the matched text upper-cased otherwise.
    """
    text = match_obj.group(0)  # the whole matched substring
    return str(int(text) * 10) if text.isdigit() else text.upper()

re.sub(r'\w+', f, 'foo.10.bar.20.baz.30')

'FOO.100.BAR.200.BAZ.300'

In [91]:
import re
 
Nameage = '''
Janice is 22 and Theon is 33
Gabriel is 44 and Joey is 21
'''
# One- or two-digit numbers are the ages; capitalised words are the names.
ages = re.findall(r'\d{1,2}', Nameage)
names = re.findall(r'[A-Z][a-z]*',Nameage)

# Pair the n-th name with the n-th age. zip() replaces the hand-maintained
# index counter and stops at the shorter list instead of raising IndexError.
ageDict = dict(zip(names, ages))
print(ageDict)

['Janice', 'Theon', 'Gabriel', 'Joey']
['22', '33', '44', '21']
{'Janice': '22', 'Theon': '33', 'Gabriel': '44', 'Joey': '21'}


In [92]:
import re
 
# findall returns the matched text itself, once per occurrence: "inform"
# appears on its own and again inside "information".
sentence = "We need to inform him with the latest information!"
allinform = re.findall("inform", sentence)

for hit in allinform:
    print(hit)

inform
inform


In [141]:
ph_list=["444-122-1234",'123-122-78999','111-123-23','67-7890-2019']

# A valid number is exactly NNN-NNN-NNNN, digits only. fullmatch() anchors the
# pattern at both ends; the previous search() with only a trailing $ accepted
# extra leading characters, and \w also accepted letters and underscores.
phone_re = re.compile(r"\d{3}-\d{3}-\d{4}")

for ph in ph_list:
    print(ph)
    if phone_re.fullmatch(ph):
        # Also show the first run of four word characters in the number.
        print("Valid phone number",re.search(r"\w{4}", ph))
    else:
        print("send it for correction")

444-122-1234
Valid phone number <_sre.SRE_Match object; span=(8, 12), match='1234'>
123-122-78999
send it for correction
111-123-23
send it for correction
67-7890-2019
send it for correction


In [206]:
import urllib.request
from re import findall

import requests
import urllib3.request

url = "http://www.summet.com/dmsi/html/codesamples/addresses.html"

# urlopen lives in the standard-library urllib.request module, which has to be
# imported explicitly: importing urllib3 does not bind the name `urllib`.
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    html_res = response.read()
htmlStr = html_res.decode()

# Phone numbers of the form "(NNN) NNN-NNNN". Raw string so the escaped
# parentheses and \d reach the regex engine unchanged.
pdata = findall(r"\(\d{3}\) \d{3}-\d{4}", htmlStr)
for item in pdata:
    print(item)

(257) 563-7401
(372) 587-2335
(786) 713-8616
(793) 151-6230
(492) 709-6392
(654) 393-5734
(404) 960-3807
(314) 244-6306
(947) 278-5929
(684) 579-1879
(389) 737-2852
(660) 663-4518
(608) 265-2215
(959) 119-8364
(468) 353-2641
(248) 675-4007
(939) 353-1107
(570) 873-7090
(302) 259-2375
(717) 450-4729
(453) 391-4650
(559) 104-5475
(387) 142-9434
(516) 745-4496
(326) 677-3419
(746) 679-2470
(455) 430-0989
(490) 936-4694
(985) 834-8285
(662) 661-1446
(802) 668-8240
(477) 768-9247
(791) 239-9057
(832) 109-0213
(837) 196-3274
(268) 442-2428
(850) 676-5117
(861) 546-5032
(176) 805-4108
(715) 912-6931
(993) 554-0563
(357) 616-5411
(121) 347-0086
(304) 506-6314
(425) 288-2332
(145) 987-4962
(187) 582-9707
(750) 558-3965
(492) 467-3131
(774) 914-2510
(888) 106-8550
(539) 567-3573
(693) 337-2849
(545) 604-9386
(221) 156-5026
(414) 876-0865
(932) 726-8645
(726) 710-9826
(622) 594-1662
(948) 600-8503
(605) 900-7508
(716) 977-5775
(368) 239-8275
(725) 342-0650
(711) 993-5187
(882) 399-5084
(287) 755-

In [207]:
t1='Cecilia Chapman<br/>711-2880 Nulla St.<br/>Mankato Mississippi 96522<br/>(257) 563-7401'
# Phone number "(NNN) NNN-NNNN" at the very end of the text. The last group is
# four digits: the earlier "\d{3}." let any character stand in for the final
# digit. t1 is already a str, so no conversion is needed.
pdata = findall(r'\(\d{3}\) \d{3}-\d{4}$', t1)
print(pdata)

['(257) 563-7401']
