## Regular Expressions

In [1]:
import re

In [3]:

pattern = r'\d\d\d-\d\d\d-\d\d\d\d'

s = input('Enter tel. number: ')
if re.match(pattern, s):
    print('Number accepted.')
else:
    print('Incorrect format.')


Enter tel. number: 111-222-3333
Number accepted.


In [4]:


pattern = r'\d\d\d[ -]\d\d\d[ -]\d\d\d\d$'

s = input('Enter tel. number: ')
if re.match(pattern, s):
   print('Number accepted.')
else:
   print('Incorrect format.')

Enter tel. number: 111-222-3333
Number accepted.


It must match an end-of-string, $. This means there cannot be any more input in the target string after these last four digits are matched.<br>

the use of **fullmatch** makes the end-of-string character unnecessary

In [7]:
pattern = r'\d\d\d[ -]\d\d\d[ -]\d\d\d\d'

s = input('Enter tel. number: ')
if re.fullmatch(pattern, s):
   print('Number accepted.')
else:
   print('Incorrect format.')

Enter tel. number: 111-222-3333
Number accepted.


COMPILING VERSUS RUNNING<br>

The processing of regular expressions takes two major steps.<br>

A regular expression pattern is analyzed and then compiled into a series of data structures collectively called a state machine.<br>

The actual process of matching is considered “run time” for the regular-expression evaluator, as opposed to “compile time.” During run time, the program traverses the state machine as it looks for a match.<br>

In [9]:
# compile function

reg1 = re.compile(r'ca*b$')  # Compile the pattern!

def test_item(s):
    if re.match(reg1, s):
        print(s, 'is a match.')
    else:
        print(s, 'is not a match!')
        

In [10]:
test_item('caab')
test_item('caaxxb')

caab is a match.
caaxxb is not a match!


In [13]:
# IGNORECASE Uppercase


if re.match('m*ack', 'Mack the Knife', re.IGNORECASE):
    print ('Success.')

Success.


In [14]:
# same
if re.match('m*ack', 'Mack the Knife', re.I):
    print ('Success.')
    

Success.


In [16]:
# summarize searching, matching, compiling,

if re.match('m*ack', 'Mack the Knife', re.I | re.DEBUG):
    print ('Success.')

MAX_REPEAT 0 MAXREPEAT
  LITERAL 109
LITERAL 97
LITERAL 99
LITERAL 107

 0. INFO 4 0b0 3 MAXREPEAT (to 5)
 5: REPEAT_ONE 6 0 MAXREPEAT (to 12)
 9.   LITERAL_UNI_IGNORE 0x6d ('m')
11.   SUCCESS
12: LITERAL_UNI_IGNORE 0x61 ('a')
14. LITERAL_UNI_IGNORE 0x63 ('c')
16. LITERAL_UNI_IGNORE 0x6b ('k')
18. SUCCESS
Success.


### Meta Characters

In [17]:

if re.match(r'[+*^/-]', '^'):
    print('Success!')

Success!


### Backtracking, Greedy, and Non-Greedy

In [23]:
pat = r'c.*t'
if re.match(pat, 'cat'):
    print('Success!')

Success!


### USING THE MATCH OBJECT

In [25]:

pat = r'(a+)(b+)(c+)'

m = re.match(pat, 'abbcccee')

print(m.group(0)) # entire text matched
print(m.group(1)) # 1st match
print(m.group(2)) # 2nd match
print(m.group(3)) # 3rd match

abbccc
a
bb
ccc


### SEARCHING A STRING FOR PATTERNS

In [26]:

m = re.search(r'\d{2,}', '1 set of 23 owls, 999 doves.')
print('"', m.group(), '" found at ', m.span(), sep='')

"23" found at (9, 11)


### ITERATIVE SEARCHING (“FINDALL”)

In [27]:

s = '1 set of 23 owls, 999 doves.'
print(re.findall(r'\d+', s))


['1', '23', '999']


In [28]:

s = 'What is 1,000.5 times 3 times 2,000?'
print(re.findall(r'\d[0-9,.]*', s))


['1,000.5', '3', '2,000']


In [29]:

s = 'I do not use sophisticated, multisyllabic words!'
print(re.findall(r'\w{6,}', s))


['sophisticated', 'multisyllabic']


In [30]:

s = '12 15+3 100-*'
print(re.findall(r'[+*/-]|\w+', s))


['12', '15', '+', '3', '100', '-', '*']


In [31]:

pat = r'\d{1,3}(,\d{3})*(\.\d*)?'
print(re.findall(pat, '12,000 monkeys and 55.5 cats.'))


[(',000', ''), ('', '.5')]


#### carefull while using iter and group

In [32]:

pat = r'(\d{1,3}(,\d{3})*(\.\d*)?)'
lst = re.findall(pat, '12,000 monkeys on 55.5 cats.')
for item in lst:
    print(item[0])

12,000
55.5


### SEARCHING FOR REPEATED PATTERNS

In [38]:

s = 'The cow jumped over the the moon.'
m = re.search(r'(\w+) \1', s)
print(m.group(), '...found at', m.span())


the the ...found at (20, 27)


In [51]:

s = 'The The United States of of America'
m = re.search(r'(\w+) \1', s)
print(m.group(), '...found at', m.span()) # first match


The The ...found at (0, 7)


In [52]:

s = 'The the cow jumped over the the moon '

m = re.search(r'(\w+) \1', s, flags=re.I)
print(m.group(), '...found at', m.span()) # re.search function reports the first successful match that was found.


The the ...found at (0, 7)


### REPLACING TEXT

In [53]:

s = 'Get me a new dog to befriend my dog.'
s2 = re.sub('dog', 'cat', s)
print(s2)


Get me a new cat to befriend my cat.


In [56]:
# remove duplicates

s = 'The the cow jumped over over the moon moon'
s2 = re.sub(r'(\w+) \1', r'\1', s, flags=re.I)
print(s2)


The cow jumped over the moon


### Fixing the Tagging Problem

In [58]:

pat = r'\d{1,3}(?:,\d{3})*(?:\.\d*)?\b'
s = '12,000 monkeys on 100 typewriters for 53.12 days.'
lst = re.findall(pat, s)
for item in lst:
    print(item)
    

12,000
100
53.12


### GREEDY VERSUS NON-GREEDY MATCHING

In [60]:

pat = r'<.*>'
the_line = '<h1>This is an HTML heading.</h1>'
m = re.match(pat, the_line)
print(m.group()) # expected output is <h1>. ERROR


<h1>This is an HTML heading.</h1>


Greedy ( matches first and last strings ): <.*> <br>
Non Greedy ( stops matches when the last character matches) : <.*?> <br>

for examples: <br>
expr??   # Non-greedy zero-or-one matching<br>
expr*?   # Non-greedy zero-or-more matching<br>
expr+?   # Non-greedy one-or-more matching<br>

In [61]:
# non greedy

pat = r'<.*?>'     # Use NON-GREEDY matching!
the_line = '<h1>This is an HTML heading.</h1>'
m = re.match(pat, the_line)
print(m.group())

<h1>


In [62]:

s = r'''<h1>This is the first heading.</h1>
<h1>This is the second heading.</h1>
<b>This is in bold.</b>'''


In [64]:

pat = r'<.*?>'       # Notice use of NON-GREEDy  because of the ?.
lst = re.findall(pat, s, flags=re.DOTALL)
print('There are', len(lst), 'tags.')


There are 6 tags.


In [66]:
pat = r'<.*>'     # Notice use of GREEDY here!
lst = re.findall(pat, s, flags=re.DOTALL)
print('There are', len(lst), 'tags.') # this is  not a correct one !

There are 1 tags.


In [67]:

s = '''Here is a single sentence. Here is
 another sentence, ending in a period. And
 here is yet another.'''



In [73]:
# [chars]


pat = r'.*?[.?!]'  # Notice use of NON-GREEDY because of the first "?".

lst = re.findall(pat, s, flags=re.DOTALL)
print('There are', len(lst), 'sentences.')

There are 3 sentences.


### THE LOOK-AHEAD FEATURE

In [74]:
s = '''See the U.S.A. today. It's right here, not
 a world away. Average temp. is 66.5.'''

In [75]:

pat = r'[A-Z].*?[.!?](?= [A-Z]|$)'
m = re.findall(pat, s, flags=re.DOTALL | re.MULTILINE)


In [76]:
for i in m:
    print('->', i)

-> See the U.S.A. today.
-> It's right here, not
 a world away.
-> Average temp. is 66.5.


### CHECKING MULTIPLE PATTERNS (LOOK-AHEAD)

In [77]:

pat1 = r'(\w|[!@#$%^&*+-]){8,12}$'
pat2 = r'(?=.*[a-zA-Z])'      # Must include a letter.
pat3 = r'(?=.*\d)'            # Must include a digit.
pat4 = r'(?=.*[!@#$%^&*+-])'  # Must include punc. char.

pat = pat2 + pat3 + pat4 + pat1


In [84]:
passwd = 'harsaa3#'

if re.match(pat, passwd):
    print('It passed the test!')
else:
    print('Insufficiently strong password.')

It passed the test!


### NEGATIVE 

In [86]:

pat = r'abc(?!abc)'
s = 'The magic of abcabc.'
m = re.findall(pat, s)
print(m)


['abc']


In [87]:

pat = r'abc(?!abc)'
s = 'The magic of abcABC.'
m = re.findall(pat, s, flags=re.I)
print(m)



['ABC']


### NAMED GROUPS<br>
(?P<name>expr)  # Tags the matching group, using name.<br>
(?P=name)       # Attempt to match repeat of named group.

In [92]:
pat = r'(?P<first>\w+) (?P<last>\w+)'



In [89]:
s = 'Jane Austen'
m = re.match(pat, s)

In [90]:
print('first name = ', m.group('first'))
print('last name = ', m.group('last'))

first name =  Jane
last name =  Austen


In [91]:
print(m.group('last') + ', ' + m.group('first'))

Austen, Jane


In [93]:

pat = r'(?P<first>\w+) (?P<mid>\w\. )?(?P<last>\w+)'

def reorg_name(in_s):
    m = re.match(pat, in_s)
    s = m.group('last') + ', ' + m.group('first')
    if m.group('mid'):
        s += ' ' + m.group('mid')
    return s

In [95]:
reorg_name( "Jane Luck Austen")

'Luck, Jane'

### THE “RE.SPLIT” FUNCTION

In [98]:

pat = r', *| +'

lst = re.split(pat, '3, 5  7 8,10, 11')

lst

['3', '5', '7', '8', '10', '11']