### 7.2.2 태깅 문제 고치기

In [1]:
import re

# A number: 1-3 digits, optional comma-separated groups of three digits,
# an optional fractional part, then a word boundary.
pat = r'\d{1,3}(?:,\d{3})*(?:\.\d*)?\b'
s = '12,000 monkeys on 100 typewriters for 53.12 days.'
# The pattern has no capturing groups, so the full text of each match is
# exactly what re.findall would return.
lst = [found.group() for found in re.finditer(pat, s)]
for item in lst:
    print(item)

12,000
100
53.12


In [2]:
# Compile the number pattern once so it can be reused as an object.
regex1 = re.compile(r'\d{1,3}(?:,\d{3})*(?:\.\d*)?\b')
s = '12,000 monkeys on 100 typewriters for 53.12 days.'
# Same search as re.findall(regex1, s), invoked on the compiled pattern.
lst = regex1.findall(s)
lst

['12,000', '100', '53.12']

## 7.3 탐욕적 vs 게으른 일치

In [3]:
import re

# Greedy: '.*' takes as much as it can, so the match runs from the first
# '<' to the last '>' on the line.
pat = r'<.*>'
the_line = '<h1>This is an HTML heading.</h1>'
m = re.compile(pat).match(the_line)
print(m.group(0))

<h1>This is an HTML heading.</h1>


In [4]:
import re

# Use a lazy (non-greedy) match: '.*?' stops at the first '>'.
pat = r'<.*?>'
the_line = '<h1>This is an HTML heading.</h1>'
m = re.compile(pat).match(the_line)
print(m.group(0))

<h1>


In [5]:
# Sample text: three lines of HTML containing six tags in total.
s = r'''<h1>This is the first heading.</h1>
<h1>This is the second heading.</h1>
<b>This is in bold.</b>'''

In [6]:
# The '?' after '.*' makes the quantifier lazy, so every tag is matched
# on its own.
pat = r'<.*?>'
lst = re.compile(pat, flags=re.DOTALL).findall(s)
print('There are', len(lst), 'tags.')

There are 6 tags.


In [7]:
# Greedy this time: with DOTALL, '.*' also crosses newlines, so a single
# match stretches from the first '<' to the very last '>'.
pat = r'<.*>'
lst = [hit.group() for hit in re.finditer(pat, s, flags=re.DOTALL)]
print('There are', len(lst), 'tags.')

There are 1 tags.


In [8]:
# Sample text: three sentences spread across three lines.
s = '''Here is a single sentence. Here is
another sentence, ending in a period. And
here is yet another.'''

In [9]:
# The first '?' makes '.*' lazy, so each match stops at the nearest
# sentence-ending character.
pat = r'.*?[.?!]'
lst = re.compile(pat, flags=re.DOTALL).findall(s)
print('There are', len(lst), 'sentences.')

There are 3 sentences.


## 7.4 내다보기 기능

In [10]:
# Sample text with abbreviations and a decimal number, so not every
# period marks the end of a sentence.
s = '''See the U.S.A. today. It's right here, not
a world away. Average temp. is 66.5.'''

In [11]:
import re

# A sentence starts with a capital letter and ends at the first '.', '!'
# or '?' that is followed by a space plus a capital letter, or by '$'
# (end of a line, because re.MULTILINE is set).
pat = r'[A-Z].*?[.!?](?= [A-Z]|$)'
m = re.compile(pat, flags=re.DOTALL | re.MULTILINE).findall(s)

for i in m:
    print('->', i)

-> See the U.S.A. today.
-> It's right here, not
a world away.
-> Average temp. is 66.5.


In [12]:
s = '''To be or not to be. 
That is the question.
So says the Bard.
'''

# Same pattern as before, but without re.MULTILINE '$' no longer matches
# at the end of each line, so the whole text comes back as one match.
pat = r'[A-Z].*?[.!?](?= [A-Z]|$)'
m = [hit.group() for hit in re.finditer(pat, s, flags=re.DOTALL)]

for i in m:
    print('->', i)

-> To be or not to be. 
That is the question.
So says the Bard.


In [13]:
s = "See the U.S.A. today. It's right here, not"
# With no lookahead group, '|' splits the whole pattern in two: either a
# sentence followed by a space and a capital letter (both consumed), or
# just '$'.
pat = r'[A-Z].*?[.!?] [A-Z]|$'
m = re.compile(pat, flags=re.DOTALL).findall(s)
for i in m:
    print('->', i)

-> See the U.S.A. today. I
-> 


## 7.5 다중 패턴 확인하기 (내다보기)

In [14]:
# Each lookahead checks one requirement without consuming any characters.
pat2 = r'(?=.*[a-zA-Z])'        # must contain a letter
pat3 = r'(?=.*\d)'              # must contain a digit
pat4 = r'(?=.*[!@#$%^&*+-])'    # must contain a punctuation character
# The consuming part: 8 to 12 word or punctuation characters, then '$'.
pat1 = r'(\w|[!@#$%^&*+-]){8,12}$'

# The lookaheads come first so they all start at the same position.
pat = ''.join((pat2, pat3, pat4, pat1))

In [15]:
import re

passwd = 'HenryThe5!'
# re.match anchors the check at the start of the password.
print('It passed the test!' if re.match(pat, passwd)
      else 'Insufficiently strong password.')

It passed the test!


## 7.6 부정적 내다보기

In [16]:
import re

# Negative lookahead: match 'abc' only when it is NOT followed by 'abc'.
pat = r'abc(?!abc)'
s = 'The magic of abcabc.'
m = re.compile(pat).findall(s)
print(m)

['abc']


In [17]:
# With re.I the lookahead is case-insensitive too, so the first 'abc' is
# rejected (it is followed by 'ABC') and only the second one matches.
pat = r'abc(?!abc)'
s = 'The magic of abcABC.'
m = re.compile(pat, flags=re.I).findall(s)
print(m)

['ABC']


In [18]:
# Sample text; the second line begins with a space, so the line break
# after "not" is followed by a space.
s = '''See the U.S.A. today. It's right here, not
 a world away. Average temp. is 70.5.'''

In [19]:
import re   # Add this statement only if the source file
            # does not already contain it.

# A sentence ends at the first '.', '!' or '?' that is NOT followed by a
# space plus a lowercase letter, nor directly by a word character.
pat = r'[A-Z].*?[.!?](?! [a-z]|\w)'
s = '''See the U.S.A. today. It's right here, not
 a world away. Average temp. is 70.5. It's fun!'''
m = re.compile(pat, flags=re.DOTALL).findall(s)
for i in m:
    print('->', i)

-> See the U.S.A. today.
-> It's right here, not
 a world away.
-> Average temp. is 70.5.
-> It's fun!


In [20]:
# Remove the line breaks first so each sentence prints on a single line.
s = re.compile(r'\n').sub('', s)
m = re.compile(pat, flags=re.DOTALL).findall(s)
for i in m:
    print('->', i)

-> See the U.S.A. today.
-> It's right here, not a world away.
-> Average temp. is 70.5.
-> It's fun!


## 7.7 명명 그룹

In [21]:
# Two named groups, 'first' and 'last': runs of word characters separated
# by a single space.
pat = r'(?P<first>\w+) (?P<last>\w+)'

In [22]:
import re

s = 'Jane Austen'
# Match at the start of the string using the named-group pattern.
m = re.compile(pat).match(s)

In [23]:
# A named group can also be read by indexing the match object by name.
print('first name = ', m['first'])
print('last name = ', m['last'])

first name =  Jane
last name =  Austen


In [24]:
# group() with several names returns the groups as a tuple, in that order.
print(', '.join(m.group('last', 'first')))

Austen, Jane


In [25]:
pat = r'(?P<first>\w+) (?P<mid>\w\. )?(?P<last>\w+)'

# Compiled once, at definition time, so reorg_name keeps working even if
# the shared global name `pat` is later rebound to a different pattern.
_NAME_RE = re.compile(pat)


def reorg_name(in_s):
    """Reorder a 'First [M. ]Last' name as 'Last, First[ M.]'.

    Parameters:
        in_s: string that starts with a first name, an optional middle
            initial written as a single word character plus a period,
            and a last name, each separated by one space.

    Returns:
        The reordered name, with no trailing whitespace.

    Raises:
        ValueError: if in_s does not start with a name in that form.
    """
    m = _NAME_RE.match(in_s)
    if m is None:
        raise ValueError('not a recognizable name: ' + repr(in_s))
    s = m.group('last') + ', ' + m.group('first')
    mid = m.group('mid')
    if mid:
        # The 'mid' group includes the space that follows the initial;
        # strip it so the result does not end with a stray blank.
        s += ' ' + mid.rstrip()
    return s

In [26]:
# (?P=word) is a backreference: it must repeat whatever the 'word' group
# matched; re.I lets 'The' and 'the' count as the same word.
pat = r'(?P<word>\w+) (?P=word)'
m = re.compile(pat, flags=re.I).search('The the dog.')
m.group()

'The the'

## 7.8 re.split 함수

In [27]:
# Separator: a comma followed by any number of spaces, or one or more
# spaces.
pat = r', *| +'

In [28]:
import re

# Split on the separator pattern; only the items between separators remain.
lst = re.compile(pat).split('3, 5 7 8,10, 11')
lst

['3', '5', '7', '8', '10', '11']

In [29]:
# An RPN expression whose tokens are separated by single spaces.
s = '3 2 * 2 15 * + 4 +'
toks = re.compile(pat).split(s)
toks

['3', '2', '*', '2', '15', '*', '+', '4', '+']

## 7.9 스캐너 클래스와 RPN 프로젝트

In [30]:
import re

def sc_oper(scanner, tok):
    """Scanner callback: return the operator token unchanged."""
    return tok


def sc_int(scanner, tok):
    """Scanner callback: convert the matched digits to an int."""
    return int(tok)


def sc_float(scanner, tok):
    """Scanner callback: convert the matched text to a float."""
    return float(tok)


# Each rule pairs a pattern with the callback applied to the matched text.
# The float rule is listed before the integer rule so that '6.67' is read
# as one float; whitespace has no callback and produces no token.
scanner = re.Scanner([
    (r'[*+/-]', sc_oper),
    (r'\d+\.\d*', sc_float),
    (r'\d+', sc_int),
    (r'\s+', None),
])

In [31]:
# scan() returns a pair: the list of callback results and whatever part
# of the input was left unmatched (empty here).
print(scanner.scan('3 3+'))

([3, 3, '+'], '')


In [32]:
# Integers, a float and operators mixed together; whitespace between
# tokens is optional.
print(scanner.scan('32 6.67+ 10 5- *'))

([32, 6.67, '+', 10, 5, '-', '*'], '')


## 7.10 RPN: 스캐너로 더 많은 작업 수행하기

In [33]:
import re

# Token rules for the RPN scanner: each entry pairs a pattern with the
# callback applied to the matched text.
scanner = re.Scanner ([
    (r'[*+/-]', sc_oper),       # arithmetic operator
    (r'\d+\.\d*', sc_float),    # number with a decimal point
    (r'\d+', sc_int),           # integer
    (r'\s+', None)              # whitespace: no callback, no token
    ])

In [34]:
def bin_op(tok):
    """Apply the binary operator named by tok to the top two stack entries.

    The top of the_stk is the right-hand operand and the entry beneath it
    the left-hand one. For '+', '*', '/' and '-' the result is pushed back;
    for any other token both operands stay popped and nothing is pushed.
    """
    right = the_stk.pop()
    left = the_stk.pop()
    if tok == '+':
        result = left + right
    elif tok == '*':
        result = left * right
    elif tok == '/':
        result = left / right
    elif tok == '-':
        result = left - right
    else:
        return
    the_stk.append(result)


# Instead of returning tokens, each callback acts on the stack directly:
# numbers are pushed and operators are evaluated as soon as they are read.
scanner = re.Scanner([
    (r'[*+/-]', lambda s, t: bin_op(t)),
    (r'\d+\.\d*', lambda s, t: the_stk.append(float(t))),
    (r'\d+', lambda s, t: the_stk.append(int(t))),
    (r'\s+', None),
])

In [35]:
# File scanner_rpn.py -------------------------------

import re

# Operand stack shared by the scanner callbacks and bin_op.
the_stk = []


def bin_op(tok):
    """Apply the binary operator named by tok to the top two stack entries.

    The top of the_stk is the right-hand operand and the entry beneath it
    the left-hand one. For '+', '*', '/' and '-' the result is pushed back;
    for any other token both operands stay popped and nothing is pushed.
    """
    right = the_stk.pop()
    left = the_stk.pop()
    if tok == '+':
        result = left + right
    elif tok == '*':
        result = left * right
    elif tok == '/':
        result = left / right
    elif tok == '-':
        result = left - right
    else:
        return
    the_stk.append(result)


# Each callback acts on the stack directly: numbers are pushed and
# operators are evaluated as soon as they are read.
scanner = re.Scanner([
    (r'[*+/-]', lambda s, t: bin_op(t)),
    (r'\d+\.\d*', lambda s, t: the_stk.append(float(t))),
    (r'\d+', lambda s, t: the_stk.append(int(t))),
    (r'\s+', None),
])

def main():
    """Prompt for an RPN expression, evaluate it, and print the result.

    The expression is evaluated as a side effect of scanning: the scanner
    callbacks push operands onto the_stk and apply operators to it.
    Malformed input is reported with a message instead of a traceback.
    """
    # Start from an empty stack so leftovers from an earlier run (for
    # example one that stopped at unrecognized input) cannot leak in.
    the_stk.clear()
    input_str = input('Enter RPN string: ')
    try:
        _, unknown = scanner.scan(input_str)
    except IndexError:
        # An operator tried to pop an operand from an empty stack.
        print('Malformed expression: an operator is missing an operand.')
        return
    except ZeroDivisionError:
        print('Malformed expression: division by zero.')
        return
    if unknown:
        print('Unrecognized input:', unknown)
    elif not the_stk:
        # Nothing but whitespace was entered, so there is no result to pop.
        print('No expression entered.')
    else:
        print('Answer is', the_stk.pop())


# Run only when executed as a script (or notebook cell), not on import.
if __name__ == '__main__':
    main()

Enter RPN string: 32 6.67+ 10 5- *
Answer is 193.35000000000002


In [36]:
# File scanner_rpn2.py ------------------------------

import re
import operator

# Operand stack shared by the scanner callbacks and bin_op.
the_stk = []


def bin_op(oper):
    """Pop two operands, combine them with oper, and push the result.

    The top of the_stk is passed as the second argument of oper and the
    entry beneath it as the first.
    """
    right = the_stk.pop()
    left = the_stk.pop()
    the_stk.append(oper(left, right))


# One rule per operator, each handing the matching function from the
# operator module to bin_op; numbers are pushed straight onto the stack.
scanner = re.Scanner([
    (r'[+]', lambda s, t: bin_op(operator.add)),
    (r'[*]', lambda s, t: bin_op(operator.mul)),
    (r'[-]', lambda s, t: bin_op(operator.sub)),
    (r'[/]', lambda s, t: bin_op(operator.truediv)),
    (r'\d+\.\d*', lambda s, t: the_stk.append(float(t))),
    (r'\d+', lambda s, t: the_stk.append(int(t))),
    (r'\s+', None),
])

def main():
    """Prompt for an RPN expression, evaluate it, and print the result.

    The expression is evaluated as a side effect of scanning: the scanner
    callbacks push operands onto the_stk and apply operators to it.
    Malformed input is reported with a message instead of a traceback.
    """
    # Start from an empty stack so leftovers from an earlier run (for
    # example one that stopped at unrecognized input) cannot leak in.
    the_stk.clear()
    input_str = input('Enter RPN string: ')
    try:
        _, unknown = scanner.scan(input_str)
    except IndexError:
        # An operator tried to pop an operand from an empty stack.
        print('Malformed expression: an operator is missing an operand.')
        return
    except ZeroDivisionError:
        print('Malformed expression: division by zero.')
        return
    if unknown:
        print('Unrecognized input:', unknown)
    elif not the_stk:
        # Nothing but whitespace was entered, so there is no result to pop.
        print('No expression entered.')
    else:
        print('Answer is', the_stk.pop())


# Run only when executed as a script (or notebook cell), not on import.
if __name__ == '__main__':
    main()

Enter RPN string: 32 6.67+ 10 5- *
Answer is 193.35000000000002
