## 正則表達式的貪婪與非貪婪模式 



In [5]:
import re

# Greedy vs. non-greedy (lazy) quantifiers applied to the same input:
#   'go*'   - `*` is greedy and consumes every following 'o'  -> 'gooooooo'
#   'go*?'  - `*?` is lazy and consumes as few 'o' as possible -> 'g'
_subject = 'goooooood'
for _quantified in ('go*', 'go*?'):
    print(re.search(_quantified, _subject).group())



gooooooo
g


'a_b*c de'

## 非貪婪模式的常見用法：.*? 的用法介紹 

In [6]:
import re

# `.*?` is the lazy form of `.*`: it grows one character at a time and stops as
# soon as the rest of the pattern (the literal 'e') can match, so the result
# ends at the FIRST 'e' in the string rather than the last one.
# Left as a bare expression so a notebook displays its value: 'a_b*c de'.
re.compile('.*?e').search('a_b*c defg').group()

'a_b*c de'

##  match 函數用法 

In [7]:
import re

text = 'https://matters.news/@CHWang'
text1 = 'Matters.news'

# re.match() only succeeds when the pattern matches at the very start of the
# string; span() reports the (start, end) indices of that match.
_scheme = re.match('https', text)
print(_scheme)
print(_scheme.span())

# 'matters' does occur in `text`, but not at index 0, so match() returns None.
print(re.match('matters', text))

# Matching is case-sensitive by default: 'matters' does not match 'Matters' ...
print(re.match('matters', text1))
# ... unless the IGNORECASE flag (short name: re.I) is supplied.
print(re.match('matters', text1, re.IGNORECASE))

<re.Match object; span=(0, 5), match='https'>
(0, 5)
None
None
<re.Match object; span=(0, 7), match='Matters'>


In [13]:
import re

text = 'Jack lives in HsinChu and he is 25 years old, but ...'

# Three capture groups: the name, the city (letters only) and the age (digits).
# The trailing `.*` lets the overall match run to the end of the sentence.
match_result = re.compile(
    r'(.*) lives in ([a-z]*) and he is (\d+).*', re.IGNORECASE
).match(text)

# group(0) is the whole match; group(1)..group(3) are the captured pieces.
for _index in range(4):
    print(match_result.group(_index))

# groups() returns all captured groups at once, as a tuple.
_captured = match_result.groups()
print(type(_captured))
print(_captured)

Jack lives in HsinChu and he is 25 years old, but ...
Jack
HsinChu
25
<class 'tuple'>
('Jack', 'HsinChu', '25')


## search 函數用法 

In [39]:
import re

text = 'https://medium.com/@chwang12341'
text1 = 'Medium.Com'

# Unlike match(), search() scans the whole string and reports the first place
# where the pattern matches, wherever that is.
for _needle in ('https://', 'dium'):
    print(re.search(_needle, text))
print(re.search('medium', text).span())

# Case-sensitive by default: lowercase 'co' is not found in 'Medium.Com' ...
print(re.search('co', text1))
# ... but with IGNORECASE (re.I) it matches the 'Co' of 'Com'.
print(re.search('co', text1, re.IGNORECASE).span())

<re.Match object; span=(0, 8), match='https://'>
<re.Match object; span=(10, 14), match='dium'>
(8, 14)
None
(7, 9)


In [49]:
import re

text = 'Jen likes to eat cake and drink coke, but ...'

# The pattern is a raw string: in a normal string literal '\w' is an invalid
# escape sequence, which Python reports as a DeprecationWarning (SyntaxWarning
# from 3.12 on). Regex patterns should always be written as r'...'.
# Groups: 1 = name, 2 = food (word characters), 3 = drink (letters only).
# re.I makes [a-z] case-insensitive; re.M is kept from the original but has no
# effect here because the pattern contains no ^ or $ anchors.
match_result = re.search(
    r'(.*) likes to eat (\w+) and drink ([a-z]*)', text, re.I | re.M
)

# group() / group(0) is the whole matched text; search() stops where the
# pattern ends, so the trailing ", but ..." is not part of the match.
print(match_result.group())
print(match_result.group(1))
print(match_result.group(2))
print(match_result.group(3))

# All captured groups as a tuple.
print(match_result.groups())

Jen likes to eat cake and drink coke
Jen
cake
coke
('Jen', 'cake', 'coke')


## findall 函數用法

In [57]:
import re

# One or more ASCII letters, matched case-insensitively.
find_pattern = re.compile(r'[a-z]+', re.IGNORECASE)

# findall() returns every non-overlapping match as a list of strings.
match_result1 = find_pattern.findall('good 66 day Tom_28 Yep')
# The optional pos / endpos arguments limit the scan to indices 6..19 of the
# string (the part reading 'MMorning66 Jen').
match_result2 = find_pattern.findall('good98MMorning66 Jen666 Yeah', 6, 20)

for _found in (match_result1, match_result2):
    print(_found)

['good', 'day', 'Tom', 'Yep']
['MMorning', 'Jen']


## sub 函數用法 



In [70]:
import re

text = 'Jack/25/1993 and Jen/23/1995'

# Replace the word "and" together with the whitespace around it by '&'.
# Raw string: in a normal string literal '\s' is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning from Python 3.12 on).
sub_result1 = re.sub(r'\sand\s', '&', text)
print(sub_result1)

# Case 1: additionally remove every '/'.
sub_result2 = re.sub('/', '', sub_result1)
print(sub_result2)

# Case 2: additionally remove '/', but only the first two occurrences.
# `count` is passed by keyword: passing it positionally is deprecated since
# Python 3.13 and is easy to confuse with the `flags` argument.
sub_result3 = re.sub('/', '', sub_result1, count=2)
print(sub_result3)

Jack/25/1993&Jen/23/1995
Jack251993&Jen231995
Jack251993&Jen/23/1995


In [110]:
import re

text = 'Jack66Jen58Ken28,Cathy38'


def square(match_result):
    """Return the square of the number captured in the ``number`` group.

    Written as a replacement callback for ``re.sub``: it is called once per
    match and its return value is substituted for the matched text.

    Args:
        match_result: A match object whose pattern defines a named group
            ``number`` containing decimal digits.

    Returns:
        The squared value converted to ``str``.

    Raises:
        ValueError: If the ``number`` group is not a valid integer literal.
    """
    num = int(match_result.group('number'))
    return str(num ** 2)


# (?P<number>...) gives the captured digits the name "number" so the callback
# can fetch them with group('number').
# Raw string: in a normal string literal '\d' is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning from Python 3.12 on).
final_result = re.sub(r'(?P<number>\d+)', square, text)

print(final_result)

Jack4356Jen3364Ken784,Cathy1444


## compile 函數用法

In [158]:
import re

text = '68Jack66Jen58Ken28,Cathy38'

# One or more letters, case-insensitive, captured as group 1.
pattern = re.compile(r'([a-z]+)', re.IGNORECASE)

# Pattern.match() starts at index 0 by default. `text` begins with digits, so
# nothing matches there and the result is None.
compile_result1 = pattern.match(text)
print(compile_result1)

# With pos=2 and endpos=20 matching starts at index 2 (the third character)
# and may not go past index 20.
compile_result2 = pattern.match(text, 2, 20)
print(compile_result2)

# group(0) is the matched text; start(0), end(0) and span() give its position.
for _detail in (
    compile_result2.group(0),
    compile_result2.start(0),
    compile_result2.end(0),
    compile_result2.span(),
):
    print(_detail)

None
<re.Match object; span=(2, 6), match='Jack'>
Jack
2
6
(2, 6)


## finditer 函數用法 

In [163]:
import re

# finditer() is the lazy counterpart of findall(): it returns an iterator that
# yields one match object per non-overlapping match, in order of appearance.
match_result = re.finditer(
    r'[a-z]+', '68Jack66Jen58Ken28,Cathy38', re.IGNORECASE
)

# Each item is a full match object; group(0) is the matched text.
for name in match_result:
    print(name.group(0))

Jack
Jen
Ken
Cathy


## split 函數用法 

In [182]:
# The import was commented out, so this cell only worked if an earlier cell had
# already imported `re`; restored so the cell runs on its own.
import re

text = 'Jack66Jen58Ken28Cathy'

# All patterns below are raw strings: in a normal string literal '\d' and '\s'
# are invalid escape sequences (DeprecationWarning, SyntaxWarning from 3.12 on).

# Split on runs of digits.
print(re.split(r'\d+', text))

# Wrapping the separator in a capture group also returns the separators
# themselves as items of the result list.
print(re.split(r'(\d+)', text))

# When a separator sits at the very start or end of the string, an empty
# string is returned for that side.
text1 = '66Jack66Jen58Ken28Cathy38'
print(re.split(r'\d+', text1))

# When the separator never matches, the whole string comes back as the only
# element of the list.
print(re.split(r'\s+', text1))

['Jack', 'Jen', 'Ken', 'Cathy']
['Jack', '66', 'Jen', '58', 'Ken', '28', 'Cathy']
['', 'Jack', 'Jen', 'Ken', 'Cathy', '']
['66Jack66Jen58Ken28Cathy38']


## 重要筆記：匹配時將我們需要爬取的數據，用（）來包住它的匹配規則，才會被獨立出來放入串列 

In [197]:
import re

text = '6658|Example_format|2020'

# When a pattern has capturing groups, findall() returns only what the groups
# captured (one tuple per match). The middle part is wrapped in a non-capturing
# group (?:...), so it has to match but is left out of the result.
_fields = re.compile(r'(\d+)(?:\WExample_format\W)(\d+)')
print(_fields.findall(text))

[('6658', '2020')]
