# Python 正则表达式 RegEx

# https://note.artchiu.org/2015/01/23/%e6%af%94%e8%bc%83%e8%a9%b3%e7%b4%b0-python-%e6%ad%a3%e5%89%87%e8%a1%a8%e9%81%94%e5%bc%8f%e6%93%8d%e4%bd%9c%e6%8c%87%e5%8d%97-re%e4%bd%bf%e7%94%a8/

## 导入模块

In [1]:
import re

## 简单 Python 匹配

In [3]:
# Plain substring test with the `in` operator -- no regular expression needed.
string = "dog runs to cat"
pattern1, pattern2 = "cat", "bird"
for needle in (pattern1, pattern2):
    # True when `needle` occurs anywhere inside `string`.
    print(needle in string)

True
False


## 用正则寻找配对

In [4]:
# The same lookup with re.search(): it returns a Match object for the first
# occurrence, or None when the pattern does not occur.
string = "dog runs to cat"
pattern1, pattern2 = "cat", "bird"
for candidate in (pattern1, pattern2):
    print(re.search(candidate, string))

<re.Match object; span=(12, 15), match='cat'>
None


## 匹配多种可能 使用 []

In [5]:
# A character set [au] accepts either letter, so this matches "ran" or "run".
ptn = r"r[au]n"
first_hit = re.search(ptn, "dog runs to cat")
print(first_hit)

<re.Match object; span=(4, 7), match='run'>


## 匹配更多种可能

In [6]:
# Ranges inside [] widen the set of characters accepted between "r" and "n".
range_demos = (
    (r"r[A-Z]n", "dog runs to cat"),     # upper-case letters only: no match
    (r"r[a-z]n", "dog runs to cat"),     # lower-case letters
    (r"r[0-9]n", "dog r2ns to cat"),     # decimal digits
    (r"r[0-9a-z]n", "dog runs to cat"),  # digits or lower-case letters
)
for regex, subject in range_demos:
    print(re.search(regex, subject))

None
<re.Match object; span=(4, 7), match='run'>
<re.Match object; span=(4, 7), match='r2n'>
<re.Match object; span=(4, 7), match='run'>


## 特殊种类匹配

### 数字

In [7]:
sample = "run r4n"
# \d accepts any decimal digit, so the first hit is "r4n".
digit_hit = re.search(r"r\dn", sample)
print(digit_hit)
# \D is the complement of \d (any character that is not a digit): "run".
non_digit_hit = re.search(r"r\Dn", sample)
print(non_digit_hit)


<re.Match object; span=(4, 7), match='r4n'>
<re.Match object; span=(0, 3), match='run'>


### 空白

In [8]:
sample = "r\nn r4n"
# \s accepts one whitespace character (e.g. space, \t, \n, \r, \f, \v); here
# it is the newline, so the printed text spans two lines.
whitespace_hit = re.search(r"r\sn", sample)
print(whitespace_hit.group())
# \S is the complement of \s: any single non-whitespace character.
print(re.search(r"r\Sn", sample))


r
n
<re.Match object; span=(4, 7), match='r4n'>


### 所有字母数字和"_"

In [9]:
sample = "r\nn r4n"
# \w accepts a word character (letters, digits and "_"), so "r4n" is found.
print(re.search(r"r\wn", sample))
# \W is the complement of \w; the newline inside "r\nn" qualifies.
print(re.search(r"r\Wn", sample))


<re.Match object; span=(4, 7), match='r4n'>
<re.Match object; span=(0, 3), match='r\nn'>


### 单词边界 `\b` 与非边界 `\B`

In [10]:
# \b is a zero-width assertion that holds only at the start or end of a word.
whole_word = re.search(r"\bruns\b", "dog runs to cat")
print(whole_word.group())
# \B is its complement: a zero-width position that is NOT a word boundary.
# Each space matched here sits next to another space, so both \B checks hold.
not_boundary = re.search(r"\B runs \B", "dog   runs  to cat")
print(not_boundary)


runs
<re.Match object; span=(5, 11), match=' runs '>


### 特殊字符 任意字符

In [11]:
# \\ : inside a regex, an escaped backslash matches one literal backslash.
# The subject string escapes its backslash as well ("\\"); a bare "\ " is an
# invalid escape sequence that newer Python versions warn about.
backslash_hit = re.search(r"runs\\", "runs\\ to me")
print(backslash_hit)
# . : matches any single character except a newline.
any_char_hit = re.search(r"r.n", "r[ns to me")
print(any_char_hit)


<re.Match object; span=(0, 5), match='runs\\'>
<re.Match object; span=(0, 3), match='r[n'>


### 句尾句首

In [12]:
sentence = "dog runs to cat"
# ^ anchors the pattern to the beginning of the text.
starts_with_dog = re.search(r"^dog", sentence)
print(starts_with_dog)
# $ anchors the pattern to the end of the text.
ends_with_cat = re.search(r"cat$", sentence)
print(ends_with_cat)


<re.Match object; span=(0, 3), match='dog'>
<re.Match object; span=(12, 15), match='cat'>


### 是否

In [13]:
# ? makes the preceding group optional (zero or one occurrence).
weekday = re.compile(r"Mon(day)?")
for word in ("Monday", "Mon"):
    print(weekday.search(word))

<re.Match object; span=(0, 6), match='Monday'>
<re.Match object; span=(0, 3), match='Mon'>


## 多行匹配

In [29]:
# By default ^ matches only at the very start of the text; with re.M
# (re.MULTILINE) it also matches right after every newline.
string = """
dog runs to cat.
I run to dog.
"""
for mode in (0, re.M):
    print(re.search(r"^I", string, flags=mode))

None
<re.Match object; span=(18, 19), match='I'>


# 邊界（用來找獨立的一個詞，而不是包含在其它詞中的字母）

In [80]:
# \b on both sides keeps only stand-alone occurrences of "class"; the copies
# embedded in "class123" and "123class" are skipped.
p = re.compile(r'\bclass\b')
standalone = p.findall('class class class123 123class at class')
print(standalone)

['class', 'class', 'class']


# 非邊界（用來找非獨立的一個詞，包含在其它詞中的字母）

In [41]:
# \B on both sides requires "class" to sit strictly inside a longer word,
# which only "123class123" satisfies in this text.
p = re.compile(r'\Bclass\B')
embedded = p.search('no class class123 123class123 at all')
print(embedded)

<re.Match object; span=(21, 26), match='class'>


## 0或多次

In [15]:
# * : the preceding item may repeat zero or more times.
zero_or_more = re.compile(r"ab*")
for text in ("a", "abbbbb"):
    print(zero_or_more.search(text))


<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 6), match='abbbbb'>


## 1或多次

In [16]:
# + : the preceding item must occur at least once, and may repeat.
one_or_more = re.compile(r"ab+")
plus_results = [one_or_more.search(text) for text in ("a", "abbbbb")]
for outcome in plus_results:
    print(outcome)


None
<re.Match object; span=(0, 6), match='abbbbb'>


## 可选次数

In [17]:
# {n,m} : the preceding item must repeat between n and m times (written
# without a space after the comma).
bounded = re.compile(r"ab{2,10}")
for text in ("a", "abbbbb"):
    print(bounded.search(text))


None
<re.Match object; span=(0, 6), match='abbbbb'>


## group 组

In [18]:
# Parentheses capture sub-matches: group(0) is the whole match, group(1) and
# group(2) are the first and second parenthesised parts.
match = re.search(r"(\d+), Date: (.+)", "ID: 021523, Date: Feb/12/2017")
for index in range(3):
    print(match.group(index))

021523, Date: Feb/12/2017
021523
Feb/12/2017


In [19]:
# (?P<name>...) gives a group a name, so it can be fetched by that name.
match = re.search(r"(?P<id>\d+), Date: (?P<date>.+)", "ID: 021523, Date: Feb/12/2017")
for group_name in ("id", "date"):
    print(match.group(group_name))

021523
Feb/12/2017


## 寻找所有匹配 

In [20]:
# findall() returns every non-overlapping match as a list of strings.
all_hits = re.findall(r"r[ua]n", "run ran ren")
print(all_hits)

['run', 'ran']


In [21]:
# | separates alternatives; with a single capture group, findall() returns
# the text captured by that group for each match.
alternatives = re.findall(r"(run|ran)", "run ran ren")
print(alternatives)

['run', 'ran']


## 替换

In [3]:
# sub() swaps every match of the pattern for the replacement text.
run_or_ran = re.compile(r"r[au]ns")
run_or_ran.sub("1", "dog runs to cat rans")

'dog 1 to cat 1'

## 分裂

In [10]:
# Method 1: re.split() cuts the text at pattern matches; maxsplit=1 stops
# after the first cut and leaves the remainder intact.
pieces = re.split(r"/", "a/wwt/wte", maxsplit=1)
print(pieces)


['a', 'wwt/wte']


In [12]:
# Method 2: plain str.rsplit() needs no regex; it splits from the right, and
# maxsplit=1 performs a single cut.
string = 'a b c d'
string.rsplit(sep=' ', maxsplit=1)

['a b c', 'd']

## compile

In [24]:
# re.compile() builds a reusable pattern object whose methods mirror the
# module-level functions.
compiled_re = re.compile(r"r[ua]n")
hit = compiled_re.search("dog ran to cat")
print(hit)

<re.Match object; span=(4, 7), match='ran'>


# 大小寫

In [4]:
# re.I (re.IGNORECASE) makes matching case-insensitive, both for a compiled
# pattern and for a direct re.search() call.
compiled_re = re.compile(r"r[ua]n", flags=re.IGNORECASE)
print(compiled_re.search("dog rAn to cat"))

mixed_case = re.search("idea", "IdeA", flags=re.IGNORECASE)
print(mixed_case.group())

<re.Match object; span=(4, 7), match='rAn'>
IdeA


# 逐行搜尋

In [16]:
# re.M (re.MULTILINE) only changes where ^ and $ may match; this pattern has
# no anchors, so the flag does not influence the result here.
re.search("a", "wkofsw \n sfda", flags=re.MULTILINE)

<re.Match object; span=(12, 13), match='a'>

# 使 "." 特殊字符完全匹配任何字符，包括換行；沒有這個標誌， "." 匹配除了換行外的任何字符。

In [19]:
# With re.S (re.DOTALL) "." also matches newlines, so ".*" spans the whole
# text. Note that "\b" in this ordinary (non-raw) string is a backspace.
re.search(".*", "wkofsw \n \b sfda", flags=re.DOTALL)

<re.Match object; span=(0, 15), match='wkofsw \n \x08 sfda'>

# 加入註解

In [28]:
# re.VERBOSE lets a pattern span several lines: unescaped whitespace is
# ignored and "#" starts a comment that runs to the end of the line.
# The inline comments below describe this exact pattern.
charref = re.compile(r"""
123          # the literal prefix 123 ...
[0-9]+       # ... followed by one or more decimal digits
|456         # or, as the alternative, the literal 456
""", re.VERBOSE)
charref.search('456')

<re.Match object; span=(0, 3), match='456'>

# 特定詞之後或之前

In [49]:
# (?<=\s) is a positive lookbehind: match letters that are preceded by
# whitespace. The pattern is a raw string so "\s" reaches the regex engine
# unchanged instead of being an invalid string escape.
re.search(r"(?<=\s)[a-z]+", "roger federer")

<re.Match object; span=(6, 13), match='federer'>

In [50]:
# (?=\s) is a positive lookahead: match letters that are followed by
# whitespace. The pattern is a raw string so "\s" reaches the regex engine
# unchanged instead of being an invalid string escape.
re.search(r"[a-z]+(?=\s)", "roger federer")

<re.Match object; span=(0, 5), match='roger'>

# 不在特定詞之後或之前

In [88]:
# (?!\s) is a negative lookahead: the match must NOT be followed by
# whitespace. Because [a-z]+ can give back characters, "roger" shrinks to
# "roge" (followed by "r", not by the space). Raw string keeps "\s" intact.
re.findall(r"[a-z]+(?!\s)", "roger federer")

['roge', 'federer']

In [83]:
# (?<!\s) is a negative lookbehind: the match must NOT be preceded by
# whitespace. "f" follows the space and is skipped, so the second match
# starts one letter later ("ederer"). Raw string keeps "\s" intact.
re.findall(r"(?<!\s)[a-z]+", "roger federer")

['roger', 'ederer']

# URL

In [52]:
tt = 'http://zh.wikipedia.org/wiki/%E6%B3%A2%E7%89%B9%E7%8E%87'

# Deliberately loose URL matcher: a scheme made of ASCII letters, "://", then
# everything up to the next whitespace character.
# The scheme class is [a-zA-Z]; the range "A-z" would also admit the
# punctuation characters [ \ ] ^ _ ` that lie between "Z" and "a".
url_pattern = re.compile(r'[a-zA-Z]+://\S*')
url_pattern.search(tt).group()

'http://zh.wikipedia.org/wiki/%E6%B3%A2%E7%89%B9%E7%8E%87'

In [28]:
tt = '中的'
# Does the whole string consist of exactly three characters in the range
# U+2E80..U+9FFF? `tt` has only two, so the result is None.
# The pattern is a raw string: the re module interprets \uXXXX itself, and
# fullmatch() replaces the explicit ^ ... \Z anchors.
re.fullmatch(r'[\u2E80-\u9FFF]{3}', tt)

In [29]:
# Accept strings made of two or three characters in U+2E80..U+9FFF.
# One {2,3} quantifier replaces the two separate matches joined by `or`,
# where `is not None` applied only to the second call. The raw string lets
# the re module interpret \uXXXX itself.
if re.fullmatch(r'[\u2E80-\u9FFF]{2,3}', tt):
    print('yes')

yes


# compile

In [None]:
# Compile the pattern once, then call search() on the resulting object.
compiled_re = re.compile(r"r[ua]n")
result = compiled_re.search("dog ran to cat")
print(result)