### 基础匹配

In [1]:
import re

# Simple search: re.search scans the whole string and returns the first hit.
text = "Hello World"
result = re.search(r"World", text)
if result:
    print("找到:", result.group())  # World

# re.match only succeeds if the pattern matches at the START of the string
# (searching for "World" with re.match here would return None).
match = re.match(r"Hello", text)
if match:
    print("匹配成功")  # 匹配成功

找到: World
匹配成功


### 字符匹配

In [2]:
import re

# . matches any single character except a newline
print(re.findall(r"h.", "hi hello hey"))  # ['hi', 'he', 'he']

# [...] matches any one character from the set
print(re.findall(r"[aeiou]", "hello world"))  # ['e', 'o', 'o']

# [^...] matches any one character NOT in the set
print(re.findall(r"[^aeiou]", "hello"))  # ['h', 'l', 'l']

# \d matches a digit; \D matches a non-digit
print(re.findall(r"\d", "2024年1月15日"))  # ['2', '0', '2', '4', '1', '1', '5']

# \w matches a word character (letters, digits, underscore); \W is the opposite
print(re.findall(r"\w+", "Hello_World 2024!"))  # ['Hello_World', '2024']

# \s matches whitespace; \S matches non-whitespace
print(re.findall(r"\s+", "a b  c   d"))  # [' ', '  ', '   ']

['hi', 'he', 'he']
['e', 'o', 'o']
['h', 'l', 'l']
['2', '0', '2', '4', '1', '1', '5']
['Hello_World', '2024']
[' ', '  ', '   ']


### 数量限定

In [3]:
import re

text = "goooood goood good god"

# * zero or more repetitions of the preceding item
print(re.findall(r"go*d", text))  # ['goooood', 'goood', 'good', 'god']

# + one or more repetitions (every word here has at least one "o")
print(re.findall(r"go+d", text))  # ['goooood', 'goood', 'good', 'god']

# ? zero or one repetition: "go", an optional second "o", then "d"
print(re.findall(r"goo?d", text))  # ['good', 'god']

# {n} exactly n repetitions
print(re.findall(r"o{2}", "food moon"))  # ['oo', 'oo']

# {n,} at least n repetitions
print(re.findall(r"o{2,}", "food mooon"))  # ['oo', 'ooo']

# {n,m} between n and m repetitions; greedy, so it takes 3 first, then the remaining 2
print(re.findall(r"o{1,3}", "foooood"))  # ['ooo', 'oo']

['goooood', 'goood', 'good', 'god']
['goooood', 'goood', 'good', 'god']
['good', 'god']
['oo', 'oo']
['oo', 'ooo']
['ooo', 'oo']


### 常用函数

#### findall - 查找所有

In [4]:
import re

text = "电话: 138-1234-5678, 139-8765-4321"
# findall returns every non-overlapping substring shaped like DDD-DDDD-DDDD.
phones = re.findall(r"\d{3}-\d{4}-\d{4}", text)
print(phones)  # ['138-1234-5678', '139-8765-4321']

['138-1234-5678', '139-8765-4321']


#### search - 查找第一个

In [5]:
import re

text = "价格: $19.99 折扣价: $9.99"
# \$ and \. are escaped literals; re.search stops at the first match, so the
# second price ($9.99) is not reported.
match = re.search(r"\$\d+\.\d{2}", text)
if match:
    print("找到价格:", match.group())  # $19.99

找到价格: $19.99


#### match - 从头匹配

In [6]:
import re

text = "2024-01-15 今天天气很好"
# re.match anchors at the start of the string, so this only succeeds because
# the date is the very first thing in the text.
match = re.match(r"\d{4}-\d{2}-\d{2}", text)
if match:
    print("日期:", match.group())  # 2024-01-15

日期: 2024-01-15


#### sub - 替换

In [7]:
import re

text = "我的电话是 13800138000，备用 13900139000"
# Mask the middle 4 digits of each phone number: groups 1 and 2 keep the
# first 3 and last 4 digits, and \1 / \2 re-insert them in the replacement.
new_text = re.sub(r"(\d{3})\d{4}(\d{4})", r"\1****\2", text)
print(new_text)  # 我的电话是 138****8000，备用 139****9000

我的电话是 138****8000，备用 139****9000


#### split - 分割

In [8]:
import re

text = "苹果,香蕉;橙子 西瓜|葡萄"
# Split on any run of commas, semicolons, whitespace or pipes; the trailing +
# treats consecutive separators as a single delimiter.
items = re.split(r"[,;\s|]+", text)
print(items)  # ['苹果', '香蕉', '橙子', '西瓜', '葡萄']

['苹果', '香蕉', '橙子', '西瓜', '葡萄']


### 分组捕获

In [9]:
import re

# Basic (numbered) groups: each pair of parentheses captures one part.
text = "2024-01-15"
match = re.match(r"(\d{4})-(\d{2})-(\d{2})", text)
if match:
    print("年:", match.group(1))  # 2024
    print("月:", match.group(2))  # 01
    print("日:", match.group(3))  # 15
    print("全部:", match.group(0))  # 2024-01-15 (group 0 is the whole match)

# Named groups: (?P<name>...) lets the capture be retrieved by name.
text = "姓名: 张三, 年龄: 25"
match = re.search(r"姓名: (?P<name>\w+), 年龄: (?P<age>\d+)", text)
if match:
    print(match.group("name"))  # 张三
    print(match.group("age"))   # 25

年: 2024
月: 01
日: 15
全部: 2024-01-15
张三
25


### 实用案例

#### 邮箱验证

In [10]:
import re

def validate_email(email):
    """Return True if *email* looks like a simple ``local@domain.tld`` address.

    This is a pragmatic format check, not full RFC 5322 validation: the local
    part may contain letters, digits and ``._%+-``; the domain may contain
    letters, digits, dots and hyphens and must end in a dot followed by at
    least two letters.

    Args:
        email: The string to check.

    Returns:
        bool: True when the entire string matches the pattern, else False.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # fullmatch requires the whole string to match. The previous
    # re.match(... "$") form also accepted a trailing newline, because "$"
    # matches just before a final "\n".
    return re.fullmatch(pattern, email) is not None

# Sample inputs: two well-formed addresses and one without an "@".
emails = ["test@example.com", "invalid.email", "user@domain.co.uk"]
for email in emails:
    print(f"{email}: {validate_email(email)}")

test@example.com: True
invalid.email: False
user@domain.co.uk: True


#### 手机号提取

In [11]:
import re

def extract_phones(text):
    """Return all 11-digit mobile numbers found in *text*.

    A number is a "1", then a digit from 3 to 9, then nine more digits. This
    is a simplified shape check; it does not validate carrier prefixes.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The matched numbers in order of appearance (possibly empty).
    """
    # The digit lookarounds stop the pattern from picking 11 digits out of a
    # longer digit run (order numbers, IDs). \b is not used because CJK
    # characters count as word characters, so there is no word boundary
    # between them and an adjacent digit.
    pattern = r"(?<!\d)1[3-9]\d{9}(?!\d)"
    return re.findall(pattern, text)

# The landline (010-12345678) must not be reported as a mobile number.
text = "联系我: 13800138000 或 13900139000，座机: 010-12345678"
phones = extract_phones(text)
print("手机号:", phones)  # ['13800138000', '13900139000']

手机号: ['13800138000', '13900139000']


#### HTML标签内容提取

In [12]:
import re

html = "<h1>标题</h1><p>段落内容</p><div>更多内容</div>"
# (.*?) is non-greedy, so each match stops at the nearest closing tag.
titles = re.findall(r"<h1>(.*?)</h1>", html)
# Generic form: any opening tag, the captured text, then any closing tag.
# NOTE: the closing tag is not required to match the opening one and nested
# tags are not handled; prefer a real HTML parser for non-trivial documents.
contents = re.findall(r"<[^>]+>(.*?)</[^>]+>", html)

print("标题:", titles)      # ['标题']
print("所有内容:", contents)  # ['标题', '段落内容', '更多内容']

标题: ['标题']
所有内容: ['标题', '段落内容', '更多内容']


#### 密码强度检查

In [13]:
import re

def check_password(password):
    """Report the first password-strength rule that *password* fails.

    Rules are evaluated in order: minimum length of 8, an uppercase letter,
    a lowercase letter, a digit, and one of the characters ``!@#$%^&*()``.

    Args:
        password: The candidate password string.

    Returns:
        str: The message for the first failed rule, or the "strong enough"
        message when every rule passes.
    """
    if len(password) < 8:
        return "密码太短"

    # (pattern that must be present, message returned when it is absent)
    requirements = (
        (r"[A-Z]", "需要大写字母"),
        (r"[a-z]", "需要小写字母"),
        (r"\d", "需要数字"),
        (r"[!@#$%^&*()]", "需要特殊字符"),
    )

    # The generator is lazy, so evaluation stops at the first missing rule;
    # the default value is the success message.
    return next(
        (hint for regex, hint in requirements if re.search(regex, password) is None),
        "密码强度足够",
    )

# One passing password, one that is too short, and one that is long enough
# but fails the first character-class rule (no uppercase letter).
print(check_password("Pass123!"))    # 密码强度足够
print(check_password("weak"))        # 密码太短
print(check_password("nopassword"))  # 需要大写字母

密码强度足够
密码太短
需要大写字母


#### 日志分析

In [14]:
import re

log = """
2024-01-15 10:30:25 INFO 用户登录成功
2024-01-15 10:31:00 ERROR 数据库连接失败
2024-01-15 10:32:15 WARN 内存使用率80%
"""

# Extract the message of every ERROR entry. "." does not match a newline, so
# (.+) captures only the rest of that line.
errors = re.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} ERROR (.+)", log)
print("错误信息:", errors)  # ['数据库连接失败']

# Extract the level of every entry: the word that follows the timestamp.
levels = re.findall(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} (\w+) ", log)
print("日志级别:", levels)  # ['INFO', 'ERROR', 'WARN']

错误信息: ['数据库连接失败']
日志级别: ['INFO', 'ERROR', 'WARN']
