## 学习要点

1、Matcher对象的创建和使用

2、匹配模式的定义技巧

In [1]:
# 导入必要的库
# 导入 spaCy 库
import spacy
# 导入 Matcher 类，用于创建自定义的匹配规则。
from spacy.matcher import Matcher

In [2]:
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Create a Matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [4]:
# Single-token pattern: LIKE_EMAIL is a token flag predefined by spaCy that is
# true for tokens that look like an e-mail address.
pattern = [{"LIKE_EMAIL": True}]

In [5]:
# Register the pattern under the rule name "EMAIL_ADDRESS".
# The second argument is a list of patterns, hence the extra brackets.
matcher.add("EMAIL_ADDRESS", [pattern])

In [6]:
# Run the pipeline on a sample sentence that contains an e-mail address.
doc = nlp("This is an email address: abc@gmail.com")

In [7]:
# Apply the matcher to the Doc to collect every match.
matches = matcher(doc)

In [8]:
# Display the raw result: a list of (match_id, start, end) tuples, where
# start and end are token indices into `doc`.
matches

[(16571425990740197027, 6, 7)]

In [9]:
# Print the matched span. The token boundaries are taken from the first match
# instead of being copied by hand from the previous cell's output, so the cell
# stays correct if the sample sentence changes.
match_id, start, end = matches[0]
print(doc[start:end])

abc@gmail.com


In [10]:
# Recover the rule name from the first match.
# matches[0][0] is the match id, i.e. the hash of the rule name; the
# vocabulary's StringStore maps that hash back to the original string.
nlp.vocab.strings[matches[0][0]]

'EMAIL_ADDRESS'

In [11]:
# Read the article text. The encoding is stated explicitly because the text
# contains non-ASCII characters (e.g. the en dash in "1929 – April"), which
# would be misread or raise an error under a non-UTF-8 platform default.
with open("data/wiki_mlk.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [12]:
# Display the loaded text.
text

'Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.\n\nKing participated in and led marches for blacks\' right to vote, desegregation, labor rights, and other basic civil rights.[1] King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his fam

In [13]:
# Reload the English pipeline.
# NOTE(review): `nlp` was already loaded with the same model in an earlier
# cell, so this call looks redundant.
nlp = spacy.load("en_core_web_sm")

In [14]:
# Start over with a fresh matcher whose only rule fires on single proper nouns.
matcher = Matcher(nlp.vocab)

# "POS" selects spaCy's part-of-speech attribute and "PROPN" is its tag for
# proper nouns. The rule name "PROPER_NOUN" is an arbitrary label of our own.
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUN", [pattern])

# Process the article and collect the matches.
doc = nlp(text)
matches = matcher(doc)

# Report the total, then show the first ten hits as
# "(match_id, start, end) matched text".
print(len(matches))
for hit in matches[:10]:
    rule_id, start, end = hit
    print(hit, doc[start:end])

102
(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 23, 24) Baptist


In [15]:
# Fresh matcher bound to the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Same proper-noun token as before, now with the operator "OP": "+".
# "+" is a quantifier borrowed from regular expressions: the token may repeat
# one or more times, so the rule matches a single proper noun as well as any
# run of consecutive proper nouns. Without a greedy setting every sub-run is
# reported separately (e.g. "Martin", "Martin Luther", "Luther", ...).
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUN", [pattern])

# Process the article and collect the matches.
doc = nlp(text)
matches = matcher(doc)

# Report the total, then show the first ten hits with their text.
print(len(matches))
for hit in matches[:10]:
    rule_id, start, end = hit
    print(hit, doc[start:end])

175
(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 1, 2) Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 1, 3) Luther King
(451313080118390996, 2, 3) King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


In [16]:
# Fresh matcher for runs of one or more consecutive proper nouns.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]

# greedy="LONGEST" keeps only the longest match among overlapping candidates.
# As the output shows, the returned list is not in document order.
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

# Process the article and collect the matches.
doc = nlp(text)
matches = matcher(doc)

# Report how many proper-noun sequences were found, then show the first ten.
print(len(matches))
for hit in matches[:10]:
    rule_id, start, end = hit
    print(hit, doc[start:end])

61
(451313080118390996, 83, 88) Martin Luther King Sr.
(451313080118390996, 469, 474) Martin Luther King Jr. Day
(451313080118390996, 536, 541) Martin Luther King Jr. Memorial
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 128, 132) Southern Christian Leadership Conference
(451313080118390996, 247, 251) Director J. Edgar Hoover
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 325, 328) Nobel Peace Prize
(451313080118390996, 422, 425) James Earl Ray
(451313080118390996, 463, 466) Congressional Gold Medal


In [17]:
# Fresh matcher for runs of one or more consecutive proper nouns,
# keeping only the longest match among overlapping candidates.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

# Process the article, then order the matches by their start token (index 1 of
# each tuple) so they appear in the order they occur in the text rather than
# in the order the matcher returned them.
doc = nlp(text)
matches = sorted(matcher(doc), key=lambda hit: hit[1])

# Report how many proper-noun sequences were found, then show the first ten.
print(len(matches))
for hit in matches[:10]:
    rule_id, start, end = hit
    print(hit, doc[start:end])

61
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 23, 24) Baptist
(451313080118390996, 49, 50) King
(451313080118390996, 69, 71) Mahatma Gandhi
(451313080118390996, 83, 88) Martin Luther King Sr.
(451313080118390996, 89, 90) King
(451313080118390996, 113, 114) King


In [18]:
# Fresh matcher bound to the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Two-part pattern: one or more proper nouns immediately followed by a verb.
# The rule keeps the name "PROPER_NOUN" used by the earlier cells.
pattern = [{"POS": "PROPN", "OP": "+"}, {"POS": "VERB"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

# Process the article and order the matches by start token.
doc = nlp(text)
matches = sorted(matcher(doc), key=lambda hit: hit[1])

# Report the total, then show up to ten hits as
# "(match_id, start, end) matched text".
print(len(matches))
for hit in matches[:10]:
    rule_id, start, end = hit
    print(hit, doc[start:end])

7
(451313080118390996, 49, 51) King advanced
(451313080118390996, 89, 91) King participated
(451313080118390996, 113, 115) King led
(451313080118390996, 167, 169) King helped
(451313080118390996, 247, 252) Director J. Edgar Hoover considered
(451313080118390996, 322, 324) King won
(451313080118390996, 485, 488) United States beginning


In [19]:
# json is needed to parse the dataset file below.
import json

# Load the Alice dataset. The encoding is stated explicitly so the read does
# not depend on the platform's default text encoding.
with open("data/alice.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [20]:
# Use the first entry of data[0][2] as the sample text.
# NOTE(review): the nesting of the JSON file is not visible here; a later cell
# iterates data[0][2] as a sequence of strings.
text = data[0][2][0]

In [21]:
# Display the sample text.
text

"Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'"

In [22]:
# Replace every backtick with a plain apostrophe so that opening and closing
# quotes are the same character.
text= text.replace("`", "'")

# Display the normalized text.
text

"Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'"

In [23]:
# Fresh matcher bound to the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Pattern for text wrapped in single quotes: an opening quote, one or more
# alphabetic tokens, zero or more punctuation tokens, and a closing quote.
pattern = [
    {"ORTH": "'"},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": "'"},
]

# The rule keeps the name "PROPER_NOUN" used by the earlier cells even though
# it now matches quotations; greedy="LONGEST" prefers the longest match.
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

# Process the sample text and order the matches by start token.
doc = nlp(text)
matches = sorted(matcher(doc), key=lambda hit: hit[1])

# Report the total, then show up to ten hits with their text.
print(len(matches))
for hit in matches[:10]:
    rule_id, start, end = hit
    print(hit, doc[start:end])

2
(451313080118390996, 47, 58) 'and what is the use of a book,'
(451313080118390996, 60, 67) 'without pictures or conversation?'


In [24]:
# Lemmas of the reporting verbs that may link a quotation to its speaker.
speak_lemmas = ["think", "say"]

# Token specs for one quoted stretch: opening quote, one or more alphabetic
# tokens, zero or more punctuation tokens, closing quote.
quoted = [
    {"ORTH": "'"},
    {"IS_ALPHA": True, "OP": "+"},
    {"IS_PUNCT": True, "OP": "*"},
    {"ORTH": "'"},
]
# A verb whose lemma is in speak_lemmas, followed by one or more proper nouns.
attribution = [
    {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}},
    {"POS": "PROPN", "OP": "+"},
]
# Full pattern: quotation, attribution, quotation. The second quotation uses
# copies of the specs so that no dict object appears twice in the pattern.
pattern = quoted + attribution + [dict(spec) for spec in quoted]

# The rule keeps the name "PROPER_NOUN" used by the earlier cells;
# greedy="LONGEST" prefers the longest match.
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

# Process the sample text and order the matches by start token.
doc = nlp(text)
matches = sorted(matcher(doc), key=lambda hit: hit[1])

# Report the total, then show up to ten hits with their text.
print(len(matches))
for hit in matches[:10]:
    rule_id, start, end = hit
    print(hit, doc[start:end])

1
(451313080118390996, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [25]:
# Apply the matcher built in the previous cell to every text in data[0][2],
# after replacing backticks with plain apostrophes.
for text in (raw.replace("`", "'") for raw in data[0][2]):
    doc = nlp(text)
    # Order the matches by start token.
    matches = sorted(matcher(doc), key=lambda hit: hit[1])
    # Print the number of matches for this text, then up to ten hits as
    # "(match_id, start, end) matched text".
    print(len(matches))
    for hit in matches[:10]:
        rule_id, start, end = hit
        print(hit, doc[start:end])

1
(451313080118390996, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [26]:
# A span counts as a match when it satisfies any ONE of the patterns below.

# Lemmas of the reporting verbs that may link a quotation to its speaker.
speak_lemmas = ["think", "say"]


def _quoted_text():
    """Return fresh token specs for text wrapped in single quotes.

    Opening quote, one or more alphabetic tokens, zero or more punctuation
    tokens, closing quote.
    """
    return [{'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"}, {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}]


def _speech_verb():
    """Return a fresh token spec for a verb whose lemma is in speak_lemmas."""
    return [{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}]


def _speaker():
    """Return a fresh token spec for one or more consecutive proper nouns."""
    return [{"POS": "PROPN", "OP": "+"}]


# Pattern 1: quotation, reporting verb, speaker, quotation.
pattern1 = _quoted_text() + _speech_verb() + _speaker() + _quoted_text()
# Pattern 2: quotation, reporting verb, speaker.
pattern2 = _quoted_text() + _speech_verb() + _speaker()
# Pattern 3: speaker, reporting verb, quotation.
pattern3 = _speaker() + _speech_verb() + _quoted_text()

# Register all three patterns under one rule name; greedy='LONGEST' keeps the
# longest match among overlapping candidates.
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUNS", [pattern1, pattern2, pattern3], greedy='LONGEST')

# Run the matcher over every text in data[0][2]. A dedicated loop variable is
# used so the notebook-level `text` is not overwritten on each iteration.
for passage in data[0][2]:
    # Replace backticks with plain apostrophes before processing.
    doc = nlp(passage.replace("`", "'"))
    # Order the matches by start token.
    matches = sorted(matcher(doc), key=lambda hit: hit[1])
    # Print the number of matches, then up to ten hits as
    # "(match_id, start, end) matched text".
    print(len(matches))
    for hit in matches[:10]:
        rule_id, start, end = hit
        print(hit, doc[start:end])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
1
(3232560085755078826, 0, 6) 'Well!' thought Alice
0
0
0
0
0
0
0
1
(3232560085755078826, 57, 68) 'which certainly was not here before,' said Alice
0
0
