In [1]:
import hanlp

In [2]:
# Load the three pretrained HanLP components used below: a fine-grained Chinese
# tokenizer, a CTB9 part-of-speech tagger and a CTB9 constituency parser.
# All three are referenced through the `hanlp.pretrained` constants (instead of a
# mix of bare strings and constants) so a misspelt identifier fails immediately
# with an AttributeError rather than being treated as a path/URL.
tok = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_FULL_TAG_ELECTRA_SMALL)

  import pynvml  # type: ignore[import]
  from .autonotebook import tqdm as notebook_tqdm
                                   

In [47]:
from hanlp_common.document import Document
def merge_pos_into_con(doc:Document):
    """Fill placeholder ``'_'`` labels in the constituency trees with POS tags.

    For every ``(tree, tags)`` pair taken from ``doc['con']`` and ``doc['pos']``,
    the subtrees whose ``height()`` equals 2 are visited in order. The i-th such
    subtree gets ``tags[i]`` as its label if its current label is ``'_'``;
    subtrees with any other label are left alone but still consume a position.
    Trees are modified in place via ``set_label``.

    Args:
        doc: A document holding at least the ``'con'`` and ``'pos'`` fields. When
            the first element of ``doc['pos']`` is a ``str`` the document is
            treated as a single sentence and every field is temporarily wrapped
            in a one-element list.

    Returns:
        The document. In the single-sentence case this is the result of
        ``squeeze()`` on the wrapped copy (presumably undoing the wrapping --
        confirm against ``Document.squeeze``).
    """
    single_sentence = isinstance(doc['pos'][0], str)
    if single_sentence:
        # Wrap each field so the loop below can always iterate sentence by sentence.
        doc = Document((key, [value]) for key, value in doc.items())
    for tree, tags in zip(doc['con'], doc['pos']):
        height_two_nodes = tree.subtrees(lambda t: t.height() == 2)
        for position, node in enumerate(height_two_nodes):
            if node.label() == '_':
                node.set_label(tags[position])
    return doc.squeeze() if single_sentence else doc

def extract_span_start(tokens):
    """Return a list with the element at index 1 of every item in ``tokens``.

    NOTE(review): the name suggests each item is a (token, start, ...) span
    record -- confirm against the caller.
    """
    return list(map(lambda item: item[1], tokens))

def extract_tokens(tokens):
    """Return a list with the element at index 0 of every item in ``tokens``.

    NOTE(review): the name suggests each item is a (token, start, ...) span
    record -- confirm against the caller.
    """
    return list(map(lambda item: item[0], tokens))


# Tokenizer configuration (mutates the shared `tok` object loaded earlier).
# Disable span output -- presumably makes the tokenizer return plain tokens
# without offsets; confirm against HanLP's `output_spans` option.
tok.config.output_spans = False
# Register "所述" in the tokenizer's combine dictionary -- assumed to keep it
# as one token (it appears as a single leaf in the printed trees); confirm.
tok.dict_combine = {"所述"}

In [44]:
# Build the processing pipeline: the tokenizer writes to "tok", the POS tagger
# and the constituency parser both read "tok", and merge_pos_into_con is
# appended last with input_key='*' (presumably so it receives the whole
# document -- confirm against the HanLP pipeline docs).
nlp = (
    hanlp.pipeline()
    .append(tok, output_key="tok")
    .append(pos, input_key='tok', output_key='pos')
    .append(con, input_key='tok', output_key='con')
    .append(merge_pos_into_con, input_key='*')
)

In [38]:
# Sample input: a Chinese patent claim (a method of forming a semiconductor
# structure), stored as one multi-line string with embedded newlines.
claim = """8.一种半导体结构的形成方法，其特征在于，包括：
提供基底，所述基底上形成有栅极结构，所述栅极结构两侧的基底中形成有源漏掺杂层，所述栅极结构侧部的基底上形成有覆盖所述源漏掺杂层的源漏互连层，且所述源漏互连层与源漏掺杂层电连接；
沿所述基底顶面的法线方向，去除部分厚度的所述源漏互连层，形成位于相邻所述栅极结构之间的凹槽；
在所述凹槽底部形成覆盖所述源漏互连层的保护层；
形成所述保护层后，对所述源漏互连层表面进行还原处理，用于实现去氧化；
在所述还原处理后，在所述凹槽中填充源漏盖帽层。"""

In [48]:
# Run the pipeline on the claim; the whole multi-line text is passed as a single string.
doc = nlp(claim)

In [49]:
# Display the annotated document (presumably a text rendering of its annotations -- confirm).
doc.pretty_print()

In [50]:
# Print every subtree whose label starts with "NP" and whose first leaf is
# "所述", together with its leaves.
np_subtrees = doc["con"].subtrees(lambda t: t.label().startswith("NP"))
for np_tree in np_subtrees:
    words = np_tree.leaves()
    if words[0] != "所述":
        continue
    print(np_tree, "=>", words)

(NP (ADJP (JJ 所述)) (NP (NN 基底))) => ['所述', '基底']
(NP
  (DNP
    (NP
      (NP (ADJP (JJ 所述)) (NP (NN 栅) (NN 极) (NN 结构)))
      (QP (CD 两))
      (NP (NN 侧)))
    (DEG 的))
  (NP (NN 基底))) => ['所述', '栅', '极', '结构', '两', '侧', '的', '基底']
(NP
  (NP (ADJP (JJ 所述)) (NP (NN 栅) (NN 极) (NN 结构)))
  (QP (CD 两))
  (NP (NN 侧))) => ['所述', '栅', '极', '结构', '两', '侧']
(NP (ADJP (JJ 所述)) (NP (NN 栅) (NN 极) (NN 结构))) => ['所述', '栅', '极', '结构']
(NP
  (DNP
    (NP
      (DP (JJ 所述))
      (NP (NP (NN 栅) (NN 极) (NN 结构)) (NN 侧) (NP (NN 部))))
    (DEG 的))
  (NP (NN 基底))) => ['所述', '栅', '极', '结构', '侧', '部', '的', '基底']
(NP (DP (JJ 所述)) (NP (NP (NN 栅) (NN 极) (NN 结构)) (NN 侧) (NP (NN 部)))) => ['所述', '栅', '极', '结构', '侧', '部']
(NP-OBJ (NP (JJ 所述)) (NP (NP (NN 源) (NN 漏)) (VV 掺杂)) (NP (NN 层))) => ['所述', '源', '漏', '掺杂', '层']
(NP (JJ 所述)) => ['所述']
(NP-SBJ
  (ADJP (JJ 所述))
  (NP (NP (NN 源) (NN 漏)) (VP (ADJP (AD 互)) (NP (VV 连))))
  (NP (NN 层))) => ['所述', '源', '漏', '互', '连', '层']
(NP
  (DNP
    (NP (ADJP (JJ 所述)) (NP (NP (NN 

In [34]:
# Sanity check: call the tokenizer on its own with a short sentence.
s = tok("我爱北京天安门")

In [35]:
# Inspect the tokenizer's return type (the recorded output is `list`).
type(s)

list