<a href="https://colab.research.google.com/github/changedi/DPpro/blob/master/HanLP_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# use hanlp

In [None]:
!pip install hanlp[full] -U

## 文本相似
Compute the semantic similarity between two texts using a pretrained model.

In [None]:
import hanlp

# Load the pretrained semantic textual similarity (STS) model.
sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)

# Each inner list is one pair of texts to compare; all pairs go in one call.
sentence_pairs = [
    ['看图猜一电影名', '看图猜电影'],
    ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'],
    ['北京到上海的动车票', '上海到北京的动车票'],
]
similarities = sim(sentence_pairs)
print(similarities)

In [None]:
# use fasttext
import hanlp
import torch

# NOTE: the loaded fasttext object is a `torch.nn.Module`. Using it directly
# is only worthwhile if you are comfortable writing PyTorch code.
fasttext = hanlp.load(hanlp.pretrained.fasttext.FASTTEXT_WIKI_300_ZH)

vec = fasttext('单词')
print(vec)

# Print the cosine similarity between '单词' and each of two other words.
cosine_similarity = torch.nn.functional.cosine_similarity
for other in ('词语', '今天'):
    print(cosine_similarity(fasttext('单词'), fasttext(other), dim=0))

In [None]:
# use word2vec
import hanlp
import torch

word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU)
vec = word2vec('先进')
print(vec)

# Print the cosine similarity between '先进' and each of two other words.
for other in ('优秀', '水果'):
    score = torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec(other), dim=0)
    print(score)

print('获取语义最相似的词语：')
nearest = word2vec.most_similar('上海')
print(nearest)
# Passing a list, e.g. word2vec.most_similar(['上海', '寒冷']), batches the
# queries and is faster.

# Each entry is (caption to print, query text, keyword options for most_similar).
oov_demos = (
    ('非常寒冷是OOV所以无法获取：', '非常寒冷', {}),
    ('但是在doc2vec模式下OOV也可以进行相似度计算：', '非常寒冷', {'doc2vec': True}),
    ('甚至可以处理短文本：', '国家图书馆推出2022年春节主题活动', {'doc2vec': True}),
)
for caption, query, options in oov_demos:
    print(caption)
    print(word2vec.most_similar(query, **options))

## 分词
tokenize

In [None]:
import hanlp

tokenizer = hanlp.load(hanlp.pretrained.tok.LARGE_ALBERT_BASE)

# A single string is tokenized on its own.
print(tokenizer('商品和服务'))

# A list of strings is tokenized in one call.
sample_sentences = [
    '萨哈夫说，伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。',
    '上海华安工业（集团）公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。',
    'HanLP支援臺灣正體、香港繁體，具有新詞辨識能力的中文斷詞系統',
]
print(tokenizer(sample_sentences))

text = 'NLP统计模型没有加规则，聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

# Custom dictionary: word -> tag, consumed by `split_by_dic`.
dic = {
    '自定义词典': 'custom_dict',
    '聪明人': 'smart',
}


def split_by_dic(text: str, dictionary=None, tokenize=None):
    """Tokenize ``text`` while keeping every dictionary entry as one unit.

    Each occurrence of a dictionary key is cut out of ``text`` and emitted as
    a ``(word, tag)`` tuple. The text between occurrences is handed to the
    tokenizer in a single batched call and its tokens are spliced back in
    text order.

    Args:
        text: The string to tokenize.
        dictionary: Mapping from word to tag. Defaults to the module-level
            ``dic``.
        tokenize: Callable that takes a list of strings and returns one list
            of tokens per string. Defaults to the module-level ``tokenizer``.

    Returns:
        A flat list, in text order, mixing the tokenizer's tokens with
        ``(word, tag)`` tuples for dictionary hits.
    """
    # We use a regular expression for the sake of simplicity.
    # A trie is the better choice for production use.
    import re

    dictionary = dic if dictionary is None else dictionary
    tokenize = tokenizer if tokenize is None else tokenize

    # Escape keys so they match literally, and try longer keys first so that a
    # key which is a prefix of another cannot shadow it. Empty keys are
    # skipped because they would match at every position.
    keys = sorted((k for k in dictionary if k), key=len, reverse=True)
    matches = re.finditer('|'.join(map(re.escape, keys)), text) if keys else ()

    # `layout` records the output order: None stands for "the next tokenized
    # segment", a tuple is a dictionary hit to emit as-is.
    segments, layout, offset = [], [], 0
    for m in matches:
        if offset < m.start():
            segments.append(text[offset:m.start()])
            layout.append(None)
        # Always record the hit and advance, even when it starts right at
        # `offset` (the text begins with a hit, or two hits are adjacent).
        layout.append((m.group(), dictionary[m.group()]))
        offset = m.end()
    if offset < len(text):
        segments.append(text[offset:])
        layout.append(None)

    # One batched call; skipped entirely when there is nothing to tokenize.
    predictions = iter(tokenize(segments) if segments else ())
    flat = []
    for entry in layout:
        if entry is None:
            flat.extend(next(predictions))
        else:
            flat.append(entry)
    return flat


# Run the dictionary-aware splitter on the sample text and show the result.
dic_tokens = split_by_dic(text)
print(dic_tokens)

In [None]:
# use trie
from hanlp_trie.trie import Trie

import hanlp

tokenizer = hanlp.load('LARGE_ALBERT_BASE')
text = 'NLP统计模型没有加规则，聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
plain_tokens = tokenizer(text)
print(plain_tokens)

# Custom dictionary stored in a trie: word -> value attached to that word.
custom_words = {
    '自定义词典': 'custom_dict',
    '聪明人': 'smart',
}
trie = Trie()
trie.update(custom_words)


def split_sents(text: str, trie: Trie):
    """Cut ``text`` around the matches reported by ``trie.parse_longest``.

    Args:
        text: The string to split.
        trie: Object whose ``parse_longest(text)`` returns
            ``(start, end, value)`` triples.

    Returns:
        A 3-tuple ``(sents, offsets, words)``: the stretches of text lying
        between matches, the start offset of each stretch, and the matches
        exactly as returned by ``trie.parse_longest``.
    """
    matches = trie.parse_longest(text)
    spans = [(begin, finish) for begin, finish, _ in matches]
    # A gap runs from the end of one match (or 0) to the start of the next
    # match (or the end of the text); zero-width gaps are dropped.
    gap_starts = [0] + [finish for _, finish in spans]
    gap_ends = [begin for begin, _ in spans] + [len(text)]
    gaps = [(lo, hi) for lo, hi in zip(gap_starts, gap_ends) if lo != hi]
    return [text[lo:hi] for lo, hi in gaps], [lo for lo, _ in gaps], matches


# Show the (sents, offsets, words) triple produced for the sample text.
split_result = split_sents(text, trie)
print(split_result)


def merge_parts(parts, offsets, words):
    """Interleave tokenized parts and dictionary words back into text order.

    Args:
        parts: One list of tokens per free-text segment.
        offsets: Start offset of each segment, parallel to ``parts``.
        words: ``(start, end, value)`` triples for the dictionary matches.

    Returns:
        A flat list holding the tokens of every part and the ``value`` of
        every word, ordered by start offset.
    """
    items = list(zip(offsets, parts))
    items += [(start, [value]) for start, _end, value in words]
    # Sort on the offset only. Sorting whole tuples would fall back to
    # comparing the payload lists whenever two offsets are equal, which raises
    # TypeError for payloads that cannot be ordered against each other.
    items.sort(key=lambda item: item[0])
    return [token for _, payload in items for token in payload]


# Rebind `tokenizer` to a three-stage pipeline: split around dictionary words,
# tokenize the remaining parts with the loaded model, then merge everything
# back together. The model is captured by the second stage before the name
# is rebound.
tokenizer = (
    hanlp.pipeline()
    .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie)
    .append(tokenizer, input_key='parts', output_key='tokens')
    .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')
)

pipeline_output = tokenizer(text)
print(pipeline_output)

## pos

In [None]:
import hanlp
from hanlp.pretrained.pos import CTB9_POS_ALBERT_BASE

tagger = hanlp.load(CTB9_POS_ALBERT_BASE)

# One pre-tokenized sentence.
single_sentence = ['我', '的', '希望', '是', '希望', '世界', '和平']
print(tagger.predict(single_sentence))

# Several pre-tokenized sentences passed together as one batch.
sentence_batch = [
    ['支持', '批处理', '地', '预测'],
    ['速度', '更', '快'],
]
print(tagger.predict(sentence_batch))

## use pipeline

In [None]:
import hanlp

# Load one component per pipeline stage.
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
tagger = hanlp.load('CTB9_POS_ALBERT_BASE')
syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH')
semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH')

# Chain the stages; each stage stores its result under `output_key`, and both
# parsers read the 'tokens' and 'part_of_speech_tags' entries.
pipeline = (
    hanlp.pipeline()
    .append(hanlp.utils.rules.split_sentence, output_key='sentences')
    .append(tokenizer, output_key='tokens')
    .append(tagger, output_key='part_of_speech_tags')
    .append(
        syntactic_parser,
        input_key=('tokens', 'part_of_speech_tags'),
        output_key='syntactic_dependencies',
        conll=False,
    )
    .append(
        semantic_parser,
        input_key=('tokens', 'part_of_speech_tags'),
        output_key='semantic_dependencies',
        conll=False,
    )
)
print(pipeline)

text = '''HanLP是一系列模型与算法组成的自然语言处理工具包，目标是普及自然语言处理在生产环境中的应用。
HanLP具备功能完善、性能高效、架构清晰、语料时新、可自定义的特点。
内部算法经过工业界和学术界考验，配套书籍《自然语言处理入门》已经出版。
'''

doc = pipeline(text)
print(doc)
# The doc is JSON serializable by default, provided every pipe also outputs
# JSON-serializable objects, e.g.:
# print(json.dumps(doc, ensure_ascii=False, indent=2))

# Save the pipeline config to disk for deployment or sharing ...
config_path = 'zh.json'
pipeline.save(config_path)
# ... then load it back and run it on the same text.
deployed = hanlp.load(config_path)
print(deployed)
print(deployed(text))

## AMR (Abstract Meaning Representation)

In [None]:
import hanlp

parser = hanlp.load(hanlp.pretrained.amr.MRP2020_AMR_ENG_ZHO_XLM_BASE)

# Chinese: a pre-tokenized sentence, parsed with the default output and then
# with `output_amr=False`.
zh_tokens = ["男孩", "希望", "女孩", "相信", "他", "。"]
print(parser(list(zh_tokens)))
print(parser(list(zh_tokens), output_amr=False))

# English: pass `language='eng'`.
en_tokens = ['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.']
print(parser(list(en_tokens), language='eng'))

# Feeding (token, lemma) pairs is suggested for more stable performance.
en_lemmas = ['the', 'boy', 'want', 'the', 'girl', 'to', 'believe', 'he', '.']
print(parser(list(zip(en_tokens, en_lemmas)), language='eng'))

## MLM (Masked Language Model)

In [None]:
from hanlp.components.lm.mlm import MaskedLanguageModel

mlm = MaskedLanguageModel()
mlm.load('bert-base-chinese')

# Fill the masked position of a single sentence.
single_query = '生活的真谛是[MASK]。'
print(mlm(single_query))

# Batching is always faster: pass several sentences in one call.
query_batch = [single_query, '巴黎是[MASK][MASK]的首都。']
print(mlm(query_batch))

## NER

In [None]:
import hanlp
from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition
from hanlp.utils.io_util import get_resource

HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH)
ner: TaggingNamedEntityRecognition = HanLP['ner/msra']

# Whitelist dictionary: map the phrase '午饭后' to the label TIME, then run
# only the 'ner/msra' task.
ner.dict_whitelist = {'午饭后': 'TIME'}
doc = HanLP('2021年测试高血压是138，时间是午饭后2点45，低血压是44', tasks='ner/msra')
doc.pretty_print()
print(doc['ner/msra'])

# Tag dictionary: map a token sequence to a tag sequence of the same length.
ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}
tagged_doc = HanLP('他在浙江金华出生，他的名字叫金华。', tasks='ner/msra')
tagged_doc.pretty_print()

# HanLP.save(get_resource(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH))

# Understanding this requires some background in the underlying algorithms;
# beginners can refer to http://nlp.hankcs.com/book.php
# See https://hanlp.hankcs.com/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.html

## MTL

In [None]:
import hanlp
from hanlp_common.document import Document

# CLOSE is the closed-source corpus annotated by Natural Semantics, BASE means
# the medium-sized model, and ZH means Chinese.
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)

# All tasks are run by default.
sentences = [
    '2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
    '阿婆主来到北京立方庭参观自然语义科技公司。',
]
doc: Document = HanLP(sentences)

# The returned Document is a subclass of dict; its printed form is
# JSON-compatible.
print(doc)

# Instant visualization. Maximize the window to avoid line wrapping; calling
# this from a Jupyter Notebook is recommended.
doc.pretty_print()

# To visualize NER in the OntoNotes standard instead:
# doc.pretty_print(ner='ner/ontonotes', pos='pku')