In [1]:
import stanza

# stanza.download('zh-hant')

config = {
    'processors': 'tokenize,pos,lemma,depparse', # Comma-separated list of processors to use
    'lang': 'zh', # Language code for the language to build the Pipeline in
    'tokenize_model_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/tokenize/gsd.pt', # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
    'pos_model_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/pos/gsd.pt',
    'pos_pretrain_path':'../../../KnowledgeGraph_materials/stanza_resources/zh-hant/pretrain/gsd.pt',
    'lemma_model_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/lemma/gsd.pt',
    'depparse_model_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/depparse/gsd.pt',
    'depparse_pretrain_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/pretrain/gsd.pt',
}

nlp = stanza.Pipeline(**config) # Initialize the pipeline using a configuration dict

2021-05-20 10:52:33 INFO: "zh" is an alias for "zh-hans"
2021-05-20 10:52:33 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package                 |
---------------------------------------
| tokenize  | ../../../K...ize/gsd.pt |
| pos       | ../../../K...pos/gsd.pt |
| lemma     | ../../../K...mma/gsd.pt |
| depparse  | ../../../K...rse/gsd.pt |

2021-05-20 10:52:33 INFO: Use device: gpu
2021-05-20 10:52:33 INFO: Loading: tokenize
2021-05-20 10:52:35 INFO: Loading: pos
2021-05-20 10:52:35 INFO: Loading: lemma
2021-05-20 10:52:35 INFO: Loading: depparse
2021-05-20 10:52:36 INFO: Done loading processors!


# Lemmatization

In [2]:
# doc = nlp("古往今來，能飾演古龍小說人物楚留香的，無一不是娛樂圈公認的美男子，2011年，36歲的張智堯在，楚留香新傳，裡飾演楚留香，依舊帥得讓人無法自拔")
# print(*[f'word: {word.text+" "}\tlemma: {word.lemma}' for sent in doc.sentences for word in sent.words], sep='\n')

In [8]:
doc = nlp("動物園裡有獅子與老虎。 ")
        
        
for sent in doc.sentences:
    for word in sent.words:
        print(f"id: {word.id}", f"word: {word.text}", f"head id: {word.head}", 
              f"head: {sent.words[word.head-1].text if word.head > 0 else 'root'}", f"deprel: {word.deprel}")


# # append data for graph searching
# edges_list.append(str(token))
# dependencies_list.append(token.dep_)

    
# print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')

id: 1 word: 動物 head id: 2 head: 園 deprel: compound
id: 2 word: 園 head id: 4 head: 有 deprel: nsubj
id: 3 word: 裡 head id: 2 head: 園 deprel: acl
id: 4 word: 有 head id: 0 head: root deprel: root
id: 5 word: 獅子 head id: 4 head: 有 deprel: obj
id: 6 word: 與 head id: 7 head: 老虎 deprel: cc
id: 7 word: 老虎 head id: 5 head: 獅子 deprel: conj
id: 8 word: 。 head id: 4 head: 有 deprel: punct


In [7]:
# doc = nlp("埃及托勒密王朝在亞歷山大城建成規模巨大的亞歷山大圖書館，古希臘埃拉托色尼首創，地理學，一詞")
doc = nlp("動物園裡有獅子與老虎。 ")

text = []
upos = []

for sent in doc.sentences:
    for word in sent.words:
        text.append(word.text)
        upos.append(word.upos)
    
print(text)
print(upos)

print(*[f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {word.feats if word.feats else "_"}' for sent in doc.sentences for word in sent.words], sep='\n')

['動物', '園', '裡', '有', '獅子', '與', '老虎', '。']
['NOUN', 'PART', 'ADP', 'VERB', 'NOUN', 'CCONJ', 'NOUN', 'PUNCT']
word: 動物	upos: NOUN	xpos: NN	feats: _
word: 園	upos: PART	xpos: SFN	feats: _
word: 裡	upos: ADP	xpos: IN	feats: _
word: 有	upos: VERB	xpos: VV	feats: _
word: 獅子	upos: NOUN	xpos: NN	feats: _
word: 與	upos: CCONJ	xpos: CC	feats: _
word: 老虎	upos: NOUN	xpos: NN	feats: _
word: 。	upos: PUNCT	xpos: .	feats: _


In [15]:
from zhconv import convert
import requests

query_word = "黃光"

query_word_simplified = convert(query_word, 'zh-hans')

r = requests.get("http://shuyantech.com/api/cnprobase/ment2ent?q=", verify=False)

result_json = r.json()

print(result_json )
print(result_json.keys())
print(result_json["status"])
print(result_json["ret"])

{'status': 'ok', 'ret': ['黄光（八甲镇农办副主任）', '黄光（革命烈士）', '黄光（见义勇为打死人贩的爷爷）', '黄光（海南医学院教授）', '黄光（广东省湛江市住房和城乡建设局局长）', '黄光（光刻工艺）']}
dict_keys(['status', 'ret'])
ok
['黄光（八甲镇农办副主任）', '黄光（革命烈士）', '黄光（见义勇为打死人贩的爷爷）', '黄光（海南医学院教授）', '黄光（广东省湛江市住房和城乡建设局局长）', '黄光（光刻工艺）']
