In [1]:
import stanza

# stanza.download('zh-hant')

config = {
    'processors': 'tokenize,pos,lemma,depparse', # Comma-separated list of processors to use
    'lang': 'zh', # Language code for the language to build the Pipeline in
    'tokenize_model_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/tokenize/gsd.pt', # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
    'pos_model_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/pos/gsd.pt',
    'pos_pretrain_path':'../../../KnowledgeGraph_materials/stanza_resources/zh-hant/pretrain/gsd.pt',
    'lemma_model_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/lemma/gsd.pt',
    'depparse_model_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/depparse/gsd.pt',
    'depparse_pretrain_path': '../../../KnowledgeGraph_materials/stanza_resources/zh-hant/pretrain/gsd.pt',
}

nlp = stanza.Pipeline(**config) # Initialize the pipeline using a configuration dict

2021-05-20 10:52:33 INFO: "zh" is an alias for "zh-hans"
2021-05-20 10:52:33 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package                 |
---------------------------------------
| tokenize  | ../../../K...ize/gsd.pt |
| pos       | ../../../K...pos/gsd.pt |
| lemma     | ../../../K...mma/gsd.pt |
| depparse  | ../../../K...rse/gsd.pt |

2021-05-20 10:52:33 INFO: Use device: gpu
2021-05-20 10:52:33 INFO: Loading: tokenize
2021-05-20 10:52:35 INFO: Loading: pos
2021-05-20 10:52:35 INFO: Loading: lemma
2021-05-20 10:52:35 INFO: Loading: depparse
2021-05-20 10:52:36 INFO: Done loading processors!


# Lemmatization

In [2]:
# doc = nlp("古往今來，能飾演古龍小說人物楚留香的，無一不是娛樂圈公認的美男子，2011年，36歲的張智堯在，楚留香新傳，裡飾演楚留香，依舊帥得讓人無法自拔")
# print(*[f'word: {word.text+" "}\tlemma: {word.lemma}' for sent in doc.sentences for word in sent.words], sep='\n')

In [29]:
doc = nlp("卡斯楚領導古巴革命，並宣布成為社會主義國家")
        
        
for sent in doc.sentences:
    for word in sent.words:
        print(f"id: {word.id}", f"word: {word.text}", f"head id: {word.head}", 
              f"head: {sent.words[word.head-1].text if word.head > 0 else 'root'}", f"deprel: {word.deprel}")


# # append data for graph searching
# edges_list.append(str(token))
# dependencies_list.append(token.dep_)

    
# print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')

id: 1 word: 卡斯楚 head id: 7 head: 宣布 deprel: nsubj
id: 2 word: 領導 head id: 7 head: 宣布 deprel: advcl
id: 3 word: 古巴 head id: 4 head: 革命 deprel: nmod
id: 4 word: 革命 head id: 2 head: 領導 deprel: obj
id: 5 word: ， head id: 7 head: 宣布 deprel: punct
id: 6 word: 並 head id: 7 head: 宣布 deprel: mark
id: 7 word: 宣布 head id: 0 head: root deprel: root
id: 8 word: 成 head id: 7 head: 宣布 deprel: xcomp
id: 9 word: 為 head id: 8 head: 成 deprel: mark
id: 10 word: 社會 head id: 12 head: 國家 deprel: nmod
id: 11 word: 主義 head id: 12 head: 國家 deprel: nmod
id: 12 word: 國家 head id: 8 head: 成 deprel: obj


In [27]:
# doc = nlp("埃及托勒密王朝在亞歷山大城建成規模巨大的亞歷山大圖書館，古希臘埃拉托色尼首創，地理學，一詞")
doc = nlp("卡斯楚領導古巴革命，並宣布成為社會主義國家")

text = []
upos = []

for sent in doc.sentences:
    for word in sent.words:
        text.append(word.text)
        upos.append(word.upos)
    
print(text)
print(upos)

print(*[f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {word.feats if word.feats else "_"}' for sent in doc.sentences for word in sent.words], sep='\n')

['卡斯楚', '領導', '古巴', '革命', '，', '並', '宣布', '成', '為', '社會', '主義', '國家']
['PROPN', 'VERB', 'PROPN', 'NOUN', 'PUNCT', 'ADV', 'VERB', 'VERB', 'VERB', 'NOUN', 'NOUN', 'NOUN']
word: 卡斯楚	upos: PROPN	xpos: NNP	feats: _
word: 領導	upos: VERB	xpos: VV	feats: _
word: 古巴	upos: PROPN	xpos: NNP	feats: _
word: 革命	upos: NOUN	xpos: NN	feats: _
word: ，	upos: PUNCT	xpos: ,	feats: _
word: 並	upos: ADV	xpos: RB	feats: _
word: 宣布	upos: VERB	xpos: VV	feats: _
word: 成	upos: VERB	xpos: VV	feats: _
word: 為	upos: VERB	xpos: VC	feats: _
word: 社會	upos: NOUN	xpos: NN	feats: _
word: 主義	upos: NOUN	xpos: NN	feats: _
word: 國家	upos: NOUN	xpos: NN	feats: _


In [21]:
from zhconv import convert
import requests

query_word = "食用"

query_word_simplified = convert(query_word, 'zh-hans')

r = requests.get("http://shuyantech.com/api/cnprobase/ment2ent?q=" + query_word_simplified, verify=False)

result_json = r.json()

print(result_json )
print(result_json.keys())
print(result_json["status"] == "ok")
print(result_json["ret"])

{'status': 'ok', 'ret': ['食用']}
dict_keys(['status', 'ret'])
True
['食用']
