## 基于隐马尔可夫模型的词性标注

In [1]:
import os
# Directory that holds the corpus files referenced below (relative path).
PKU98 = "pku98"
# Paths inside that directory: the full file, the file passed to the
# taggers' train() calls, and the file passed to PosTagUtil.evaluate().
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')

In [2]:
from  pyhanlp import *

# Look up the Java classes used in this notebook by their fully-qualified
# names; each JClass(...) result is bound to a module-level name.
HMMPOSTagger = JClass('com.hankcs.hanlp.model.hmm.HMMPOSTagger')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
CRFSegmenter = JClass('com.hankcs.hanlp.model.crf.CRFSegmenter')
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')

def train_hmm_pos(corpus, model):
    """Build an HMM part-of-speech tagger around ``model`` and train it.

    Args:
        corpus: Value forwarded unchanged to the tagger's ``train`` method
            (the notebook passes a corpus file path).
        model: Model object handed to the ``HMMPOSTagger`` constructor.

    Returns:
        The ``HMMPOSTagger`` instance after ``train(corpus)`` has been called.
    """
    pos_tagger = HMMPOSTagger(model)  # wrap the model in a POS tagger
    pos_tagger.train(corpus)  # fit on the corpus
    return pos_tagger

# Given an already-segmented sentence, predict the part of speech of each word
hmm_tagger = train_hmm_pos(PKU199801_TRAIN, FirstOrderHiddenMarkovModel())
print(', '.join(hmm_tagger.tag("他", "的", "希望", "是", "希望", "上学")))  # predict

# Perform word segmentation and part-of-speech tagging together
analyzer = AbstractLexicalAnalyzer(CRFSegmenter(), hmm_tagger)  # build the lexical analyzer
print(analyzer.analyze("他的希望是希望上学"))  # segmentation + POS tagging

# Translate the POS tag codes into readable labels
print(analyzer.analyze("他的希望是希望上学").translateLabels())  

r, u, n, v, v, v
他/r 的/u 希望/n 是/v 希望/v 上学/v
他/代词 的/助词 希望/名词 是/动词 希望/动词 上学/动词


## 基于条件随机场的词性标注

In [3]:
# Model file path: passed to CRFPOSTagger.train() as its second argument and
# to the CRFPOSTagger constructor when loading.
POS_MODEL = os.path.join(PKU98, 'pos.bin')

# Java CRF part-of-speech tagger class, looked up by fully-qualified name.
CRFPOSTagger = JClass('com.hankcs.hanlp.model.crf.CRFPOSTagger')

In [4]:
def train_crf_pos(corpus):
    """Create a blank CRF part-of-speech tagger, train it, and return it.

    Args:
        corpus: Value forwarded to the tagger's ``train`` method as its first
            argument (the notebook passes a corpus file path).

    Returns:
        The ``CRFPOSTagger`` instance after ``train(corpus, POS_MODEL)``.
    """
    crf_pos_tagger = CRFPOSTagger(None)  # no model given: start from a blank tagger
    # POS_MODEL is presumably where the trained model gets written -- the
    # later cell constructs CRFPOSTagger(POS_MODEL) to load it.
    crf_pos_tagger.train(corpus, POS_MODEL)
    return crf_pos_tagger

# Training is time-consuming; once a model has been trained, this call can
# stay commented out.
#tagger = train_crf_pos(PKU199801_TRAIN)

In [5]:
# Construct the CRF tagger from the model file, tag a pre-segmented sentence,
# then combine the tagger with a CRF segmenter to analyze raw text.
crf_tagger = CRFPOSTagger(POS_MODEL) # load the model
print(', '.join(crf_tagger.tag("他", "的", "希望", "是", "希望", "上学")))  # predict
analyzer = AbstractLexicalAnalyzer(CRFSegmenter(), crf_tagger)  # build the lexical analyzer
print(analyzer.analyze("李狗蛋的希望是希望上学"))  # segmentation + POS tagging

r, u, n, v, v, v
李狗蛋/nr 的/u 希望/n 是/v 希望/v 上学/v


## 准确率测评

In [6]:
# Java utility class whose evaluate(tagger, corpus) result is printed below.
PosTagUtil = JClass('com.hankcs.hanlp.dependency.nnparser.util.PosTagUtil')

# Evaluate both taggers on the test file; the result is formatted with two
# decimals followed by a literal percent sign.
print("HMM\t%.2f%%: " % (PosTagUtil.evaluate(hmm_tagger, PKU199801_TEST)))
print("CRF\t%.2f%%: " % (PosTagUtil.evaluate(crf_tagger, PKU199801_TEST)))

HMM	44.99%: 
CRF	82.17%: 


## 结巴词性标注

思想：将词性标注看成序列标注问题，与分词同时进行

### 导入概率矩阵

每个状态都是分词标签和词性标签的组合，例如('E', 'n')

In [7]:
# All probability values are stored after taking the logarithm
from prob_start import P as start_p # initial state probabilities
from prob_trans import P as trans_p # state transition probabilities
from prob_emit import P as emit_p   # state emission probabilities
from char_state_tab import P as states # candidate states for each character

# Fallback score used by viterbi() when a state has no emission entry for
# the observed character.
MIN_FLOAT = -3.14e100
# Fallback score used by viterbi() when a transition between two states is
# not listed in trans_p.
MIN_INF = float("-inf")

### Viterbi算法 

由于隐状态数量巨大，实现时需要在相邻时刻之间转移时尽量缩小候选状态集合：先由前一时刻的状态推出它们可以转移到的状态，再与当前字允许的状态取交集，只在这些状态上进行计算。

In [8]:
def viterbi(obs, states, start_p, trans_p, emit_p):
    """Decode the best-scoring hidden-state path for ``obs``.

    Scores are combined by addition, so the tables are expected to hold
    log-probabilities. To keep the search small, the candidate states at each
    step are limited to the states reachable from the previous step's states,
    intersected with the states listed for the current observation.

    Args:
        obs: Sequence of observations (must support ``len`` and indexing).
        states: Mapping from an observation to an iterable of the hidden
            states allowed for it. Observations missing from the mapping may
            take any state that is a key of ``trans_p``.
        start_p: Mapping ``state -> initial score``.
        trans_p: Mapping ``state -> {next_state: transition score}``.
        emit_p: Mapping ``state -> {observation: emission score}``.

    Returns:
        A ``(score, route)`` tuple, where ``route`` holds one state per
        observation. An empty ``obs`` yields ``(0.0, [])``.
    """
    if not obs:
        # Nothing to decode; without this guard obs[0] raises IndexError.
        return (0.0, [])

    V = [{}]  # V[t][state]: best score of any path ending in `state` at step t
    mem_path = [{}]  # mem_path[t][state]: predecessor of `state` on that path
    all_states = trans_p.keys()
    for y in states.get(obs[0], all_states):  # candidate states of the first observation
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        mem_path[0][y] = ''
    for t in range(1, len(obs)):
        V.append({})
        mem_path.append({})
        # Previous-step states that have at least one outgoing transition.
        prev_states = [x for x in mem_path[t - 1] if len(trans_p[x]) > 0]
        if not prev_states:
            # No previous state can transition anywhere. Keep every previous
            # state as a predecessor candidate (scored with MIN_INF) so the
            # max() below is never taken over an empty sequence.
            prev_states = list(mem_path[t - 1])

        # Every state reachable from the previous step.
        prev_states_expect_next = {y for x in prev_states for y in trans_p[x]}

        # Intersect with the states allowed for the current observation to
        # filter the candidates further.
        obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next

        if not obs_states:
            obs_states = prev_states_expect_next if prev_states_expect_next else all_states

        for y in obs_states:
            emit_score = emit_p[y].get(obs[t], MIN_FLOAT)
            prob, state = max(
                (V[t - 1][y0] + trans_p[y0].get(y, MIN_INF) + emit_score, y0)
                for y0 in prev_states)
            V[t][y] = prob
            mem_path[t][y] = state

    prob, state = max((V[-1][y], y) for y in mem_path[-1])

    # Walk the back-pointers from the best final state to recover the path.
    route = [None] * len(obs)
    for i in range(len(obs) - 1, -1, -1):
        route[i] = state
        state = mem_path[i][state]
    return (prob, route)

### 词性标注

In [9]:
def __cut(sentence, char_state_tab_P, start_P, trans_P, emit_P):
    """Yield ``(word, tag)`` pairs for ``sentence`` from a Viterbi decoding.

    Each decoded state is indexed as ``state[0]`` (a position marker compared
    against ``'B'``, ``'E'`` and ``'S'``) and ``state[1]`` (the tag emitted
    with the word).

    Args:
        sentence: Text to segment and tag.
        char_state_tab_P: Per-character candidate-state table, forwarded to
            ``viterbi`` as ``states``.
        start_P: Forwarded to ``viterbi`` as ``start_p``.
        trans_P: Forwarded to ``viterbi`` as ``trans_p``.
        emit_P: Forwarded to ``viterbi`` as ``emit_p``.

    Yields:
        ``(word, tag)`` tuples in sentence order.
    """
    _, tags = viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
    word_start = 0
    consumed = 0

    for idx, ch in enumerate(sentence):
        marker = tags[idx][0]
        if marker == 'B':
            # A word starts here; remember where so 'E' can slice it out.
            word_start = idx
            continue
        if marker == 'E':
            token = sentence[word_start:idx + 1]
        elif marker == 'S':
            token = ch
        else:
            # Any other marker produces no output at this position.
            continue
        yield (token, tags[idx][1])
        consumed = idx + 1

    # Characters after the last emitted word form one trailing token, tagged
    # with the tag of its first character.
    if consumed < len(sentence):
        yield (sentence[consumed:], tags[consumed][1])

In [10]:
# Run the joint segmentation + tagging generator on a sample sentence and
# materialize its (word, tag) pairs as a list.
sentence = "扬帆远东做与中国合作的先行"
list(__cut(sentence,  states, start_p, trans_p, emit_p))

[('扬帆', 'nz'),
 ('远东', 'n'),
 ('做', 'v'),
 ('与', 'p'),
 ('中国', 'ns'),
 ('合作', 'vn'),
 ('的', 'uj'),
 ('先行', 'n')]