## 导入库并配置参数

In [1]:
import jieba
import logging
from tqdm.notebook import tqdm_notebook as tqdm
import os
from math import log
import re
import time

In [2]:
# Paths and output format shared by the cells below.
word_dict_path = "./jieba_dict.txt" # jieba dictionary file
SogouW = "./SogouW/Freq/SogouLabDic.dic" # Sogou dictionary file
sighan05 = "./第二届国际中文分词评测/icwb2-data/" # corpus root directory
msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8') # test set
msr_output = os.path.join(sighan05, 'testing', 'msr_output.txt') # file that receives the segmentation output
msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8') # gold-standard segmentation of the test set
# NOTE(review): the name is spelled OUTFRMAE (sic); it is referenced under this spelling elsewhere in the file.
OUTFRMAE = "P:\t%.2f\nR:\t%.2f\nF1:\t%.2f\nOOV-R:\t%.2f\nIV-R:\t%.2f" # output format for the evaluation metrics

In [3]:
# Configure the root logger.
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # INFO level: logging.debug(...) messages are filtered out; use logging.DEBUG to see them

## 读取数据

In [8]:
def load_dictionary(dict_file):
    """Load a space-separated dictionary file into a ``{word: frequency}`` dict.

    Each non-blank line must hold at least two space-separated fields; the
    first is used as the word and the second as its frequency. Any further
    fields are ignored.

    Args:
        dict_file: Path of a UTF-8 encoded dictionary file.

    Returns:
        dict mapping each word to its frequency field, kept as a string
        exactly as it appears in the file.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    word_dict = {}
    # The context manager closes the file even if a malformed line raises.
    with open(dict_file, encoding="utf-8") as fr:
        for item in fr:
            fields = item.strip().split(" ")
            if fields == [""]:
                # Blank line (e.g. a trailing newline at end of file): skip it.
                continue
            word_dict[fields[0]] = fields[1]
    return word_dict

word_dict = load_dictionary(word_dict_path) # load the dictionary found at word_dict_path

## 作业
在MSR语料库上评测一元语法分词器的性能

### 基于课上代码整理为类

In [4]:
class wordCut():
    """Unigram word segmenter that picks the maximum-probability path.

    A prefix dictionary (``FREQ``) maps every dictionary word to its frequency
    and every prefix of a dictionary word that is not itself a word to 0.
    ``get_DAG`` uses it to enumerate candidate words, ``calc`` scores the
    candidates from right to left, and ``cut`` walks the best route.
    """

    def __init__(self, dict_path="jieba_dict.txt"):
        """Load the dictionary and compile the alphanumeric matcher.

        Args:
            dict_path: Path of a UTF-8 dictionary file with one
                ``word freq ...`` entry per line. Defaults to the file name
                that used to be hard-coded here.
        """
        # gen_pfdict closes the handle, including when parsing fails.
        self.f = open(dict_path, encoding="utf-8")
        self.FREQ, self.total = self.gen_pfdict(self.f)
        self.re_eng = re.compile('[a-zA-Z0-9]', re.U)
        # Kept from the original: constructing a segmenter resets the root
        # logger to INFO, which silences the debug traces below.
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)

    def gen_pfdict(self, f):
        """Build the prefix dictionary from an open dictionary file.

        Args:
            f: Iterable of text lines with a ``close()`` method (normally an
                open file). It is always closed before this method returns
                or raises.

        Returns:
            ``(lfreq, ltotal)``: the prefix dictionary and the sum of all
            word frequencies.

        Raises:
            ValueError: If a line does not hold a word followed by an integer
                frequency.
        """
        lfreq = {}
        ltotal = 0
        try:
            for lineno, line in enumerate(f, 1):
                line = line.strip()
                try:
                    word, freq = line.split(' ')[:2]
                    freq = int(freq)
                except ValueError as err:
                    # Not every line source has a .name (e.g. io.StringIO).
                    raise ValueError(
                        'invalid dictionary entry in %s at Line %s: %s'
                        % (getattr(f, 'name', f), lineno, line)) from err
                lfreq[word] = freq
                ltotal += freq
                for ch in range(len(word)):
                    wfrag = word[:ch + 1]
                    if wfrag not in lfreq:
                        # Unseen prefix: placeholder frequency 0. A later
                        # real entry for it overwrites the placeholder.
                        lfreq[wfrag] = 0
        finally:
            f.close()
        return lfreq, ltotal

    def get_DAG(self, sentence):
        """Return ``{start: [end, ...]}`` listing the candidate words.

        For each start index the list holds every inclusive end index such
        that ``sentence[start:end + 1]`` has a non-zero frequency. A start
        with no such word falls back to the single character ``[start]``.
        """
        DAG = {}
        N = len(sentence)
        for k in range(N):
            logging.debug("k = %s", k)
            tmplist = []
            i = k
            frag = sentence[k]
            logging.debug("for 循环 frag: %s", frag)
            # Keep extending while the fragment is still a known prefix.
            while i < N and frag in self.FREQ:
                logging.debug("\t 进入while...")
                logging.debug("\t i = %s", i)
                logging.debug("\t FREQ[%s]: %s", frag, self.FREQ[frag])
                if self.FREQ[frag]:
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i + 1]
                logging.debug("\t while 循环 frag: %s", frag)
                logging.debug("\t tmplist: %s", tmplist)
            if not tmplist:
                tmplist.append(k)
                logging.debug("\t if 语句 tmplist: %s", tmplist)
            DAG[k] = tmplist
        return DAG

    def calc(self, sentence, DAG, route):
        """Fill ``route`` in place with the best ``(score, end)`` per start.

        Scores are summed log probabilities, computed from the end of the
        sentence backwards. ``route[len(sentence)]`` is the ``(0, 0)``
        sentinel. Nothing is returned.
        """
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total)
        for idx in range(N - 1, -1, -1):
            logging.debug("\t idx: %s", idx)
            logging.debug("\t DAG[%s]: %s", idx, DAG[idx])
            tmp = []
            for x in DAG[idx]:
                logging.debug("\t\t x: %s", x)
                logging.debug("\t\t sentence[idx:x + 1]: %s", sentence[idx:x + 1])
                # Log probability of the candidate; a missing or zero
                # frequency is treated as 1.
                prob = log(self.FREQ.get(sentence[idx:x + 1]) or 1) - logtotal
                logging.debug("\t\t porb: %s", prob)
                logging.debug("\t\t route[%s][0]: %s", x + 1, route[x + 1][0])
                # NOTE(review): rounding to 2 decimals can create ties, which
                # max() below then breaks in favour of the larger end index.
                value = round(prob + route[x + 1][0], 2)
                logging.debug("\t\t value: %s", value)
                tmp.append((value, x))
                logging.debug("\t\t tmp: %s", tmp)
            route[idx] = max(tmp)
            logging.debug("\t route[%s]: %s", idx, route[idx])

    def cut(self, sentence):
        """Yield the words of ``sentence`` along the best route.

        Consecutive single-character ASCII letters or digits are merged into
        one token.
        """
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            logging.debug("\tx: %s", x)
            logging.debug("\troute[%s][1]: %s", x, route[x][1])
            logging.debug("\ty: %s", y)
            logging.debug("\tl_word: %s", l_word)
            x = y
            # Buffer runs of single ASCII letters/digits so they come out as
            # one token.
            if self.re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                continue
            if buf:
                yield buf
                buf = ''
            yield l_word
        # The sentence ended inside an alphanumeric run.
        if buf:
            yield buf

### 分词

In [10]:
# Segment a sample sentence and join the tokens with two spaces.
wordcut = wordCut()
sentence = '希腊的经济结构较特殊。'
"  ".join(wordcut.cut(sentence))

'希腊  的  经济  结构  较  特殊  。'

In [11]:
# Build the DAG for the sample sentence and let calc() fill `route` in place;
# the bare `route` expression on the last line is the cell's displayed value.
route = {}
DAG = wordcut.get_DAG(sentence)
wordcut.calc(sentence, DAG, route)
route

{11: (0, 0),
 10: (-17.91, 10),
 9: (-29.48, 9),
 8: (-26.97, 9),
 7: (-34.56, 7),
 6: (-45.83, 6),
 5: (-42.91, 6),
 4: (-53.2, 4),
 3: (-50.03, 4),
 2: (-55.27, 2),
 1: (-66.73, 1),
 0: (-65.33, 1)}

In [5]:
# Segment every line of the MSR test set with wordCut, write the tokens
# (joined by two spaces) to msr_output, and report the elapsed wall time.
wordcut = wordCut()

with open(msr_test,encoding="utf-8") as test, open(msr_output, 'w', encoding="utf-8") as output:
    start = time.time()
    for line in tqdm(test):
        sentence = line.strip()
        output.write("  ".join(wordcut.cut(sentence)))
        output.write("\n")
    runtime = time.time()-start  # measured inside the with-block, so closing the files is not timed
print('runtime:',runtime)

0it [00:00, ?it/s]

runtime: 6.150891304016113


### 性能测试

In [6]:
def to_region(segmentation: str) -> list:
    """Convert a whitespace-separated segmentation into character spans.

    Offsets index into the text with all whitespace removed; each word
    becomes a half-open ``(start, end)`` pair.

    Args:
        segmentation: Words separated by runs of whitespace.

    Returns:
        List of ``(start, end)`` tuples, one per word, in order. An empty or
        whitespace-only input yields ``[]`` rather than a phantom ``(0, 0)``
        span.
    """
    region = []
    start = 0
    # str.split() with no argument splits on whitespace runs and returns []
    # for blank input, so blank lines contribute no words.
    for word in segmentation.split():
        end = start + len(word)
        region.append((start, end))
        start = end
    return region

def prf(gold: str, pred: str, dic) -> tuple:
    """Score a predicted segmentation file against the gold file.

    The two files are read in lockstep, line by line; if they differ in
    length the extra lines of the longer file are ignored. Words are compared
    as character spans produced by ``to_region``.

    Args:
        gold: Path of the UTF-8 gold-standard segmentation file.
        pred: Path of the UTF-8 predicted segmentation file.
        dic: Container supporting ``in``; gold words found in it count as
            in-vocabulary (IV), all others as out-of-vocabulary (OOV).

    Returns:
        ``(P, R, F1, OOV recall, IV recall)``, each as a percentage. A
        metric whose denominator is zero is reported as ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    def _pct(numerator, denominator):
        # Percentage that degrades to 0.0 when there is nothing to divide by.
        return numerator / denominator * 100 if denominator else 0.0

    A_size = B_size = A_cap_B_size = 0
    OOV = IV = OOV_R = IV_R = 0
    with open(gold, encoding="utf-8") as gd, open(pred, encoding="utf-8") as pd:
        for g, p in zip(gd, pd):  # g: gold line, p: predicted line
            A, B = set(to_region(g)), set(to_region(p))
            hits = A & B  # spans the prediction got exactly right
            A_size += len(A)
            B_size += len(B)
            A_cap_B_size += len(hits)
            text = re.sub("\\s+", "", g)  # unsegmented text the spans index into
            for (start, end) in A:
                if text[start:end] in dic:
                    IV += 1
                else:
                    OOV += 1

            for (start, end) in hits:
                if text[start:end] in dic:
                    IV_R += 1
                else:
                    OOV_R += 1
    precision = _pct(A_cap_B_size, B_size)
    recall = _pct(A_cap_B_size, A_size)
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1, _pct(OOV_R, OOV), _pct(IV_R, IV)

In [7]:
def load_dictionary(dict_file):
    """Load the words of a tab-separated dictionary file into a set.

    Only the first tab-separated field of each stripped line is kept.

    Args:
        dict_file: Path of a UTF-8 encoded dictionary file.

    Returns:
        set of the first fields of all lines.
    """
    # The context manager closes the file; the original left the handle open.
    with open(dict_file, encoding="utf-8") as fr:
        return {item.strip().split("\t")[0] for item in fr}

# Rebind word_dict to the word set read from the Sogou dictionary file.
word_dict = load_dictionary(SogouW)

In [8]:
# Evaluate msr_output against msr_gold and print P / R / F1 / OOV-R / IV-R.
print(OUTFRMAE % prf(msr_gold, msr_output, word_dict))

P:	81.88
R:	83.24
F1:	82.56
OOV-R:	82.59
IV-R:	83.87


In [10]:
# Open the prediction (o) and gold (t) files for manual inspection.
# NOTE(review): these handles are never closed in the visible code.
o = open(msr_output, encoding='utf-8')
t = open(msr_gold, encoding='utf-8')

In [13]:
# Print the next line of each file: prediction first, then gold.
print(o.readline())
print(t.readline())

海运业  雄踞  全球  之  首  ，  按  吨位  计  占  世界  总数  的  １  ７  ％  。

海运  业  雄踞  全球  之  首  ，  按  吨位  计  占  世界  总数  的  １７％  。



## 一元语法模型
感觉不是很复杂，就自己尝试实现了一个简单的版本，帮助自己理解。不过写的时候基本还是参照代码写的，离完全手写还有很长一段距离。

### 构建前缀词典 prefix_dict

In [12]:
def sample_pfdict(word_dict_path, encoding='utf-8'):
    """Build a prefix dictionary from a ``word freq ...`` dictionary file.

    Every proper prefix of a word is registered with frequency 0 unless it
    is already present; the word itself is stored with its real frequency.
    Note that the prefixes come from every dictionary word, not from the
    sentence being segmented.

    Args:
        word_dict_path: Path of the dictionary file, one space-separated
            ``word freq ...`` entry per line.
        encoding: Text encoding of the file.

    Returns:
        dict with the keys ``'prefix_dict'`` (word or prefix -> frequency),
        ``'freq_total'`` (sum of all word frequencies) and
        ``'original_dict'`` (word -> frequency, without prefixes). Returning
        a dict lets further outputs be added later under new keys.

    Raises:
        ValueError: If a line does not hold a word followed by an integer
            frequency.
    """
    prefix_dict = {}  # words plus zero-frequency prefixes
    original_dict = {}  # words only
    freq_total = 0
    # The context manager closes the file even when a malformed line raises.
    with open(word_dict_path, encoding=encoding) as word_dict_file:
        for line in tqdm(word_dict_file):
            word, freq = line.strip().split(' ')[:2]
            freq = int(freq)
            original_dict[word] = freq
            # Register proper prefixes only. The original looped over
            # len(line), which re-visited the whole word on every extra step.
            for prefix_end in range(1, len(word)):
                prefix_dict.setdefault(word[:prefix_end], 0)
            freq_total += freq
            prefix_dict[word] = freq

    return {
        'prefix_dict': prefix_dict,
        'freq_total': freq_total,
        'original_dict': original_dict,
    }

In [44]:
# 读取前缀词典
# Build the prefix dictionary and unpack the three results into globals.
re_dict = sample_pfdict(word_dict_path)
prefix_dict = re_dict['prefix_dict']
freq_total = re_dict['freq_total']
original_dict = re_dict['original_dict']

0it [00:00, ?it/s]

### 有向无环图 DAG

In [84]:
def sample_DAG(sentence, prefix_dict):
    """Return ``{start: [end, ...]}`` listing candidate words in ``sentence``.

    For each start index, the list holds every inclusive end index for which
    ``sentence[start:end + 1]`` has a truthy entry in ``prefix_dict``. The
    scan from a start stops at the first fragment that is not a key of
    ``prefix_dict``. A start with no candidate maps to ``[start]``.
    """
    length = len(sentence)
    graph = {}
    for begin in range(length):
        ends = []
        for stop in range(begin, length):
            piece = sentence[begin:stop + 1]
            if piece not in prefix_dict:
                # Not even a prefix of a known word: no longer match exists.
                break
            if prefix_dict[piece]:
                ends.append(stop)
        # Fall back to the single character when nothing matched.
        graph[begin] = ends if ends else [begin]
    return graph

In [85]:
# Show the DAG of a sample sentence.
sample_DAG("希腊的经济结构较特殊。",prefix_dict)

{0: [0, 1],
 1: [1],
 2: [2],
 3: [3, 4],
 4: [4],
 5: [5, 6],
 6: [6],
 7: [7],
 8: [8, 9],
 9: [9],
 10: [10]}

### 最大概率路径 maxProbRoute

In [23]:
def sample_maxProbRoute(sentence, DAG, prefix_dict, freq_total):
    """Compute the best ``(log-probability, end index)`` for every start.

    Works backwards from the end of ``sentence``. The score of a candidate
    word is ``log(freq or 1) - log(freq_total)`` plus the best score of the
    position right after it. Among equal scores the candidate listed first
    in ``DAG[start]`` wins.

    Returns:
        dict mapping each start index to ``(score, end)``, plus the sentinel
        ``route[len(sentence)] == (0, 0)``.
    """
    length = len(sentence)
    route = {length: (0, 0)}  # sentinel so route[end + 1] always exists
    log_total = log(freq_total)
    for begin in reversed(range(length)):
        scored = (
            (
                # Sum of logs stands in for the product of probabilities.
                route[stop + 1][0]
                + (log(prefix_dict.get(sentence[begin:stop + 1]) or 1) - log_total),
                stop,
            )
            for stop in DAG[begin]
        )
        # max() keeps the first of several equal scores; the default is used
        # when DAG[begin] is empty.
        route[begin] = max(
            scored,
            key=lambda pair: pair[0],
            default=(float('-inf'), length - 1),
        )
    return route

In [24]:
# Build the DAG and the best route for a sample sentence.
sentence = "去北京大学玩"
# sample_DAG takes `prefix_dict`; the previous `original_dict=` keyword
# raised TypeError because the function has no such parameter.
test_DAG = sample_DAG(sentence=sentence, prefix_dict=prefix_dict)
maxProbRoute = sample_maxProbRoute(sentence=sentence, DAG=test_DAG, freq_total=freq_total, prefix_dict=prefix_dict)

### 进行分词 cut

In [25]:
def sample_cut(sentence, maxProbRoute):
    """Yield the tokens of ``sentence`` by following ``maxProbRoute``.

    ``maxProbRoute[i][1]`` is the inclusive end index of the word starting
    at ``i``. Consecutive single-character ASCII letters or digits are
    merged into one token.
    """
    alnum = re.compile('[a-zA-Z0-9]', re.U)
    pending = []  # single alphanumeric characters waiting to be merged
    pos = 0
    length = len(sentence)
    while pos < length:
        nxt = maxProbRoute[pos][1] + 1
        token = sentence[pos:nxt]
        pos = nxt
        if len(token) == 1 and alnum.match(token):
            pending.append(token)
            continue
        if pending:
            # A non-alphanumeric token ends the current run; flush it first.
            yield ''.join(pending)
            pending = []
        yield token
    # The sentence ended inside an alphanumeric run.
    if pending:
        yield ''.join(pending)

In [89]:
# End-to-end check of the three functions on a sample sentence:
# DAG -> best route -> token list.
sentence = "希腊的经济结构较特殊。"
test_DAG = sample_DAG(sentence=sentence, prefix_dict=prefix_dict)
maxProbRoute = sample_maxProbRoute(sentence=sentence, DAG=test_DAG, freq_total=freq_total, prefix_dict=prefix_dict)
list(sample_cut(sentence,maxProbRoute))

['希腊', '的', '经济', '结构', '较', '特殊', '。']

### 构建一元语法分词类

In [91]:
class sample_wordCut():
    """Unigram segmenter that wraps the ``sample_*`` functions.

    The dictionary is loaded once in the constructor; ``cut`` then runs
    DAG construction, best-route search and token extraction for a sentence.
    """

    def __init__(self, word_dict_path):
        """Load the dictionary at ``word_dict_path`` and keep its tables."""
        self.word_dict_path = word_dict_path
        self.re_dict = self._gen_pfdict(word_dict_path)
        self.prefix_dict = self.re_dict['prefix_dict']
        self.freq_total = self.re_dict['freq_total']
        self.original_dict = self.re_dict['original_dict']

    def _gen_pfdict(self, word_dict_path, encoding='utf-8'):
        """Return the result dict of ``sample_pfdict`` for the given file."""
        return sample_pfdict(word_dict_path, encoding)

    def _get_DAG(self, sentence, original_dict=None):
        """Return the DAG of ``sentence`` built from ``self.prefix_dict``.

        ``original_dict`` is accepted only so existing callers keep working;
        it was never used and is now optional.
        """
        return sample_DAG(sentence, self.prefix_dict)

    def _get_maxProbRoute(self, sentence, DAG, prefix_dict=None, freq_total=None):
        """Return the best route through ``DAG`` for ``sentence``.

        ``prefix_dict`` and ``freq_total`` default to the tables loaded by
        the constructor. Explicit values are now honoured; previously they
        were required and then silently ignored.
        """
        if prefix_dict is None:
            prefix_dict = self.prefix_dict
        if freq_total is None:
            freq_total = self.freq_total
        return sample_maxProbRoute(sentence, DAG, prefix_dict, freq_total)

    def cut(self, sentence):
        """Segment ``sentence`` and return its tokens as a list."""
        DAG = self._get_DAG(sentence)
        maxProbRoute = self._get_maxProbRoute(sentence, DAG)
        return list(sample_cut(sentence, maxProbRoute))

In [92]:
# Check that the class builds the same DAG as the standalone function.
wordcut = sample_wordCut(word_dict_path)
wordcut._get_DAG( "希腊的经济结构较特殊。", original_dict)

0it [00:00, ?it/s]

{0: [0, 1],
 1: [1],
 2: [2],
 3: [3, 4],
 4: [4],
 5: [5, 6],
 6: [6],
 7: [7],
 8: [8, 9],
 9: [9],
 10: [10]}

### 分词

In [93]:
# Segment every line of the MSR test set with sample_wordCut, overwrite
# msr_output with the tokens (joined by two spaces), and report the elapsed
# wall time.
wordcut = sample_wordCut(word_dict_path)

with open(msr_test,encoding="utf-8") as test, open(msr_output, 'w', encoding="utf-8") as output:
    start = time.time()
    for line in tqdm(test):
        sentence = line.strip()
        output.write("  ".join(wordcut.cut(sentence)))
        output.write("\n")
    runtime = time.time()-start  # measured inside the with-block, so closing the files is not timed
print('runtime:',runtime)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

runtime: 0.5596897602081299


### 性能测试

In [94]:
# Re-run the evaluation on the output just written by sample_wordCut.
print(OUTFRMAE % prf(msr_gold, msr_output, word_dict))

P:	81.89
R:	83.24
F1:	82.56
OOV-R:	81.16
IV-R:	83.76
