## 一元语法模型

### 前缀词典构建

In [1]:
def gen_pfdict(f):
    lfreq = {}
    ltotal = 0
    for lineno, line in enumerate(f, 1):
        try:
            line = line.strip()
            word, freq = line.split(' ')[:2]
            freq = int(freq)
            lfreq[word] = freq
            ltotal += freq
            for ch in range(len(word)):
                wfrag = word[:ch + 1]
                if wfrag not in lfreq:
                    lfreq[wfrag] = 0 # 不在词典里的前缀，词频设为零，前提是词典已经进行排序
        except ValueError:
            raise ValueError(
                'invalid dictionary entry in %s at Line %s: %s' % (f.name, lineno, line))
    f.close()
    return lfreq, ltotal

### 有向无环图构建

In [2]:
import logging
logging.basicConfig(level=logging.DEBUG,format='%(asctime)s - [line:%(lineno)d] - %(levelname)s: %(message)s')

f = open("jieba_dict.txt",encoding="utf-8")
FREQ, total = gen_pfdict(f)

In [4]:
print(FREQ['玩'])
print(total)
log(4207)-log(60101967)

4207
60101967


-9.567048044164698

In [3]:
def get_DAG(sentence):
    logging.info("生成有向无环图...")
    DAG = {}
    N = len(sentence)
    for k in range(N):
        logging.debug("k = %s"%k)
        tmplist = []
        i = k
        frag = sentence[k]
        logging.debug("for 循环 frag: %s"%frag)
        while i < N and frag in FREQ:
            logging.debug("\t 进入while...")
            logging.debug("\t i = %s"%i)
            logging.debug("\t FREQ[%s]: %s"%(frag,FREQ[frag]))
            if FREQ[frag]:
                tmplist.append(i)
            i += 1
            frag = sentence[k:i + 1]
            logging.debug("\t while 循环 frag: %s"%frag)
            logging.debug("\t tmplist: %s"%tmplist)
        if not tmplist:
            tmplist.append(k)
            logging.debug("\t if 语句 tmplist: %s"%tmplist)
        DAG[k] = tmplist
    return DAG

In [8]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
get_DAG("去北京大学玩")

2021-03-28 00:24:29,613 - [line:2] - INFO: 生成有向无环图...


{0: [0], 1: [1, 2, 4], 2: [2], 3: [3, 4], 4: [4], 5: [5]}

### 最大概率路径计算

In [4]:
from math import log
logger = logging.getLogger()
logger.setLevel(logging.INFO) 

route = {}
sentence = "去北京大学玩"
DAG = get_DAG(sentence)
print(DAG)
def calc(sentence, DAG, route):
    logging.info("计算路径概率...")
    N = len(sentence)
    route[N] = (0, 0)
    logtotal = log(total)
    for idx in range(N-1, -1, -1):
        logging.debug("\t idx: %s"%idx)
        logging.debug("\t DAG[%s]: %s"% (idx,DAG[idx]))
        tmp = []
        for x in DAG[idx]:
            logging.debug("\t\t x: %s"%x)
            logging.debug("\t\t sentence[idx:x + 1]: %s"%sentence[idx:x + 1])
            # 计算概率值
            prob = log(FREQ.get(sentence[idx:x + 1]) or 1) - logtotal
            logging.debug("\t\t porb: %s"%prob)
            logging.debug("\t\t route[%s][0]: %s"%(x+1, route[x+1][0]))
            value = round(prob + route[x + 1][0],2)
            logging.debug("\t\t value: %s"%value)
            tmp.append( (value, x) )
            logging.debug("\t\t tmp: %s"%str(tmp))
        route[idx] = max(tmp)
        logging.debug("\t route[%s]: %s"%(idx,str(route[idx])))

2021-05-13 19:57:05,965 - [line:2] - INFO: 生成有向无环图...


{0: [0], 1: [1, 2, 4], 2: [2], 3: [3, 4], 4: [4], 5: [5]}


In [21]:
DAG

{0: [0, 1],
 1: [1],
 2: [2],
 3: [3, 4],
 4: [4],
 5: [5, 6],
 6: [6],
 7: [7],
 8: [8, 9],
 9: [9],
 10: [10]}

In [22]:
sentence = '希腊的经济结构较特殊'
DAG = get_DAG(sentence)
calc(sentence,DAG,route)
sorted(route.items() ,key=lambda x:x[0], reverse=True)

2021-03-28 00:30:54,274 - [line:2] - INFO: 生成有向无环图...
2021-03-28 00:30:54,275 - [line:10] - INFO: 计算路径概率...


[(11, (0, 0)),
 (10, (0, 0)),
 (9, (-11.57, 9)),
 (8, (-9.06, 9)),
 (7, (-16.65, 7)),
 (6, (-27.92, 6)),
 (5, (-25.0, 6)),
 (4, (-35.29, 4)),
 (3, (-32.12, 4)),
 (2, (-37.36, 2)),
 (1, (-48.82, 1)),
 (0, (-47.42, 1))]

### 获取分词结果

#### 精确模式

试图将句子最精确地切开，适合文本分析

In [8]:
import re
re_eng = re.compile('[a-zA-Z0-9]', re.U)

def __cut_DAG_NO_HMM(sentence):
    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, route)
    x = 0
    N = len(sentence)
    buf = ''
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        logging.debug("\tx: %s"%x)
        logging.debug("\troute[%s][1]: %s"%(x,route[x][1]))
        logging.debug("\ty: %s"%y)
        logging.debug("\tl_word: %s"%l_word)
        # 如果是连续的英文字母或数字进行合并
        if re_eng.match(l_word) and len(l_word) == 1:
            buf += l_word
            x = y
        else:
            if buf:
                yield buf
                buf = ''
            yield l_word
            x = y
    # 如果句子以连续英文或数字结尾
    if buf:
        yield buf
        buf = ''
            
list(__cut_DAG_NO_HMM(sentence))

2021-05-13 19:57:40,107 - [line:2] - INFO: 生成有向无环图...
2021-05-13 19:57:40,109 - [line:10] - INFO: 计算路径概率...


['去', '北京大学', '玩']

In [11]:
# wordcut = wordCut()
sentence = '希腊的经济结构较特殊。'
"  ".join(__cut_DAG_NO_HMM(sentence))

2021-05-13 19:58:29,679 - [line:2] - INFO: 生成有向无环图...
2021-05-13 19:58:29,680 - [line:10] - INFO: 计算路径概率...


'希腊  的  经济  结构  较  特殊  。'