In [1]:
import pandas as pd
import re
import requests
import  matplotlib.pyplot  as plt
from collections import Counter
from collections import defaultdict


# Wikipedia n-gram language models with smoothing

## Processing Data

In [2]:
# Preparation step (run outside the notebook):
#   python WikiExtractor.py -b 2000M zhwiki-20181101-pages-articles.xml.bz2
# No Traditional-to-Simplified Chinese conversion was applied; the author could
# not get the Windows build of OpenCC installed :-(

In [3]:
# Location of the extracted plain-text dump. This is a machine-specific absolute
# path; adjust it to wherever the extractor output lives.
WIKI_DUMP_PATH = 'D://pyproject//git//AI-NLP//data//text//AA//wiki_00'

# Read the whole dump into memory. The context manager closes the file handle as
# soon as the read finishes instead of leaving it open until garbage collection.
with open(WIKI_DUMP_PATH, encoding='UTF-8') as wiki_file:
    all_wiki_content = wiki_file.read()

# Strip anything that looks like a tag, e.g. the <doc ...> / </doc> wrappers.
all_wiki_content = re.sub(r'<[^>]+>', '', all_wiki_content)

In [4]:
def tokenize(string):
    """Return ``string`` with every non-word character removed.

    All maximal runs of ``\\w`` characters (letters, digits, underscore and, for
    ``str`` patterns, Unicode word characters such as CJK ideographs) are
    concatenated in order; punctuation and whitespace are dropped.

    Note on the alternative pattern ``[\\w|\\d]+`` that was tried earlier: inside
    a character class ``|`` is a literal character, not alternation, so that
    variant additionally keeps ``|`` characters. ``\\d`` adds nothing because
    ``\\w`` already matches digits. That is why the two patterns can give
    different output.

    Args:
        string: text to clean.

    Returns:
        A single ``str`` made of the word characters of ``string``.
    """
    # Raw string: in a normal literal '\w' is an invalid escape sequence, which
    # newer Python versions warn about.
    return ''.join(re.findall(r'\w+', string))

In [5]:
# Collapse the corpus into one long string of word characters (tokenize drops
# everything that is not matched by \w).
all_character = tokenize(all_wiki_content)
del all_wiki_content # drop the reference to the raw text so its memory can be reclaimed
# Total number of characters kept.
len(all_character)

380434793

In [6]:
# Unigram statistics: how many times each character occurs in the corpus.
all_character_counts = Counter(all_character)
# most_common(20) returns the same 20 entries as most_common()[0:20] but does
# not need to sort the complete table first.
all_character_counts.most_common(20)

[('的', 9938192),
 ('1', 5740539),
 ('0', 4559519),
 ('年', 4088849),
 ('2', 3705103),
 ('一', 3174566),
 ('在', 3142422),
 ('是', 2800422),
 ('中', 2763222),
 ('9', 2730241),
 ('人', 2610319),
 ('大', 2095073),
 ('有', 2064509),
 ('e', 1885083),
 ('a', 1789303),
 ('3', 1753587),
 ('5', 1721315),
 ('和', 1705550),
 ('為', 1662714),
 ('8', 1646008)]

In [7]:
gram_length = 2
# Slide a window of gram_length characters over the corpus and count every
# window. The "+ 1" makes the range include the last window; without it the
# final n-gram of the corpus is never counted. Feeding Counter a generator keeps
# memory flat because no list of all n-grams is materialised.
two_gram_counts = Counter(
    all_character[start:start + gram_length]
    for start in range(len(all_character) - gram_length + 1)
)
# Show the 20 most frequent n-grams (no full sort needed).
two_gram_counts.most_common(20)

[('20', 1579014),
 ('19', 1442094),
 ('00', 1225241),
 ('01', 853922),
 ('10', 547006),
 ('年1', 527492),
 ('的一', 469028),
 ('12', 444080),
 ('11', 419457),
 ('0年', 417267),
 ('一个', 400248),
 ('18', 387729),
 ('人口', 349391),
 ('99', 340092),
 ('中国', 328509),
 ('1年', 322136),
 ('公里', 320126),
 ('5年', 318534),
 ('月1', 318147),
 ('er', 316517)]

## Unigram, Good-Turing smoothing

In [8]:
# Number of distinct characters observed in the corpus.
len(all_character_counts)

21491

In [9]:
def get_char_prob_from_counts(counts, k=5, vocab_size=90000):
    """Build a unigram probability function with Good-Turing smoothing.

    Let N[r] be the number of distinct characters seen exactly r times. Every
    count r in 1..k is replaced by the adjusted count
    ``r* = (r + 1) * N[r + 1] / N[r]``. The probability mass removed this way is
    shared equally among the ``vocab_size - len(counts)`` characters that are
    assumed to exist but were never observed. Counts larger than k are trusted
    and used unchanged.

    Degenerate inputs are handled instead of crashing or producing nonsense:

    * If N[r] or N[r + 1] is zero the Good-Turing estimate for r is undefined
      (or collapses to zero), so that count is left undiscounted.
    * If ``vocab_size`` leaves no room for unseen characters there is nowhere to
      move discounted mass to, so plain relative frequencies are used.

    Args:
        counts: mapping from character to its number of occurrences.
        k: largest count that gets discounted.
        vocab_size: assumed total number of distinct characters. The default of
            90000 is the notebook's assumption for the number of Chinese
            characters.

    Returns:
        A function ``get_prob(char) -> float`` giving the smoothed probability
        of a single character.

    Raises:
        ValueError: if ``counts`` contains no observations at all.
    """
    total = sum(counts.values())
    if total <= 0:
        raise ValueError('counts must contain at least one observation')

    # Count-of-counts table N[r]; Counter returns 0 for frequencies never seen.
    nr = Counter(counts.values())
    unseen_types = vocab_size - len(counts)

    # Start from "no discount" (r* == r, nothing reserved for unseen characters)
    # and only overwrite the entries that can be estimated.
    rstar = [float(r) for r in range(k + 1)]
    if unseen_types > 0:
        removed_mass = 0.0
        for r in range(1, k + 1):
            if nr[r] == 0 or nr[r + 1] == 0:
                continue
            rstar[r] = (r + 1) * nr[r + 1] / nr[r]
            removed_mass += r * nr[r] - rstar[r] * nr[r]
        # Spread the removed mass evenly over the unseen characters.
        rstar[0] = removed_mass / unseen_types

    def get_prob(char):
        occurrence = counts.get(char, 0)
        if occurrence <= k:
            return rstar[occurrence] / total
        return occurrence / total

    return get_prob

# Smoothed unigram model over the corpus character counts; counts up to k=5 are discounted.
get_char_prob = get_char_prob_from_counts(all_character_counts,k=5)

from functools import reduce
from operator import mul
def get_1_gram_string_prob(string):
    """Probability of ``string`` under the unigram model.

    Multiplies ``get_char_prob(char)`` over every character of ``string``.

    Args:
        string: the character sequence to score.

    Returns:
        The product of the per-character probabilities. An empty string yields
        1.0 (the empty product) instead of raising ``TypeError``.
    """
    # The explicit initial value makes reduce well-defined for empty input;
    # multiplying by 1.0 first does not change the result for non-empty input.
    return reduce(mul, (get_char_prob(char) for char in string), 1.0)


In [10]:
# Evaluation data: every raw entry holds two variants of a sentence, one per
# line. Each entry is split into a two-element list so the models can be
# compared on the two variants side by side.
raw_sentence_pairs = (
    """前天晚上吃晚饭的时候
前天晚上吃早饭的时候""",
    """正是一个好看的小猫
真是一个好看的小猫""",
    """我无言以对，简直
我简直无言以对""",
)

pair, pair2, pair3 = (raw.split('\n') for raw in raw_sentence_pairs)
pairs = [pair, pair2, pair3]
def get_probability_prefromance(language_model_func, pairs):
    """Print the model probability of both sentences of every pair.

    Args:
        language_model_func: callable mapping a cleaned string to a probability.
        pairs: iterable of two-element sequences of sentences.

    For each pair a separator line of 18 asterisks is printed, followed by one
    line per sentence showing the original sentence and its probability.
    """
    separator = '*'*18
    line_template = '\t\t {} with probability {}'
    for first, second in pairs:
        print(separator)
        for sentence in (first, second):
            # tokenize removes punctuation such as ',' before scoring
            cleaned = tokenize(sentence)
            print(line_template.format(sentence, language_model_func(cleaned)))

In [11]:
# Score every sentence pair with the unigram model.
get_probability_prefromance(get_1_gram_string_prob, pairs)

******************
		 前天晚上吃晚饭的时候 with probability 2.3223066267509665e-33
		 前天晚上吃早饭的时候 with probability 4.678562566970852e-33
******************
		 正是一个好看的小猫 with probability 1.1087006396816684e-26
		 真是一个好看的小猫 with probability 3.4663369707956e-27
******************
		 我无言以对，简直 with probability 1.747335364002409e-23
		 我简直无言以对 with probability 1.747335364002409e-23


## 2-gram, Katz back-off smoothing

In [12]:
# Nested lookup built from the bigram counts:
#   first character -> {second character -> bigram count}
two_gram_table = {}
for bigram, bigram_count in two_gram_counts.items():
    head, tail = bigram[0], bigram[1]
    two_gram_table.setdefault(head, {})[tail] = bigram_count

In [17]:
def get_2_gram_prob_from_counts(counts, k=5):
    """Build a bigram probability function with Katz back-off smoothing.

    Bigram counts r in 1..k are discounted by the Katz ratio

        d_r = (r*/r - t) / (1 - t),  r* = (r + 1) * N[r + 1] / N[r],
        t = (k + 1) * N[k + 1] / N[1]

    where N[r] is the number of distinct bigrams seen exactly r times. Counts
    above k are not discounted. For every context character the probability
    mass freed by discounting is handed to unseen continuations in proportion
    to their unigram probability; the per-context scaling factor is the
    back-off weight.

    The unigram probabilities come from the module-level ``get_char_prob``
    function, which must already be defined when this function is called.

    Robustness notes:

    * If N[1] is zero, t equals 1, or N[r] / N[r + 1] is zero, the affected
      discount ratios stay at 1 (no discount) rather than dividing by zero or
      going negative.
    * The back-off weight is clamped at 0 so floating-point rounding cannot
      produce a negative probability.
    * A context that never starts any observed bigram backs off with weight 1,
      i.e. the plain unigram probability is returned instead of a ``KeyError``.

    Args:
        counts: mapping from two-character string to its number of occurrences.
        k: largest count that gets discounted.

    Returns:
        A function ``get_prob(word, prev) -> float`` giving P(word | prev).
        ``prev == '<s>'`` marks the start of a string and yields the unigram
        probability of ``word``.
    """
    # Count-of-counts table N[r]; Counter returns 0 for frequencies never seen.
    nr = Counter(counts.values())

    # Discount ratios d_r for 1 <= r <= k; index 0 is unused.
    dr = [1.0] * (k + 1)
    if nr[1] > 0:
        tmp = (k + 1) * nr[k + 1] / nr[1]
        if tmp != 1:
            for r in range(1, k + 1):
                if nr[r] == 0 or nr[r + 1] == 0:
                    continue
                rstar = (r + 1) * nr[r + 1] / nr[r]
                dr[r] = (rstar / r - tmp) / (1 - tmp)

    # Group counts by context: first character -> {second character -> count}.
    cond_table = {}
    for bigram, count in counts.items():
        cond_table.setdefault(bigram[0], {})[bigram[1]] = count

    # Turn the counts into conditional probabilities (in place, to avoid a
    # second copy of the table) and compute the back-off weight per context.
    backoff_weight = {}
    for context, followers in cond_table.items():
        context_total = sum(followers.values())
        for follower, count in followers.items():
            ratio = dr[count] if count <= k else 1.0
            followers[follower] = ratio * count / context_total
        seen_bigram_mass = sum(followers.values())
        seen_unigram_mass = sum(get_char_prob(ch) for ch in followers)
        leftover_mass = max(0.0, 1.0 - seen_bigram_mass)
        unseen_unigram_mass = 1.0 - seen_unigram_mass
        if unseen_unigram_mass > 0:
            backoff_weight[context] = leftover_mass / unseen_unigram_mass
        else:
            backoff_weight[context] = 0.0

    def get_prob(word, prev):
        if counts.get(prev + word, 0) > 0:
            return cond_table[prev][word]
        if prev == '<s>':
            return get_char_prob(word)
        # Unknown context: nothing was observed after it, so all mass backs off.
        return backoff_weight.get(prev, 1.0) * get_char_prob(word)

    return get_prob

# Smoothed bigram model over the corpus bigram counts; counts up to k=5 are discounted.
get_2_gram_prob = get_2_gram_prob_from_counts(two_gram_counts,k=5)

def get_2_gram_string_prob(string):
    """Probability of ``string`` under the bigram model.

    Multiplies ``get_2_gram_prob(char, prev)`` over all characters, where
    ``prev`` is the preceding character, or the marker ``'<s>'`` for the first
    character.

    Args:
        string: the character sequence to score.

    Returns:
        The product of the conditional probabilities. An empty string yields
        1.0 (the empty product) instead of raising ``TypeError``.
    """
    probability = 1.0
    prev = '<s>'
    for char in string:
        probability *= get_2_gram_prob(char, prev)
        prev = char
    return probability


In [18]:
# Score every sentence pair with the bigram model.
get_probability_prefromance(get_2_gram_string_prob, pairs)

******************
		 前天晚上吃晚饭的时候 with probability 2.8863673714341063e-24
		 前天晚上吃早饭的时候 with probability 6.446398053347142e-25
******************
		 正是一个好看的小猫 with probability 6.4603668266917246e-21
		 真是一个好看的小猫 with probability 9.252304558785208e-22
******************
		 我无言以对，简直 with probability 9.167158643679254e-21
		 我简直无言以对 with probability 7.569514628385958e-22
