In [1]:
# import python package
import re
from collections import Counter
import jieba

In [2]:
# 只保留中文字
def prepocess_doc(line):
    """Extract the Chinese parts of a line of text.

    Args:
        line: Any string (one line of the corpus).

    Returns:
        A list with every maximal run of characters in the range
        U+4E00..U+9FFF, in order of appearance. Everything else (Latin
        letters, digits, punctuation, whitespace) acts as a separator and is
        dropped. The list is empty when the line contains no such character.
    """
    return [hit.group() for hit in re.finditer(r'[\u4E00-\u9FFF]+', line)]

In [3]:
# Example: quotes and punctuation are dropped, only the runs of Chinese characters remain
prepocess_doc('"今天"天氣真的不錯，我也相當的帥氣')  

['今天', '天氣真的不錯', '我也相當的帥氣']

In [4]:
# Collect every run of Chinese characters found in the corpus, line by line.
segments_jieba = []
# The encoding is given explicitly so the read does not depend on the
# platform's default locale (which is not UTF-8 everywhere).
with open('./wiki_zh_small.txt', encoding='utf-8') as fr:
    # Iterating the file object streams it instead of loading all lines first.
    for line in fr:
        segments_jieba.extend(prepocess_doc(line))

In [5]:
# Spot-check `jieba.cut_for_search` on the second extracted run
list(jieba.cut_for_search(segments_jieba[1]))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.553 seconds.
Prefix dict has been built successfully.


['英語', '英語']

In [6]:
# Tokenize every Chinese run with jieba and flatten the results into one list.
cut_segments = [
    token
    for chinese_run in segments_jieba
    for token in jieba.cut_for_search(chinese_run)
]

In [7]:
# Peek at the first five tokens
cut_segments[:5]

['英語', '英語', '英語', '又', '稱']

In [8]:
# 此class 主要用於紀錄只有1個字的counter, 只有2個字的counter等...
class etWordCounters:
    """Character n-gram frequency tables for every order from 1 up to ``n``.

    ``self[k]`` (i.e. ``self.counters[k]``) is a ``Counter`` mapping each
    length-``k`` substring to the number of times it occurs in the segments
    passed to :meth:`generate_gram`. Index 0 is a pseudo "0-gram" table that
    stores the total number of unigram occurrences under the single key
    ``'eating'``.
    """

    def __init__(self, n):
        """Create empty tables for orders 0..n."""
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]

    def generate_gram(self, segments):
        """Count all substrings of length 1..n in every segment.

        Args:
            segments: Iterable of strings. It is consumed in a single pass,
                so one-shot iterators and generators work as well as lists.

        Counts are added to the existing tables, so calling this method
        repeatedly accumulates. The order-0 table is recomputed at the end.
        """
        for segment in segments:
            # n=1 -> unigram, n=2 -> bigram, ...
            for size in range(1, self.n + 1):
                # update() adds in place; no temporary Counter is built per
                # segment and no full-table scan happens after each merge.
                self.counters[size].update(self._skip(segment, size))
        # Total number of characters seen, kept under the sentinel key.
        base_count = sum(self.counters[1].values())
        self.counters[0] = Counter({'eating': base_count})

    def __getitem__(self, k):
        """Return the Counter for order ``k``."""
        return self.counters[k]

    def _skip(self, segment, n):
        """Yield every contiguous length-``n`` slice of ``segment``, left to right.

        Yields nothing when the segment is shorter than ``n``.
        """
        for start in range(len(segment) - n + 1):
            yield segment[start:start + n]

In [9]:
# 建造一個n-gram的class 計算下個詞的機率應為多少
class Ngram:
    """Maximum-likelihood n-gram model over characters.

    The probability of a character following a context is
    ``count(context + char) / count(context)``, where the numerator comes
    from the order-``n`` table and the denominator from the order-``n-1``
    table.
    """

    def __init__(self, n: int, counters: list):
        """
        n: n-gram's n
        counters: etWordCounters object (anything indexable by order works);
            ``counters[n]`` and ``counters[n-1]`` are read once and kept.
        """
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_next_word(self, prefix: str = '', top_k: int = 5):
        """Rank the characters that may follow ``prefix``.

        Only the last ``n - 1`` characters of ``prefix`` are used as context.
        For ``n <= 1`` the prefix is ignored and the unigram distribution is
        returned (the denominator is the total stored under ``'eating'``).

        Args:
            prefix: Text preceding the character to predict.
            top_k: Number of results to return; any value <= 0 returns all.

        Returns:
            A list of ``(probability, character)`` tuples sorted in
            descending order (ties broken by character, also descending).
            The list is empty when the context was never observed, which
            includes a prefix shorter than ``n - 1`` characters.
        """
        if self.n <= 1:
            # No context available: normalise by the total character count.
            context_key = 'eating'
            match_prefix = ''
        else:
            # Keep only the trailing n-1 characters of the prefix.
            match_prefix = prefix[-(self.n - 1):]
            context_key = match_prefix

        context_count = self.minor_counter[context_key]
        if context_count == 0:
            # Unseen context: nothing can be estimated, and dividing by the
            # zero count would raise ZeroDivisionError.
            return []

        probs = [
            (count / context_count, gram[-1])
            for gram, count in self.major_counter.items()
            if gram.startswith(match_prefix)
        ]
        probs.sort(reverse=True)

        return probs[:top_k] if top_k > 0 else probs

    def get_word_dict(self, prefix=''):
        """
        explain:
        get the full predict_next_word result and transfer it to a
        {character: probability} dict
        """
        return {word: prob for prob, word in self.predict_next_word(prefix, top_k=-1)}

In [11]:
# Count character n-grams of length 1 through 5 over all jieba tokens
counters = etWordCounters(n=5)
counters.generate_gram(cut_segments)

In [12]:
# Total number of characters (sum of all unigram counts)
counters[0]

Counter({'eating': 392780})

In [14]:
# Counts of single characters (unigrams)
counters[1]

Counter({'英': 439,
         '語': 1416,
         '又': 210,
         '稱': 724,
         '爲': 3637,
         '文': 1762,
         '是': 3606,
         '一': 4189,
         '種': 891,
         '西': 1535,
         '日': 1109,
         '耳': 51,
         '曼': 101,
         '言': 629,
         '誕': 22,
         '生': 1211,
         '於': 2022,
         '中': 4221,
         '世': 914,
         '紀': 320,
         '早': 213,
         '期': 836,
         '的': 13274,
         '格': 323,
         '蘭': 213,
         '如': 810,
         '今': 331,
         '具': 324,
         '有': 3303,
         '全': 1024,
         '球': 290,
         '通': 708,
         '用': 1995,
         '地': 1835,
         '位': 792,
         '詞': 410,
         '源': 365,
         '遷': 89,
         '居': 291,
         '部': 1393,
         '落': 76,
         '盎': 16,
         '魯': 112,
         '而': 1066,
         '得': 629,
         '名': 889,
         '臨': 86,
         '波': 115,
         '羅': 304,
         '海': 954,
         '半': 244,
         '島': 341,


In [15]:
# Counts of 5-character sequences (5-grams)
counters[5]

Counter({'盎格魯撒克': 1,
         '格魯撒克遜': 1,
         '盎格魯族東': 1,
         '北日爾曼語': 2,
         '西日爾曼語': 1,
         '以古諾斯語': 1,
         '文化大革命': 12,
         '孫武的後代': 1,
         '之戰和馬陵': 1,
         '衛武公的後': 1,
         '武公的後代': 1,
         '斷足爾能行': 1,
         '第八十七回': 1,
         '第八十八回': 1,
         '楚雄彝族自': 1,
         '雄彝族自治': 1,
         '彝族自治州': 1,
         '內萬林納獎': 1,
         '反托拉斯法': 1,
         '新墨西哥州': 3,
         '貿易委員會': 1,
         '定義其產品': 1,
         '機業務部門': 1,
         '克里斯蒂娜': 3,
         '斯特拉斯堡': 2,
         '魏瑪共濟會': 1,
         '則有羅殿國': 1,
         '屬永寧州劃': 1,
         '喀斯特地貌': 1,
         '古銀杏之鄉': 1,
         '遵義醫學院': 1,
         '灣大學國立': 1,
         '名中華民國': 1,
         '中央研究院': 2,
         '依中華民國': 1,
         '教育工作者': 1,
         '從羅斯福路': 1,
         '先後由大學': 1,
         '轉交國家級': 1,
         '採用羅馬式': 1,
         '將中華民國': 1,
         '河北省人民': 3,
         '北省人民政': 3,
         '省人民政府': 7,
         '東南亞產生': 1,
         '山西人民出': 1,
         '西人民出版': 1,
         '人民

In [16]:
# One model per n-gram order, lowest order first. The upper bound is taken
# from the counters object so it cannot drift from the order it was built with.
ngrams = [Ngram(order, counters) for order in range(1, counters.n + 1)]
ngrams

[<__main__.Ngram at 0x7f4932edfc18>,
 <__main__.Ngram at 0x7f4932edf908>,
 <__main__.Ngram at 0x7f4932edfc88>,
 <__main__.Ngram at 0x7f4932edfd68>,
 <__main__.Ngram at 0x7f4932edfc50>]

In [17]:
# 用 Smoothing of Language Models 來避免下一個詞機率是0的情況
class etWordPredictor:
    """Next-character predictor that smooths n-gram models by interpolation.

    Higher-order estimates are blended with lower-order ones so that a
    character unseen after a long context still gets a non-zero probability
    from the shorter contexts.
    """

    def __init__(self, ngrams):
        """
        ngrams: sequence of models exposing ``get_word_dict(prefix)``.
            Presumably ordered by increasing order starting at the unigram
            model, as the notebook builds it -- confirm for other callers.
        """
        # set ngram model
        self.ngrams = ngrams

    def predict_proba(self, prefix='', top_k=5, interp_lambda=0.99):
        """Rank candidate next characters for ``prefix``.

        Args:
            prefix: Text preceding the character to predict.
            top_k: Number of results to return; any value <= 0 returns all.
            interp_lambda: Weight given to the higher-order model at each
                interpolation step (the remainder goes to the lower orders).

        Returns:
            A list of ``(probability, character)`` tuples sorted in
            descending order. Candidates are the characters known to the
            first model in ``self.ngrams``; a character missing there is
            never scored. Empty when there are no models.
        """
        # Use as many models as the prefix can feed: len(prefix) + 1 of them.
        # The instance's own models are used, not a module-level global.
        active_models = self.ngrams[:len(prefix) + 1]
        proba_dicts = [model.get_word_dict(prefix) for model in active_models]
        if not proba_dicts:
            return []

        # Interpolated probability, e.g. for three models:
        #   a*3gram + (1-a)*(a*2gram + (1-a)*1gram)
        scored = [
            (self.interpolation_proba(word, proba_dicts, interp_lambda), word)
            for word in proba_dicts[0]
        ]
        scored.sort(reverse=True)

        return scored[:top_k] if top_k > 0 else scored

    def interpolation_proba(self, word, proba_dicts, interp_lambda, idx=None):
        """Recursively blend the estimates of ``proba_dicts[0..idx]`` for ``word``.

        Args:
            word: Candidate character.
            proba_dicts: One ``{character: probability}`` dict per model.
            interp_lambda: Weight of ``proba_dicts[idx]``; ``1 - interp_lambda``
                goes to the blend of the dicts before it.
            idx: Highest dict to include; defaults to the last one.

        A character missing from a dict contributes 0 at that level.
        """
        if idx is None:
            idx = len(proba_dicts) - 1
        if idx == 0:
            return proba_dicts[0].get(word, 0.)
        lower_orders = self.interpolation_proba(word, proba_dicts, interp_lambda, idx=idx - 1)
        return interp_lambda * proba_dicts[idx].get(word, 0.) + (1 - interp_lambda) * lower_orders

In [18]:
# Interpolated predictor built on the list of n-gram models
etModel = etWordPredictor(ngrams)

In [21]:
# Top-10 next-character candidates after the prefix '潑水'
probs = etModel.predict_proba('潑水', top_k=10)
probs

[(0.001305674249880677, '平'),
 (0.0007798368350932133, '量'),
 (0.0001995195448169125, '準'),
 (0.00019947651818622865, '庫'),
 (0.0001814971527276127, '系'),
 (0.00018132428241853383, '稻'),
 (0.00016331996660603017, '利'),
 (0.0001453464568427144, '合'),
 (0.00014527135118560356, '產'),
 (0.00012701600426153622, '源')]