In [43]:
import os
import random
import time
from collections import defaultdict

In [44]:
def list_dir_re(dirpath):
    """Recursively collect the paths of all non-directory entries under ``dirpath``.

    Entries are visited in the order ``os.listdir`` reports them; the contents
    of a sub-directory are spliced in at the position where that
    sub-directory was encountered.

    Args:
        dirpath: Directory to scan.

    Returns:
        A list of paths, each formed by joining ``dirpath`` (and any
        intermediate sub-directories) with the entry name.
    """
    collected = []
    for entry in os.listdir(dirpath):
        full_path = os.path.join(dirpath, entry)
        if not os.path.isdir(full_path):
            collected.append(full_path)
            continue
        # Descend into the sub-directory and splice its files in place.
        collected += list_dir_re(full_path)

    return collected

In [45]:
class BigramModel:
    """Character-level bigram language model with additive smoothing.

    Every training sentence is wrapped in a start marker ``'B'`` and an end
    marker ``'E'``; the model counts how often each token is followed by each
    other token and samples successors in proportion to those counts plus a
    constant pseudo-count ``m``.

    NOTE(review): the markers are the plain strings 'B' and 'E', so a literal
    'B' or 'E' character in the corpus or vocabulary is indistinguishable from
    a marker, and 'B' itself remains a possible sampled successor. Confirm
    whether that is acceptable for the intended corpus.
    """

    def __init__(self, vocabulary, m=1):
        """Create an untrained model.

        Args:
            vocabulary: Iterable of tokens the model is allowed to emit. The
                start/end markers 'B' and 'E' are appended automatically.
            m: Smoothing pseudo-count added to every successor count when
                sampling. Must be non-negative.

        Raises:
            ValueError: If ``m`` is negative (sampling weights would be invalid).
        """
        if m < 0:
            raise ValueError('m must be non-negative')
        # list(...) lets callers pass any iterable (tuple, set, generator).
        self.vocabulary = list(vocabulary) + ['B', 'E']
        self.m = m
        # bigram_counts[prev][nxt] -> number of times nxt directly followed prev.
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        # total_counts[prev] -> number of bigrams whose first token is prev.
        self.total_counts = defaultdict(int)

    def _count_sentence(self, sentence):
        """Add the bigrams of one sentence, wrapped in 'B' ... 'E', to the counts."""
        tokens = ['B'] + list(sentence) + ['E']
        for current, following in zip(tokens, tokens[1:]):
            self.bigram_counts[current][following] += 1
            self.total_counts[current] += 1

    def build_LM(self, data_dir):
        """Accumulate bigram counts from every file found under ``data_dir``.

        Files are read as UTF-8, one sentence per line, and each character of
        a line is treated as one token. Empty lines are skipped so that blank
        lines and a file's trailing newline do not register a spurious
        'B' -> 'E' bigram.

        Args:
            data_dir: Directory that is searched recursively for corpus files.
        """
        files = list_dir_re(data_dir)
        print(files)
        for fname in files:
            with open(fname, 'r', encoding='utf-8') as f:
                # Stream line by line instead of loading the whole file.
                for line in f:
                    sentence = line.rstrip('\n')
                    if not sentence:
                        continue
                    self._count_sentence(sentence)

    def sample_word(self, word):
        """Sample a successor for ``word``.

        Args:
            word: The token that precedes the one being sampled.

        Returns:
            A token from ``self.vocabulary``. If ``word`` never appeared as the
            first element of a training bigram, or every candidate weight is
            zero (possible only when ``m == 0``), the choice is uniform.
            Otherwise each candidate is weighted by its bigram count plus ``m``.
        """
        if word not in self.total_counts:
            return random.choice(self.vocabulary)

        # Use .get() throughout: indexing the defaultdicts would insert
        # zero-count entries and silently grow the model on every call.
        successors = self.bigram_counts.get(word, {})
        weights = [successors.get(candidate, 0) + self.m
                   for candidate in self.vocabulary]
        if not any(weights):
            # random.choices raises ValueError when all weights are zero.
            return random.choice(self.vocabulary)
        return random.choices(self.vocabulary, weights=weights, k=1)[0]

    def generate_text(self, length, delay=0.2):
        """Print a generated sequence, starting from the 'B' marker.

        Tokens are written to stdout without separators or a trailing newline.
        Generation stops after ``length`` sampled tokens or as soon as the end
        marker 'E' has been printed, whichever comes first.

        Args:
            length: Maximum number of tokens to sample after the initial 'B'.
            delay: Seconds to pause before each sampled token (typewriter
                effect). Values <= 0 disable the pause.
        """
        current_word = 'B'
        print(current_word, end='', flush=True)
        for _ in range(length):
            if delay > 0:
                time.sleep(delay)
            current_word = self.sample_word(current_word)
            # flush so each token appears immediately even on buffered stdout.
            print(current_word, end='', flush=True)
            if current_word == 'E':
                break

In [46]:
if __name__ == '__main__':
    # Load the vocabulary: one entry per line of chars.txt, with surrounding
    # whitespace stripped from each line.
    with open('chars.txt', encoding='utf-8') as vocab_file:
        char_set = [entry.strip() for entry in vocab_file]

    # Build the bigram model and accumulate counts from the files under 'data'.
    blm = BigramModel(char_set)
    blm.build_LM(data_dir='data')

['data\\baikeContent.txt', 'data\\corpus_10000.txt', 'data\\pku_training_seg.utf8']


In [48]:
# Sample at most 10 tokens from the model built above; the sequence is
# printed to stdout (nothing is returned) and stops early if 'E' is drawn.
blm.generate_text(10)

B的疾娇臂墨管瞀指导班