In [9]:
import random
import numpy as np
import pandas as pd
import re
import jieba
from collections import Counter

In [20]:
grammer_rules = """
host = 主演 电影 评论
主演 = 吴京 adj | 王宝强 adj | 周润发 adj
adj = 的 | 这部 | 这个 | 这辆
电影 = 杀破狼 | 战狼 | 唐人街探案 | 赌神 |
评论 = 太好了 | 垃圾 | 恶心 | 好吃 | 车 adj | 稳住 | 不错
"""

In [21]:
def create_grammer(gram, line_split='\n', word_split='='):
    grammer = {}
    for line in gram.split(line_split):
        if not line: continue
        key, value = line.split(word_split)
        grammer[''.join(key.split())] = [val.split() for val in value.split('|')]
    return grammer

In [22]:
def generate(sentence, target):
    grammer = create_grammer(sentence)
    if target not in grammer: 
        return target
    sen = [generate(sentence, t) for t in random.choice(grammer[target])]
    return ''.join(sen)

In [23]:
def generate_n(sentence, target, n):
    for i in range(n):
        yield generate(sentence, target)

In [24]:
def deal_txt(string):
    return re.findall('\w+', string)

In [25]:
def my_cut(string):
    return list(jieba.cut(string))

In [26]:
def word_gram(path='word_gram.txt'):
    with open(path, 'r') as f:
        word_gram = [line.strip('\n') for line in f.readlines() if line]
    return Counter(word_gram)
word_gram = word_gram()

In [27]:
def word_2_gram(path='word_2_gram.txt'):
    with open(path, 'r') as f:
        word_2_gram = [line.strip('\n') for line in f.readlines() if line]
    return Counter(word_2_gram)
word_2_gram = word_2_gram()

In [34]:
# 2-gram p(w1|w2)P(w2|w3)
def get_sentence_prod(sentence):
    sent = my_cut(sentence)
    prod = 1
    for i, value in enumerate(sent[:-2]):
        p = (word_2_gram[sent[i] + sent[i+1]]) / (word_gram[sent[i+1]] + 1)
        prod *= p if p else 1/len(word_gram) # 如果 p概率为0 用 1/len(word_gram)代替 是否合适？ 
    return (sentence, prod)

In [35]:
def generate_best(n=5, comments=None, grammer=None, host=None):
    """
    n: 生成句子数
    comments: 评论列表
    grammer：自定义语法
    """
    result = []
    if comments:
        for comm in comments:
            result = [get_sentence_prod(comm) for comm in comments if comm]
    else:
        for i in generate_n(grammer, host, n):
            result.append(get_sentence_prod(i))
    return sorted(result, key=lambda x: x[1], reverse=True)

In [39]:
best_comment = generate_best(grammer=grammer_rules, host='host')

In [40]:
best_comment

[('吴京的不错', 0.0001523168922479841),
 ('王宝强的车这部', 1.5083808746887745e-05),
 ('王宝强这部太好了', 6.083021071584992e-06),
 ('吴京这个唐人街探案稳住', 6.803259474632783e-10),
 ('吴京这辆杀破狼稳住', 1.5495454504751688e-12)]