In [1]:
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/frhyme/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /Users/frhyme/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [135]:
import numpy as np 
from nltk.corpus import movie_reviews
from collections import Counter
"""
- sentence의 앞에는 SS(setence start), 뒤에는 SE(sentence end)를 넣어준다. 
- 단어의 수가 4개 이상인 경우에 대해서만 고려
"""
sentences = map(lambda w_lst: ['SS']+w_lst+['SE'], movie_reviews.sents())
sentences = filter(lambda w_lst: True if len(w_lst) >= 5 else False, sentences)
sentences = list(sentences)

def calculate_bigram(sentence_lst):
    """
    - bigram을 만들고, 단어가 나왔을 때, 그 다음에 어떤 단어가 나올 condition prob을 만든다
    - Counter말고 dictionary로 만들어도 되는데, most_common이 있기 때문에 Counter을 사용함
    - 만약 tri-gram을 만들 경우에는 nested Counter가 3개가 나오게 된다. k1, k2, k3
    """
    bigram_dict = Counter()
    for w_lst in sentence_lst:
        bigram_in_s = [(w_lst[i], w_lst[i+1]) for i in range(0, len(w_lst)-1)]
        for bigram in bigram_in_s:
            w1, w2 = bigram
            if w1 not in bigram_dict.keys():
                bigram_dict[w1] = Counter({w2:1})
            else:
                if w2 not in bigram_dict[w1].keys():
                    bigram_dict[w1][w2] = 1
                else:
                    bigram_dict[w1][w2] += 1
    for context in bigram_dict.keys():
        sum_v_in_context = sum(bigram_dict[context].values())
        for k in bigram_dict[context].keys():
            bigram_dict[context][k] /= sum_v_in_context
    return bigram_dict
def sentence_score(w_lst):
    """
    - 그냥 p를 모두 곱해진 것과 값은 같음
    - np.log로 변환하고, 더해준다음, np.exp로 하는게 더 좋음 
    """
    bigram_prob = np.array([bigram[w_lst[i]][w_lst[i+1]] for i in range(len(w_lst) - 1)])
    bigram_prob = np.log(bigram_prob).sum()
    return np.exp(bigram_prob)
def generate_sentence(seed=None):
    if seed is not None:
        np.random.seed(seed)
    context = "SS"
    sentence = []
    while context in bigram.keys():
        """
        - random하게 적합한 w를 확률에 따라서 생성한다. 
        - np.random.choice(words, probs)와 방식이 같은데, np.random.choice의 경우 크기가 32가 넘어가면 안됨
        """
        words, probs = list(bigram[context].keys()), list(bigram[context].values())
        idx = np.argmax( np.random.multinomial(n=1, pvals=probs) )
        w = words[idx]
        if w == "SE": # 선정한 word가 문장의 끝이라는 말이므로, 종결함 
            break
        else:
            # 결과로 나온 문장을 보면 특수문자도 많고, 문제가 많은데, 이러한 부분은 여기서 if를 사용해서 조절해주는 것이 필요함. 
            # 나는 귀찮아서 하지 않았습니다. 하하하하핫
            sentence.append(' '+w)
        context = w # context update 
    return "".join(sentence+['.']).strip().capitalize()

### main function 
bigram = calculate_bigram(sentences) # bigram의 확률을 학습하여 정리
print("--------------------")
for i in range(0, 5):
    print("sentence {}:".format(i).upper())
    print(generate_sentence())
    print("--------------------")

--------------------
SENT 0:
It just as in his home , cole sear ( occupying gabriel byrnes ( especially the first seem forced , and he ' randy newman ( sid james cameron would drive - blair witch ..
--------------------
SENT 1:
The legendary cinematographer stephen king ..
--------------------
SENT 2:
The people in aladdin -- john carpenter ' s dreamcatcher ..
--------------------
SENT 3:
There is the joker , only has something far more importantly , who maintains its water " and william " does ..
--------------------
SENT 4:
The forest , and writes a priest best way to put a little differently , drab picture is ready to sign the right combination ..
--------------------
