In [1]:
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/duyeoungryu/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/duyeoungryu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.corpus import movie_reviews

sentences = []
for s in movie_reviews.sents():
    s.insert(0, "SS")
    s.append("SE")
    if len(s) > 4:
        sentences.append(s)

In [3]:
sentences[1]

['SS', 'they', 'get', 'into', 'an', 'accident', '.', 'SE']

In [4]:
from collections import Counter

def calculate_bigram(sentences):
    bigram = {} # 저장
    for s in sentences:
        context = "SS"
        for i, w in enumerate(s[1:]):
            if context not in bigram:
                bigram[context] = Counter()
            if bigram[context][w] == 0: # 해당 컨텍스트에 단어가 없으면 
                bigram[context][w] = 1 # 1추가
            bigram[context][w] += 1 # 나온 횟수 추가
            context = w
            
    # 확률 형태로 바꾸는 작업
    for context in bigram.keys():
        total = sum(bigram[context].values())
        for w in bigram[context]:
            bigram[context][w] /= total
    return bigram
bigram = calculate_bigram(sentences)

In [6]:
bigram['SS'].most_common(10) # 첫단어로 많이 나오는 확률

[('the', 0.11231263830320237),
 ('it', 0.043575076893101194),
 ('i', 0.03379121261464379),
 ('but', 0.02523207103391647),
 ('and', 0.024160438673402642),
 ('he', 0.023269731256871668),
 ('in', 0.023102723616272112),
 ('this', 0.022963550582439148),
 ('there', 0.0180507424881355),
 ('as', 0.013249272820898222)]

In [8]:
bigram['i']['am']

0.017556848228450557

In [13]:
bigram['.'],
# 문장이 끝날 확률
# 인용문
# 괄호안에 있는 설명문


Counter({'SE': 0.9612387969875893,
         "'": 0.0010735373054213634,
         '"': 0.02922949299760894,
         ')': 0.00821418695814831,
         "''": 6.506286699523415e-05,
         ']': 0.0001789228842368939})

In [14]:
def sentence_score(s):
    p = 0.0
    for i in range(len(s) - 1):
        c = s[i]
        w = s[i + 1]
        p += np.log(bigram[c][w] + np.finfo(float).eps) 
        # 너무 작은 값을 계속곱하면 결과가 보기 힘듬 그러므로 log를 취한다음 더하기로 연산
        # eps 는 표현가능한 가장 작은값
    return np.exp(p)


In [15]:
# 이 코퍼스 안에서는 굉장히 큰 확률
# 사용예는 문법이 맞나 안맞나를 판단하는데 사용될수 있다.
test_sentence = ["i", "am", "a", "boy", "."]
sentence_score(test_sentence)

3.288036438066686e-08

In [18]:
def generate_sentence(seed=None):
    if seed is not None:
        np.random.seed(seed)
    c = "SS"
    sentence = []
    while True:
        if c not in bigram:
            break
        words, probs = zip(*[(k, v) for k, v in bigram[c].items()])
        idx = np.argmax(np.random.multinomial(1, probs, (1,)))
        w = words[idx]
        
        if w == "SE":
            break
        elif w in ["i", "ii", "iii"]:
            w2 = w.upper()
        elif w in ["mr", "luc", "i", "robin", "williams", "cindy", "crawford"]:
            w2 = w.title()
        else:
            w2 = w
        
        if c == "SS":
            sentence.append(w2.title())
        elif c in ["`", "\"", "'", "("]:
            sentence.append(w2)
        elif w in ["'", ".", ",", ")", ":", ";", "?"]:
            sentence.append(w2)
        else:
            sentence.append(" " + w2)
            
        c = w
    return "".join(sentence) 

In [None]:
# se가 나올때까지 계속 카테고리분포 모형을 만들어 단어 확률로 다음단어를 구한다.

In [19]:
generate_sentence(82)
# 문법이 안맞는 경우는 bigramd이기에 문법에대한 확인은 어렵다

'Alexandre dumas may suspect he at being can be honest here goes awol, but he trusts affleck - see this documentary.'

In [23]:
import codecs
with codecs.open("ratings_train.txt", encoding='utf-8') as f:
    data = [line.split('\t') for line in f.read().splitlines()]
    data = data[1:]   # header 제외
    
docs = [row[1] for row in data]

In [24]:
from konlpy.tag import Twitter
tagger = Twitter()

def tokenize(doc):
    return ["SS"] + ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)] + ["SE"]


In [25]:
%%time
sentences = [tokenize(d) for d in docs]

CPU times: user 3min 10s, sys: 1.65 s, total: 3min 12s
Wall time: 3min 1s


In [26]:
bigram = calculate_bigram(sentences)

In [27]:
def korean_most_common(c, n, pos=None):
    if pos is None:
        return bigram[tokenize(c)[0]].most_common(n)
    else:
        return bigram["/".join([c, pos])].most_common(n)
korean_most_common("나", 10)

[('이/Determiner', 0.01753546464438441),
 ('정말/Noun', 0.0164754624638085),
 ('이/Noun', 0.015282202866245896),
 ('진짜/Noun', 0.01355591360073655),
 ('영화/Noun', 0.01299259815620192),
 ('재밌다/Adjective', 0.011248137424739846),
 ('아/Exclamation', 0.011193623026881655),
 ('너무/Noun', 0.010363792748373653),
 ('평점/Noun', 0.00959453402304142),
 ('내/Noun', 0.009328019189068046)]

In [28]:
korean_most_common("의", 10)

[('이/Determiner', 0.01753546464438441),
 ('정말/Noun', 0.0164754624638085),
 ('이/Noun', 0.015282202866245896),
 ('진짜/Noun', 0.01355591360073655),
 ('영화/Noun', 0.01299259815620192),
 ('재밌다/Adjective', 0.011248137424739846),
 ('아/Exclamation', 0.011193623026881655),
 ('너무/Noun', 0.010363792748373653),
 ('평점/Noun', 0.00959453402304142),
 ('내/Noun', 0.009328019189068046)]

In [29]:
korean_most_common(".", 10, "Punctuation")

[('SE', 0.34900406798404404),
 ('영화/Noun', 0.009070682868389525),
 ('이/Noun', 0.007806843165391856),
 ('이/Determiner', 0.006753643412893798),
 ('정말/Noun', 0.006332363511894575),
 ('그리고/Conjunction', 0.006016403586145158),
 ('./Punctuation', 0.005937413604707803),
 ('이렇다/Adjective', 0.005687278663489514),
 ('하지만/Conjunction', 0.004871048855303519),
 ('보다/Verb', 0.004489263945022973)]

In [30]:
def korean_bigram_prob(c, w):
    context = tokenize(c)[1]
    word = tokenize(w)[1]
    return bigram[context][word]
korean_bigram_prob("이", "영화")

0.34717143471714346

In [31]:
def korean_generate_sentence(seed=None, debug=False):
    if seed is not None:
        np.random.seed(seed)
    c = "SS"
    sentence = []
    while True:
        if c not in bigram:
            break
        words, probs = zip(*[(k, v) for k, v in bigram[c].items()])
        idx = np.argmax(np.random.multinomial(1, probs, (1,)))
        w = words[idx]
        
        if w == "SE":
            break            
        
        w2 = w.split("/")[0]
        pos = w.split("/")[1]
        
        if c == "SS":
            sentence.append(w2.title())
        elif c in ["`", "\"", "'", "("]:
            sentence.append(w2)
        elif w2 in ["'", ".", ",", ")", ":", ";", "?"]:
            sentence.append(w2)
        elif pos in ["Josa", "Punctuation", "Suffix"]:
            sentence.append(w2)
        elif w in ["임/Noun", "것/Noun", "는걸/Noun", "릴때/Noun",
                   "되다/Verb", "이다/Verb", "하다/Verb", "이다/Adjective"]:
            sentence.append(w2)
        else:
            sentence.append(" " + w2)
        c = w
        
        if debug:
            print(w)
            
    return "".join(sentence)

In [40]:
korean_generate_sentence(8)

"여성만의 재미 짐 자, 처음부터 취약한 사랑마저 어색하다 짝이 충분하다 영화 보다.' 이 황금 같다 ㅋㅋ 진짜 최악이고 독선적으로 먼저 떠오르다 영화도 뜬금 없다. 계속 계속 졸다.. 가장 감동 감동이라고 생각뿐인 모습, 無 를 곁들의 단편영화 2 부부턴 재밌다"

In [None]:
# 한글은 변환작업이 필요하다.ㅁㅁ