# 확률론적 언어 모형

# 바이그램의 예

In [1]:
import nltk

In [2]:
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.corpus import movie_reviews

sentences = []
for s in movie_reviews.sents():
    s.insert(0, "SS")
    s.append("SE")
    if len(s) > 4:
            sentences.append(s)

In [3]:
sentences[1]

['SS', 'they', 'get', 'into', 'an', 'accident', '.', 'SE']

In [4]:
from collections import Counter

def calculate_bigram(sentences):
    bigram = {}
    for s in sentences:
        context = "SS"
        for i, w in enumerate(s[1:]):
            if context not in bigram:
                bigram[context] = Counter() # count는 value가 숫자인
            if bigram[context][w] == 0:
                bigram[context][w] = 1
            bigram[context][w] += 1
            context = w
    for context in bigram.keys():
        total = sum(bigram[context].values())
        for w in bigram[context]:
            bigram[context][w] /= total
    return bigram

In [5]:
bigram = calculate_bigram(sentences)

In [6]:
bigram["SS"].most_common(10)

[('the', 0.11231263830320237),
 ('it', 0.043575076893101194),
 ('i', 0.03379121261464379),
 ('but', 0.02523207103391647),
 ('and', 0.024160438673402642),
 ('he', 0.023269731256871668),
 ('in', 0.023102723616272112),
 ('this', 0.022963550582439148),
 ('there', 0.0180507424881355),
 ('as', 0.013249272820898222)]

In [7]:
bigram["we"].most_common(10)

[("'", 0.12985751295336787),
 ('are', 0.07674870466321243),
 ('see', 0.059261658031088085),
 ('get', 0.052461139896373056),
 ('have', 0.05116580310880829),
 ('can', 0.0391839378238342),
 ('don', 0.03756476683937824),
 ('know', 0.03432642487046632),
 ('never', 0.01878238341968912),
 ('learn', 0.018458549222797927)]

In [8]:
bigram["i"]["was"]

0.053622421998942356

In [9]:
bigram["i"]["am"]

0.017556848228450557

In [10]:
bigram["i"]["is"]

0.00031729243786356425

In [11]:
bigram["i"]["are"]

0.00021152829190904283

In [12]:
bigram["."]["SE"]

0.9612387969875893

In [13]:
bigram["."]

Counter({'"': 0.02922949299760894,
         "'": 0.0010735373054213634,
         "''": 6.506286699523415e-05,
         ')': 0.00821418695814831,
         'SE': 0.9612387969875893,
         ']': 0.0001789228842368939})

In [16]:
def sentence_score(s):
    p = 0.0
    for i in range(len(s) - 1):
        c = s[i]
        w = s[i + 1]
        p += np.log(bigram[c][w] + np.finfo(float).eps) # log를 취해서 underflow 방지
    return np.exp(p)

In [17]:
test_sentence = ["i", "am", "a", "boy", "."]
sentence_score(test_sentence) # 상대적으로 높은 확률. 있을 수 있는 문장이다.

3.2880364380666861e-08

In [18]:
test_sentence = ["i", "is", "boy", "a" ,"."]
sentence_score(test_sentence)

1.9683389110380156e-38

In [19]:
def generate_sentence(seed=None):
    if seed is not None:
        np.random.seed(seed)
    c = "SS"
    sentence = []
    while True:
        if c not in bigram:
            break
        words, probs = zip(*[(k, v) for k, v in bigram[c].items()])
        idx = np.argmax(np.random.multinomial(1, probs, (1,)))
        w = words[idx]
        
        if w == "SE":
            break
        elif w in ["i", "ii", "iii"]:
            w2 = w.upper()
        elif w in ["mr", "luc", "i", "robin", "williams", "cindy", "crawford"]:
            w2 = w.title() # 맨 앞글자만 대문자로 표시
        else:
            w2 = w
        
        if c == "SS":
            sentence.append(w2.title())
        elif c in ["`", "\"", "'", "("]:
            sentence.append(w2)
        elif w in ["'", ".", ",", ")", ":", ";", "?"]:
            sentence.append(w2)
        else:
            sentence.append(" " + w2)
            
        c = w
    return "".join(sentence)

In [23]:
generate_sentence(12) # bigram 모델이기 때문에 앞에 한단어만 기억하고 문장을 만든다.
                      # 그렇기 때문에 문법적으로 틀린 문장이 만들어진다.
                      # 그래서 RNN이 나옴 (context가 무한대인 모델)



In [29]:
import codecs
with codecs.open("ratings_train.txt", encoding='utf-8') as f:
    data = [line.split('\t') for line in f.read().splitlines()]
    data = data[1:]   # header 제외
    
docs = [row[1] for row in data]

In [30]:
from konlpy.tag import Twitter
tagger = Twitter()

def tokenize(doc):
    return ["SS"] + ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)] + ["SE"]

In [31]:
%%time
sentences = [tokenize(d) for d in docs]

Wall time: 1min 45s


In [32]:
bigram = calculate_bigram(sentences)

In [34]:
def korean_most_common(c, n, pos=None):
    if pos is None:
        return bigram[tokenize(c)[0]].most_common(n)
    else:
        return bigram["/".join([c, pos])].most_common(n)

In [35]:
korean_most_common("나", 10)

[('이/Determiner', 0.01689056886482437),
 ('정말/Noun', 0.015732747611993666),
 ('이/Noun', 0.014932488804890093),
 ('진짜/Noun', 0.01232739098602101),
 ('영화/Noun', 0.012003882106553608),
 ('재밌다/Adjective', 0.010726873371813864),
 ('아/Exclamation', 0.010250123444177691),
 ('너무/Noun', 0.009841480649060973),
 ('평점/Noun', 0.009109328974476852),
 ('내/Noun', 0.00893906114317822)]

In [36]:
korean_most_common("의", 10)

[('이/Determiner', 0.01689056886482437),
 ('정말/Noun', 0.015732747611993666),
 ('이/Noun', 0.014932488804890093),
 ('진짜/Noun', 0.01232739098602101),
 ('영화/Noun', 0.012003882106553608),
 ('재밌다/Adjective', 0.010726873371813864),
 ('아/Exclamation', 0.010250123444177691),
 ('너무/Noun', 0.009841480649060973),
 ('평점/Noun', 0.009109328974476852),
 ('내/Noun', 0.00893906114317822)]

In [37]:
korean_most_common(".", 10, "Punctuation")

[('SE', 0.32894018276477227),
 ('영화/Noun', 0.009283867914224342),
 ('이/Noun', 0.007099428404995085),
 ('이/Determiner', 0.007063021079841264),
 ('./Punctuation', 0.00651691120253395),
 ('정말/Noun', 0.005788764699457531),
 ('이렇다/Adjective', 0.005788764699457531),
 ('그리고/Conjunction', 0.005097025521534933),
 ('하지만/Conjunction', 0.004842174245458186),
 ('ㅋㅋ/KoreanParticle', 0.004114027742381767)]

In [38]:
def korean_bigram_prob(c, w):
    context = tokenize(c)[1]
    word = tokenize(w)[1]
    return bigram[context][word]

In [39]:
korean_bigram_prob("이", "영화")

0.3496969696969697

In [40]:
korean_bigram_prob("영화", "이")

0.00016109977446031575

In [41]:
def korean_generate_sentence(seed=None, debug=False):
    if seed is not None:
        np.random.seed(seed)
    c = "SS"
    sentence = []
    while True:
        if c not in bigram:
            break
        words, probs = zip(*[(k, v) for k, v in bigram[c].items()])
        idx = np.argmax(np.random.multinomial(1, probs, (1,)))
        w = words[idx]
        
        if w == "SE":
            break            
        
        w2 = w.split("/")[0]
        pos = w.split("/")[1]
        
        if c == "SS":
            sentence.append(w2.title())
        elif c in ["`", "\"", "'", "("]:
            sentence.append(w2)
        elif w2 in ["'", ".", ",", ")", ":", ";", "?"]:
            sentence.append(w2)
        elif pos in ["Josa", "Punctuation", "Suffix"]:
            sentence.append(w2)
        elif w in ["임/Noun", "것/Noun", "는걸/Noun", "릴때/Noun",
                   "되다/Verb", "이다/Verb", "하다/Verb", "이다/Adjective"]:
            sentence.append(w2)
        else:
            sentence.append(" " + w2)
        c = w
        
        if debug:
            print(w)
            
    return "".join(sentence)

In [42]:
korean_generate_sentence(1)

'공항 가다 구석이 너무 재밌다'

In [43]:
korean_generate_sentence(17)

'비디오'

In [44]:
korean_generate_sentence(32)

'다 지네 작업실 문을 마물가 아니다'

In [45]:
korean_generate_sentence(35)

'샬라샬라'

In [46]:
korean_generate_sentence(46)

'이영화를 괜찮다 영화 진하다 여운이 없다 만들다?? ㅋㅋ'

# 단어 임베딩과 word2vec

In [51]:
sentences = [list(s) for s in movie_reviews.sents()]

In [52]:
from gensim.models.word2vec import Word2Vec

In [53]:
model = Word2Vec(sentences)

In [54]:
model.init_sims(replace=True)

In [55]:
model.similarity('actor', 'actress')

  """Entry point for launching an IPython kernel.


0.86710436795965506

In [56]:
model.similarity('he', 'she')

  """Entry point for launching an IPython kernel.


0.86536507863405343

In [57]:
model.similarity('actor', 'she')

  """Entry point for launching an IPython kernel.


0.27616074577323224

In [58]:
model.most_similar("villain")

  """Entry point for launching an IPython kernel.


[('genius', 0.8114882707595825),
 ('bard', 0.7670204639434814),
 ('impression', 0.7658730745315552),
 ('personality', 0.7658467888832092),
 ('droll', 0.7644909620285034),
 ('charming', 0.7610799074172974),
 ('actress', 0.7596417665481567),
 ('dude', 0.7587822079658508),
 ('doctor', 0.7576232552528381),
 ('actor', 0.755042552947998)]

In [60]:
model.most_similar(positive=['she', 'actor'], negative='actress', topn=1)

  """Entry point for launching an IPython kernel.


[('he', 0.333840012550354)]

In [61]:
import codecs

def read_data(filename):
    with codecs.open(filename, encoding='utf-8', mode='r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]   # header 제외
    return data

train_data = read_data('ratings_train.txt')

In [62]:
from konlpy.tag import Twitter
tagger = Twitter()

def tokenize(doc):
    return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]

train_docs = [row[1] for row in train_data]
sentences = [tokenize(d) for d in train_docs]

In [64]:
from gensim.models import word2vec
model = word2vec.Word2Vec(sentences)
model.init_sims(replace=True)

In [65]:
model.similarity(*tokenize(u'악당 영웅'))

  """Entry point for launching an IPython kernel.


0.77437696027911418

In [66]:
model.similarity(*tokenize(u'악당 감동'))

  """Entry point for launching an IPython kernel.


0.25722207445926837

In [67]:
from konlpy.utils import pprint
pprint(model.most_similar(positive=tokenize(u'여배우 남자'), negative=tokenize(u'배우'), topn=1))

[('여자/Noun', 0.809704065322876)]


  


# CBOW in Keras

In [70]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16545711018486672408
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2302666342
locality {
  bus_id: 1
}
incarnation: 1509918102054816412
physical_device_desc: "device: 0, name: GeForce GTX 970M, pci bus id: 0000:01:00.0, compute capability: 5.2"
]


# RNN 기본 구조와 Keras를 사용한 RNN 구현

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.optimizers import *
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from nltk.tokenize import sent_tokenize
from konlpy.corpus import kolaw
from konlpy.tag import Twitter

Using TensorFlow backend.


In [3]:
c = kolaw.open('constitution.txt').read()

In [4]:
senstents = [s for s in sent_tokenize(c)]

In [5]:
senstents[3]

'제2조 ① 대한민국의 국민이 되는 요건은 법률로 정한다.'

In [6]:
twitter = Twitter()
doc0 = [" ".join(["".join(w) for w, t in twitter.pos(s) if t not in ['Number', "Foreign"] and w not in ["제", "조"]]) for s in sent_tokenize(c)]

In [7]:
len(doc0)

357

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(doc0)
doc = [l for l in tokenizer.texts_to_sequences(doc0) if len(l) > 1]

In [9]:
len(doc)

354

In [10]:
maxlen = max([len(x) - 1 for x in doc])
vocab_size = len(tokenizer.word_index) + 1

In [11]:
maxlen, vocab_size

(188, 1205)

In [12]:
def generate_data(X, maxlen, V):
    for sentence in X: 
        inputs = []
        targets = []
        for i in range(1, len(sentence)):
            inputs.append(sentence[0:i])
            targets.append(sentence[i])
        y = np_utils.to_categorical(targets, V)
        inputs_sequence = sequence.pad_sequences(inputs, maxlen=maxlen)
        yield (inputs_sequence, y)

In [13]:
for i, (x, y) in enumerate(generate_data(doc, maxlen, vocab_size)):
    print("i", i)
    print("x", x.shape, "\n", x)
    print("y", y.shape, "\n", y)
    if i > 1:
        break

i 0
x (188, 188) 
 [[  0   0   0 ...,   0   0 101]
 [  0   0   0 ...,   0 101  24]
 [  0   0   0 ..., 101  24 607]
 ..., 
 [  0   0 101 ..., 155   2  18]
 [  0 101  24 ...,   2  18 176]
 [101  24 607 ...,  18 176   7]]
y (188, 1205) 
 [[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
i 1
x (5, 188) 
 [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0 

In [14]:
X = []
Y = []
for x, y in generate_data(doc, maxlen, vocab_size):
    X.append(x)
    Y.append(y)

X = np.concatenate(X)
Y = np.concatenate(Y)

In [15]:
X.shape, Y.shape

((6923, 188), (6923, 1205))

In [16]:
import keras.backend.tensorflow_backend as K

In [17]:
%%time
with K.tf.device('/gpu:0'):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=maxlen))
    model.add(LSTM(100, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=["accuracy"])
    hist = model.fit(X, Y, epochs=500, batch_size=800, verbose=2)

Epoch 1/500


InternalError: Blas GEMM launch failed : a.shape=(800, 100), b.shape=(100, 100), m=800, n=100, k=100
	 [[Node: lstm_1/while/MatMul_6 = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lstm_1/while/Switch_2:1, lstm_1/while/MatMul_6/Enter)]]
	 [[Node: metrics/acc/Mean/_97 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1860_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'lstm_1/while/MatMul_6', defined at:
  File "C:\Users\user\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\user\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\user\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\user\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\user\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\Users\user\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\user\Anaconda3\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\user\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\user\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\user\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\user\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\user\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\user\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\user\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\user\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\user\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\user\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\user\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\user\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2856, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\user\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-17-a7853f16a151>", line 1, in <module>
    get_ipython().run_cell_magic('time', '', 'with K.tf.device(\'/gpu:0\'):\n    model = Sequential()\n    model.add(Embedding(vocab_size, 100, input_length=maxlen))\n    model.add(LSTM(100, return_sequences=False))\n    model.add(Dropout(0.5))\n    model.add(Dense(vocab_size, activation=\'softmax\'))\n    model.compile(loss=\'categorical_crossentropy\', optimizer=RMSprop(), metrics=["accuracy"])\n    hist = model.fit(X, Y, epochs=500, batch_size=800, verbose=2)')
  File "C:\Users\user\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2131, in run_cell_magic
    result = fn(magic_arg_s, cell)
  File "<decorator-gen-63>", line 2, in time
  File "C:\Users\user\Anaconda3\lib\site-packages\IPython\core\magic.py", line 187, in <lambda>
    call = lambda f, *a, **k: f(*a, **k)
  File "C:\Users\user\Anaconda3\lib\site-packages\IPython\core\magics\execution.py", line 1238, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 4, in <module>
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\models.py", line 492, in add
    output_tensor = layer(self.outputs[0])
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\layers\recurrent.py", line 499, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\engine\topology.py", line 619, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\layers\recurrent.py", line 2151, in call
    initial_state=initial_state)
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\layers\recurrent.py", line 608, in call
    input_length=timesteps)
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 2767, in rnn
    swap_memory=True)
  File "C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 2934, in while_loop
    result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
  File "C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 2720, in BuildLoop
    pred, body, original_loop_vars, loop_vars, shape_invariants)
  File "C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 2662, in _BuildLoop
    body_result = body(*packed_vars_for_body)
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 2753, in _step
    tuple(constants))
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\layers\recurrent.py", line 599, in step
    return self.cell.call(inputs, states, **kwargs)
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\layers\recurrent.py", line 1945, in call
    self.recurrent_kernel_c))
  File "C:\Users\user\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 1075, in dot
    out = tf.matmul(x, y)
  File "C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 2022, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 2799, in _mat_mul
    name=name)
  File "C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InternalError (see above for traceback): Blas GEMM launch failed : a.shape=(800, 100), b.shape=(100, 100), m=800, n=100, k=100
	 [[Node: lstm_1/while/MatMul_6 = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lstm_1/while/Switch_2:1, lstm_1/while/MatMul_6/Enter)]]
	 [[Node: metrics/acc/Mean/_97 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1860_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [9]:
from keras.models import load_model
model = load_model("rnn_text_gen.hdf5")

In [10]:
word_list = '대한민국 의 국민 이 되는 요건 은 법률 로 정한 다 .'.split(" ")
word_list

['대한민국', '의', '국민', '이', '되는', '요건', '은', '법률', '로', '정한', '다', '.']

In [24]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

In [27]:
x = sequence.pad_sequences([[tokenizer.word_index[w] for w in word_list[:2]]], maxlen=maxlen)
x

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [28]:
p = model.predict(x)[0]
p

array([  5.13145473e-08,   1.11537590e-03,   3.62565304e-04, ...,
         6.63872834e-06,   1.32354771e-07,   1.36414428e-05], dtype=float32)

In [29]:
idx = np.flip(np.argsort(p), 0)
idx

array([ 438,  437,   19, ...,  605, 1170,  286], dtype=int64)

In [30]:
p[idx]

array([  1.76298365e-01,   1.67277753e-01,   5.77470176e-02, ...,
         1.06850362e-10,   7.99032229e-11,   7.51542439e-11], dtype=float32)

In [31]:
for i in idx[:5]:
    print(reverse_word_map[i])

영토
주권
국민
조직
종류


In [32]:
def generate_sentence(i):
    x = sequence.pad_sequences([[tokenizer.word_index[w] for w in word_list[:i]]], maxlen=maxlen)
    p = model.predict(x)[0]
    idx = np.flip(np.argsort(p), 0)
    for j in idx[:3]:
        print('"', " ".join(word_list[:i]), '"', reverse_word_map[j])

In [33]:
generate_sentence(1)

" 대한민국 " 의
" 대한민국 " 은
" 대한민국 " 헌법


In [34]:
generate_sentence(2)

" 대한민국 의 " 영토
" 대한민국 의 " 주권
" 대한민국 의 " 국민


# 신경망 언어 번역

# BLEU
 - 단어가 번역되어 나왔을 때 정답인 reference에 있는지 없는지를 측정

In [35]:
from nltk.translate.bleu_score import sentence_bleu

In [36]:
hypothesis = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', # y hat
             'ensures', 'that', 'the', 'military', 'always',
             'obeys', 'the', 'commands', 'of', 'the', 'party']
reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',  # reference는 사람이 낸 답
              'ensures', 'that', 'the', 'military', 'will', 'forever',
              'heed', 'Party', 'commands']
reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
              'guarantees', 'the', 'military', 'forces', 'always',
              'being', 'under', 'the', 'command', 'of', 'the',
              'Party']
reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
              'army', 'always', 'to', 'heed', 'the', 'directions',
              'of', 'the', 'party']

In [37]:
sentence_bleu([reference1, reference2, reference3], hypothesis)

0.5045666840058485

# 베이지안 네트워크

# 몬티 홀 문제

In [19]:
from pgmpy.factors.discrete import TabularCPD

In [20]:
cpd_c = TabularCPD('C', 3, [[0.33, 0.33, 0.33]])
print(cpd_c)

╒═════╤══════╕
│ C_0 │ 0.33 │
├─────┼──────┤
│ C_1 │ 0.33 │
├─────┼──────┤
│ C_2 │ 0.33 │
╘═════╧══════╛


In [21]:
cpd_p = TabularCPD('P', 3, [[0.33, 0.33, 0.33]])
print(cpd_p)

╒═════╤══════╕
│ P_0 │ 0.33 │
├─────┼──────┤
│ P_1 │ 0.33 │
├─────┼──────┤
│ P_2 │ 0.33 │
╘═════╧══════╛


In [22]:
cpd_h = TabularCPD('H', 3, [[0,   0, 0, 0, 0.5, 1, 0, 1, 0.5], 
                            [0.5, 0, 1, 0,   0, 0, 1, 0, 0.5], 
                            [0.5, 1, 0, 1, 0.5, 0, 0, 0, 0  ]],
                   evidence=['C', 'P'], evidence_card=[3, 3])
print(cpd_h)

╒═════╤═════╤═════╤═════╤═════╤═════╤═════╤═════╤═════╤═════╕
│ C   │ C_0 │ C_0 │ C_0 │ C_1 │ C_1 │ C_1 │ C_2 │ C_2 │ C_2 │
├─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┤
│ P   │ P_0 │ P_1 │ P_2 │ P_0 │ P_1 │ P_2 │ P_0 │ P_1 │ P_2 │
├─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┤
│ H_0 │ 0.0 │ 0.0 │ 0.0 │ 0.0 │ 0.5 │ 1.0 │ 0.0 │ 1.0 │ 0.5 │
├─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┤
│ H_1 │ 0.5 │ 0.0 │ 1.0 │ 0.0 │ 0.0 │ 0.0 │ 1.0 │ 0.0 │ 0.5 │
├─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┼─────┤
│ H_2 │ 0.5 │ 1.0 │ 0.0 │ 1.0 │ 0.5 │ 0.0 │ 0.0 │ 0.0 │ 0.0 │
╘═════╧═════╧═════╧═════╧═════╧═════╧═════╧═════╧═════╧═════╛


In [23]:
from pgmpy.models import BayesianModel

model = BayesianModel([('C', 'H'), ('P', 'H')])
model.add_cpds(cpd_c, cpd_p, cpd_h)
model.check_model()

True

In [24]:
import networkx as nx

nx.draw_networkx(model.to_directed())

<IPython.core.display.Javascript object>

In [25]:
model.is_active_trail('C', 'P')

False

In [26]:
model.get_independencies()

(C _|_ P)
(P _|_ C)

In [27]:
model.is_active_trail('C', 'P', observed='H')

True

In [28]:
from pgmpy.inference import VariableElimination

infer = VariableElimination(model)

In [29]:
from pgmpy.factors import factor_product
j = infer.factors['C'][0]
print(j)

╒═════╤══════════╕
│ C   │   phi(C) │
╞═════╪══════════╡
│ C_0 │   0.3300 │
├─────┼──────────┤
│ C_1 │   0.3300 │
├─────┼──────────┤
│ C_2 │   0.3300 │
╘═════╧══════════╛


In [30]:
from pgmpy.factors import factor_product

posteriors = infer.query(['C', 'H'], evidence={'P': 0})
print(posteriors['H'])

╒═════╤══════════╕
│ H   │   phi(H) │
╞═════╪══════════╡
│ H_0 │   0.0000 │
├─────┼──────────┤
│ H_1 │   0.5000 │
├─────┼──────────┤
│ H_2 │   0.5000 │
╘═════╧══════════╛


In [31]:
posterior_c = infer.query(['C'], evidence={'P': 0, 'H': 2})
print(posterior_c['C'])

╒═════╤══════════╕
│ C   │   phi(C) │
╞═════╪══════════╡
│ C_0 │   0.3333 │
├─────┼──────────┤
│ C_1 │   0.6667 │
├─────┼──────────┤
│ C_2 │   0.0000 │
╘═════╧══════════╛


In [32]:
posterior_c = infer.query(['C'], evidence={'P': 1, 'H': 2})
print(posterior_c['C'])

╒═════╤══════════╕
│ C   │   phi(C) │
╞═════╪══════════╡
│ C_0 │   0.6667 │
├─────┼──────────┤
│ C_1 │   0.3333 │
├─────┼──────────┤
│ C_2 │   0.0000 │
╘═════╧══════════╛
