In [None]:
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary

# End-of-sentence marker appended to every sentence
EOS = '\n'

# Path of the corpus file handed to LineSentence
# (presumably one whitespace-tokenized sentence per line — confirm against the data)
file = 'data/data.txt'

sentences = LineSentence(file)

# Each document is the token list of one sentence followed by the EOS marker
docs = [tokens + [EOS] for tokens in sentences]

# Vocabulary built over all documents (EOS included)
dic = Dictionary(docs)

In [None]:
import json

# Save the contents of the Dictionary as JSON: a list of {"index": ..., "word": ...} records
with open('dict.json', mode = 'w', encoding = 'utf8') as f:
    ds = [dict(index = k, word = v) for k, v in dic.iteritems()]
    # ensure_ascii = False keeps non-ASCII words readable in the file instead of \u escapes
    json.dump(ds, f, ensure_ascii = False)

In [None]:
from keras.layers import Input, Dense, Embedding, Reshape, Flatten, Dropout, Conv2D, concatenate
from keras.layers.pooling import MaxPooling2D
from keras.models import Model

# Number of context word ids fed to the model per sample
window_size = 2
# Length of each word embedding vector
embed_size = 64

# NOTE(review): this name shadows the builtin input() for the rest of the notebook
input = Input(shape = (window_size,))

# One embedding row per dictionary entry; the reshape adds a trailing axis of size 1
# so the embedded window can be passed to Conv2D
x = Embedding(len(dic), output_dim = embed_size)(input)
x = Reshape((window_size, embed_size, 1))(x)

# Two parallel convolution branches with 512 filters each; every kernel spans the full
# embedding width and covers 1 or 2 consecutive words respectively
conv1 = Conv2D(512, kernel_size = (1, embed_size), activation = 'relu')(x)
conv2 = Conv2D(512, kernel_size = (2, embed_size), activation = 'relu')(x)

# Pool height is window_size - kernel_height + 1; presumably this equals the conv output
# height under Keras' default padding, so each filter is reduced to one value — confirm
pool1 = MaxPooling2D(pool_size = (window_size - 1 + 1, 1))(conv1)
pool2 = MaxPooling2D(pool_size = (window_size - 2 + 1, 1))(conv2)

# Merge both branches into one flat feature vector, with dropout applied before the classifier
x = concatenate([pool1, pool2], axis = 1)
x = Flatten()(x)
x = Dropout(0.5)(x)

# Softmax over the whole vocabulary: one output unit per dictionary entry
output = Dense(len(dic), activation = 'softmax')(x)

model = Model(input, output)

model.summary()

print(model.input_shape)

# categorical_crossentropy is paired with the one-hot `labels` array built in a later cell
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])

In [None]:
def split(ws, size):
    """Yield every run of `size` consecutive items of `ws` as a tuple, left to right.

    `ws` must support len() and slicing. Nothing is yielded when `ws` holds
    fewer than `size` items.
    """
    start = 0
    last_start = len(ws) - size

    while start <= last_start:
        stop = start + size
        yield tuple(ws[start:stop])
        start += 1

# Every (window_size + 1)-gram of every document, flattened into one list:
# the first window_size words are the model input, the last word is the target
dw = [ngram for doc in docs for ngram in split(doc, window_size + 1)]

In [None]:
import numpy as np
from keras.utils.np_utils import to_categorical

def one_hot(cs):
    """Return the one-hot vector of length len(dic) for the single word `cs`."""
    return to_categorical(dic.doc2idx([cs]), len(dic))[0]

# Inputs: for each n-gram, the dictionary indices of every word except the last
data = np.array([dic.doc2idx(i[0:-1]) for i in dw])

# Targets: one-hot encoding of each n-gram's last word. All target indices are encoded
# with a single to_categorical call instead of one call (and one temporary array) per n-gram.
labels = to_categorical(dic.doc2idx([i[-1] for i in dw]), len(dic))

In [None]:
# Sanity check: show the first input index vector and its one-hot target
print(data[0], labels[0], sep = '\n')

In [None]:
# Training schedule
batch_size = 50
epochs = 100

# Fit the model on the index windows and their one-hot targets;
# whatever fit() returns is kept in `hist`
hist = model.fit(x = data, y = labels, batch_size = batch_size, epochs = epochs)

In [None]:
# Write the model to 'cnn-sample.hdf5' (relative to the current working directory)
model.save('cnn-sample.hdf5')

In [None]:
import random

def predict_next_word(ws):
    """Sample the word that follows the context words `ws`.

    `ws` is converted to dictionary indices and passed to the model as a batch
    of one. The first row of the prediction is used as the probability of each
    dictionary index; one index is drawn at random with those probabilities and
    the corresponding dictionary entry is returned.
    """
    context_ids = dic.doc2idx(ws)
    batch = np.array([context_ids])
    probabilities = model.predict(batch)[0]

    # Draw from the distribution rather than taking the arg-max, so repeated calls can differ
    sampled_id = np.random.choice(len(probabilities), p = probabilities)
    return dic[sampled_id]

def generate(fst_word, maxlen = 50):
    """Generate a word sequence that starts with `fst_word`.

    An n-gram from `dw` whose first element equals `fst_word` is chosen at random
    and all but its last element seed the result. Up to `maxlen` further words are
    then appended, each one sampled by `predict_next_word` from the last
    `window_size` words of the result; generation stops early when `EOS` is sampled.

    Returns the list of words (seed included, EOS excluded).
    Raises IndexError if no n-gram in `dw` starts with `fst_word`.
    """
    candidates = [d for d in dw if d[0] == fst_word]

    if not candidates:
        # random.choice on an empty list also raises IndexError, but with a message
        # that gives no hint that the start word is simply absent from the n-grams
        raise IndexError('no n-gram in dw starts with {!r}'.format(fst_word))

    res = list(random.choice(candidates)[0:-1])

    for _ in range(maxlen):
        nw = predict_next_word(res[-window_size:])

        if nw == EOS:
            break

        res.append(nw)

    return res

In [None]:
# Print five independently generated sequences, each followed by a separator line
for _ in range(5):
    print(''.join(generate('その')), '----', sep = '\n')