In [2]:
#coding: utf-8
import numpy as np
import keras
from keras.optimizers import *
from keras.layers import *
from keras.callbacks import *
from keras.models import *
from numpy import *
import codecs

In [None]:
def create_model(embed_size=32, max_length=40, filter_sizes=(2, 3, 4, 5), filter_num=64, vocab_size=0xffff):
    """Build the (uncompiled) character-level CNN binary classifier.

    The network embeds each integer character code, runs one convolution per
    entry of ``filter_sizes`` over the embedded sequence, max-pools each
    convolution output over all positions, concatenates the pooled features
    and feeds them through a small dense head ending in a single sigmoid unit.

    Args:
        embed_size: Dimension of each character embedding vector.
        max_length: Number of character codes per input sample.
        filter_sizes: Window lengths (in characters) of the parallel
            convolutions.
        filter_num: Number of filters in each convolution.
        vocab_size: Size of the embedding table. Every input code must be
            smaller than this value.

    Returns:
        A Keras ``Model`` mapping a ``(max_length,)`` integer input to one
        sigmoid output. The caller is responsible for compiling it.
    """
    inp = Input(shape=(max_length,))
    emb = Embedding(vocab_size, embed_size,
                    embeddings_regularizer=regularizers.l1(0.01))(inp)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    emb_ex = Reshape((max_length, embed_size, 1))(emb)

    pooled_features = []
    for filter_size in filter_sizes:
        # The kernel spans the whole embedding width, so it only slides along
        # the character axis.
        conv = Conv2D(filter_num, (filter_size, embed_size), activation="relu")(emb_ex)
        # Pool over every position the kernel visited, leaving one value per filter.
        pool = MaxPooling2D(pool_size=(max_length - filter_size + 1, 1))(conv)
        pooled_features.append(pool)

    convs_merged = Concatenate()(pooled_features)
    reshape = Reshape((filter_num * len(filter_sizes),))(convs_merged)
    fc1 = Dense(32, activation="relu")(reshape)
    bn1 = BatchNormalization()(fc1)
    fc2 = Dense(1, activation='sigmoid')(bn1)
    # Keras 2 keyword names; the legacy ``input=``/``output=`` spelling is deprecated.
    model = Model(inputs=inp, outputs=fc2)
    return model

In [None]:
# Sanity check: build the network with its default hyperparameters and print the layer summary.
create_model().summary()

In [None]:
def load_data(filepath, targets, max_length=20, min_length=1):
    """Read labelled titles from a tab-separated file and encode them.

    Each line is expected to look like ``<label>\\t<title>``. Only lines whose
    label is exactly ``"0"`` or ``"1"`` are kept; lines with any other label,
    blank lines and lines without a tab are skipped.

    Every kept title is converted to a list of Unicode code points, truncated
    to ``max_length`` and right-padded with ``0`` up to ``max_length``. The
    line terminator is removed first so that it is not encoded as a character.

    Args:
        filepath: Path of the UTF-8 text file (undecodable bytes are ignored).
        targets: Unused; kept for backward compatibility.
        max_length: Fixed length of every encoded title.
        min_length: Unused; kept for backward compatibility.

    Returns:
        A list of ``(label, codes)`` tuples where ``label`` is ``0`` or ``1``
        and ``codes`` is a list of ``max_length`` integers.
    """
    titles = []
    with codecs.open(filepath, 'r', 'utf-8', 'ignore') as f:
        for line in f:
            label_id, sep, title = line.partition("\t")

            # No tab (blank or malformed line) or an unknown label: skip it.
            if not sep or label_id not in ("0", "1"):
                continue

            # Drop the line terminator so it does not become part of the title.
            title = title.rstrip("\r\n")

            # Truncate overly long titles, then zero-pad short ones.
            codes = [ord(ch) for ch in title[:max_length]]
            codes += [0] * (max_length - len(codes))

            titles.append((int(label_id), codes))

    return titles

In [None]:
def train(inputs, targets, batch_size=1000, epoch_count=1, max_length=20, model_filepath="model.h5", learning_rate=0.0005):
    """Build, train and save the classifier, returning the Keras history.

    The learning rate is decayed linearly per epoch from ``learning_rate``
    down to ``learning_rate * 0.001``. 20% of the data is held out through
    ``validation_split``.

    Args:
        inputs: Encoded samples passed to ``model.fit`` as ``x``.
        targets: Labels passed to ``model.fit`` as ``y``.
        batch_size: Mini-batch size.
        epoch_count: Number of training epochs.
        max_length: Input length the model is built for.
        model_filepath: Where the trained model is saved.
        learning_rate: Initial learning rate.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # One learning rate per epoch, linearly interpolated from start to stop.
    start = learning_rate
    stop = learning_rate * 0.001
    learning_rates = np.linspace(start, stop, epoch_count)

    # Build the model.
    model = create_model(max_length=max_length)

    optimizer = Adam(lr=learning_rate)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    # Train. ``epochs`` is the Keras 2 name of the former ``nb_epoch`` argument.
    history = model.fit(inputs, targets,
                        epochs=epoch_count,
                        batch_size=batch_size,
                        verbose=1,
                        validation_split=0.2,
                        shuffle=True,
                        callbacks=[
                            LearningRateScheduler(lambda epoch: learning_rates[epoch]),
                        ])

    # Save the trained model.
    model.save(model_filepath)
    return history


if __name__ == "__main__":
    comments = load_data("final-data.txt",[0])
    # validation_split does not shuffle before splitting, so shuffle the samples here.
    np.random.shuffle(comments)

    # Separate the (label, encoded title) pairs into model inputs and targets.
    input_values = np.array([encoded for _, encoded in comments])
    target_values = np.array([label for label, _ in comments])

    history=train(input_values, target_values, epoch_count=500)


In [None]:
import pylab as plt
# Plot training vs. validation accuracy per epoch.
# Depending on the Keras version, metrics=['accuracy'] is recorded under
# 'acc' or 'accuracy', so pick whichever key is present in the history.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.ylim([0.8,1.01])
plt.show()

In [None]:
import pylab as plt
# Plot training vs. validation loss per epoch.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.ylim([0,2])
plt.show()

In [3]:
def predict(comments, model_filepath="model.h5"):
    """Load the saved model from ``model_filepath`` and return its predictions for ``comments``."""
    return load_model(model_filepath).predict(comments)

if __name__ == "__main__":
    raw_comment = "銀河ヒッチハイク・ガイド"
    # Encode as code points, cut to 20 characters and zero-pad to exactly 20.
    comment = [ord(ch) for ch in raw_comment[:20]]
    comment.extend([0] * (20 - len(comment)))

    ret = predict(np.array([comment]))
    predict_result = ret[0][0]

    # The printed percentage is 100 - score*100 in both branches.
    verdict = "ラノベっぽくない:" if predict_result > 0.5 else "ラノベっぽい:"
    print(verdict, 100-predict_result * 100,"%")

ラノベっぽくない: 1.44072175026 %
