In [None]:
import pandas as pd

# Read the tab-separated corpus. Column names are supplied here, so the first
# row of the file is treated as data rather than as a header.
df = pd.read_csv('data/data.tsv', sep = '\t', names = ('label', 'sentence'))

# One token list per sentence (split on single spaces) and the matching labels.
words = [sentence.split(' ') for sentence in df['sentence']]
labels = list(df['label'])

In [None]:
import numpy as np
from gensim.corpora import Dictionary

# Token used for padding short sentences up to a common length.
SPACE = ' '

# Build the vocabulary from every sentence plus a one-token document holding
# SPACE, so that the padding token is guaranteed to have an id of its own.
dic = Dictionary(words + [[SPACE]])

# Length of the longest sentence. The builtin max() returns a plain Python int
# (np.max would return np.int64), which is the safer type to pass on as a
# layer shape and as a list-repetition count.
word_maxlen = max(len(w) for w in words)

word_maxlen


In [None]:
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, Conv2D, concatenate
from keras.layers.pooling import MaxPooling2D
from keras.models import Model
from keras.layers.embeddings import Embedding

def textcnn(sentence_size, embed_size, kernel_heights = (3, 4, 5), filters = 512,
            dropout = 0.5, vocab_size = None):
    """Build a convolutional sentence classifier with a single sigmoid output.

    The network embeds a fixed-length sequence of word ids, runs one Conv2D
    branch per entry of ``kernel_heights`` (each kernel spans the full
    embedding width), max-pools every branch over all sentence positions,
    concatenates the pooled features, applies dropout and finishes with a
    one-unit sigmoid Dense layer. ``model.summary()`` is printed as a side
    effect. The returned model is not compiled.

    Parameters
    ----------
    sentence_size : int
        Number of word ids per input row.
    embed_size : int
        Dimensionality of the word embedding.
    kernel_heights : sequence of int, optional
        Number of consecutive words covered by each convolution branch.
        Defaults to ``(3, 4, 5)``.
    filters : int, optional
        Number of filters per convolution branch. Defaults to 512.
    dropout : float, optional
        Dropout rate applied to the concatenated features. Defaults to 0.5.
    vocab_size : int, optional
        Size of the embedding table. Defaults to ``len(dic)``, the size of
        the notebook-level dictionary.

    Returns
    -------
    keras.models.Model
        The assembled, uncompiled model.

    Raises
    ------
    ValueError
        If ``kernel_heights`` is empty or ``sentence_size`` is smaller than
        the tallest kernel (the convolution would have no valid position).
    """
    # Accept NumPy integer scalars but hand plain ints to the layers.
    sentence_size = int(sentence_size)
    embed_size = int(embed_size)
    kernel_heights = tuple(int(h) for h in kernel_heights)

    if not kernel_heights:
        raise ValueError('kernel_heights must contain at least one height')
    if sentence_size < max(kernel_heights):
        raise ValueError(
            'sentence_size (%d) must be at least the largest kernel height (%d)'
            % (sentence_size, max(kernel_heights)))

    if vocab_size is None:
        vocab_size = len(dic)

    tokens = Input(shape = (sentence_size,))

    x = Embedding(input_dim = vocab_size, output_dim = embed_size, input_length = sentence_size)(tokens)
    # Add a trailing channel axis so the embedded sentence can be fed to Conv2D.
    x = Reshape((sentence_size, embed_size, 1))(x)

    # All convolutions are created before any pooling layer, mirroring the
    # original layer-creation order.
    convs = [
        Conv2D(filters, kernel_size = (height, embed_size), activation = 'relu')(x)
        for height in kernel_heights
    ]
    # A kernel of height h leaves sentence_size - h + 1 positions; pooling over
    # all of them keeps one value per filter.
    pools = [
        MaxPooling2D(pool_size = (sentence_size - height + 1, 1))(conv)
        for height, conv in zip(kernel_heights, convs)
    ]

    # concatenate() needs at least two tensors in some Keras versions.
    x = concatenate(pools, axis = 1) if len(pools) > 1 else pools[0]
    x = Flatten()(x)
    x = Dropout(dropout)(x)

    output = Dense(units = 1, activation = 'sigmoid')(x)

    model = Model(tokens, output)

    model.summary()

    return model


In [None]:
# Build the network for rows of word_maxlen word ids with 256-dimensional
# embeddings, then configure it for binary classification.
model = textcnn(sentence_size = word_maxlen, embed_size = 256)
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])


In [None]:
# Training settings: number of passes over the data and the mini-batch size.
epoch, batch = 10, 50

def padding_words(d, size):
    """Encode a token list as exactly ``size`` word ids.

    Tokens are mapped to ids with ``dic.doc2idx``. Sentences shorter than
    ``size`` are right-padded with the id of the SPACE token; longer ones are
    truncated to their first ``size`` tokens so every row has the same length.

    Parameters
    ----------
    d : list of str
        Tokens of one sentence.
    size : int
        Required length of the returned list.

    Returns
    -------
    list of int
        ``size`` word ids.
    """
    ids = dic.doc2idx(d)[:size]
    pad_id = dic.doc2idx([SPACE])[0]
    return ids + [pad_id] * (size - len(ids))

# Encode every sentence as a row of word ids padded to word_maxlen.
x = np.array([padding_words(ws, word_maxlen) for ws in words])

# Pass the targets as one NumPy array: Keras documents a Python list for `y`
# as "one array per model output", which is not what a flat list of labels is.
y = np.asarray(labels)

hist = model.fit(x, y, epochs = epoch, batch_size = batch)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Draw both training curves on one set of axes, labelled so that loss and
# accuracy can be told apart.
plt.plot(hist.history['loss'], label = 'loss')
plt.plot(hist.history['acc'], label = 'acc')
plt.xlabel('epoch')
plt.legend()
plt.show()


In [None]:
# Print the model's sigmoid output for each of the first ten sentences,
# feeding them one at a time as a batch of size one.
for idx in range(10):
    encoded = padding_words(words[idx], word_maxlen)
    batch_of_one = np.array([encoded])
    print(model.predict(batch_of_one))