https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/

In [1]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

Using TensorFlow backend.


In [2]:
 # load a clean dataset
def load_dataset(filename):
	"""Load a pickled dataset from disk.

	Args:
		filename: path of the pickle file to read.

	Returns:
		The unpickled object. Callers in this notebook unpack it as a
		(lines, labels) pair.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files you trust.
	# The context manager closes the file handle even if unpickling fails.
	with open(filename, 'rb') as handle:
		return load(handle)

In [39]:
# load the pickled training set; the loaded object is unpacked into
# two parts: the training lines and their labels
trainLines, trainLabels = load_dataset('train.pkl')

In [54]:
# sanity check: length of the first entry in trainLines
len(trainLines[0])

535

In [40]:
# fit a tokenizer
def create_tokenizer(lines):
	"""Return a Keras Tokenizer fitted on `lines`.

	Args:
		lines: the texts handed to Tokenizer.fit_on_texts.

	Returns:
		The fitted Tokenizer instance (default constructor arguments).
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

In [41]:
# fit the tokenizer on the training lines only
tokenizer = create_tokenizer(trainLines)

In [47]:
# number of entries in the fitted tokenizer's word_index
len(tokenizer.word_index)

44276

In [55]:
# calculate the maximum document length
def max_length(lines):
	"""Return the largest len() found among the items of `lines`.

	The measure is whatever len() reports for each item: tokens if the
	items are token lists, characters if they are raw strings.
	NOTE(review): confirm which of the two the pickled data contains.

	Raises:
		ValueError: if `lines` is empty (max() of an empty sequence).
	"""
	return max(map(len, lines))

In [56]:
# longest entry in trainLines (max of len() over all entries);
# passed to encode_text below as the padded sequence length
length = max_length(trainLines)

In [57]:
# display the computed maximum length
length

1380

In [58]:
# calculate max document length (same call as the earlier cell)
length = max_length(trainLines)
# calculate vocabulary size: number of word_index entries plus one
# NOTE(review): the +1 presumably accounts for index 0 being reserved for
# padding — confirm against the Keras Tokenizer documentation
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)

Max document length: 1380
Vocabulary size: 44277


In [5]:
# encode a list of lines
def encode_text(tokenizer, lines, length):
	"""Turn `lines` into a fixed-width matrix of integer word ids.

	Args:
		tokenizer: fitted tokenizer providing texts_to_sequences.
		lines: the texts to encode.
		length: width every encoded sequence is padded to (maxlen).

	Returns:
		The result of pad_sequences: integer sequences padded at the end
		(padding='post') to `length`.
	"""
	sequences = tokenizer.texts_to_sequences(lines)
	return pad_sequences(sequences, maxlen=length, padding='post')

In [59]:
# integer-encode and pad the training lines to `length`, then show the
# resulting array shape
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)
 

(1800, 1380)


In [69]:
# show the first five encoded rows (all columns)
trainX[0:5,:]

array([[   27,    27,    27, ...,     0,     0,     0],
       [   74,  1536,  1426, ...,     0,     0,     0],
       [ 7430,     3, 16201, ...,     0,     0,     0],
       [19619,  1517,   230, ...,     0,     0,     0],
       [  360,  4574,   237, ...,     0,     0,     0]], dtype=int32)

In [77]:
# define the model
def _build_channel(length, vocab_size, kernel_size):
	"""Build one input branch: Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten.

	Args:
		length: number of time steps in each input sequence.
		vocab_size: input dimension of the Embedding layer.
		kernel_size: Conv1D kernel width for this branch.

	Returns:
		(inputs, flat): the branch's Input tensor and its flattened features.
	"""
	inputs = Input(shape=(length,))
	embedding = Embedding(vocab_size, 100)(inputs)
	conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
	drop = Dropout(0.5)(conv)
	pool = MaxPooling1D(pool_size=2)(drop)
	flat = Flatten()(pool)
	return inputs, flat


# define the model
def define_model(length, vocab_size):
	"""Build and compile the three-channel convolutional classifier.

	Each channel has its own Input and Embedding and differs only in the
	Conv1D kernel size (4, 6 and 8). The flattened channel outputs are
	concatenated and fed through Dense(10, relu) and Dense(1, sigmoid).

	Args:
		length: number of time steps in each input sequence.
		vocab_size: input dimension of each Embedding layer.

	Returns:
		The compiled Model (binary cross-entropy loss, adam optimizer,
		accuracy metric). It takes three inputs, one per channel, in
		kernel-size order 4, 6, 8.
	"""
	# build the channels in the same order as before so layer creation
	# order (and therefore auto-generated layer names) is unchanged
	channels = [_build_channel(length, vocab_size, kernel_size) for kernel_size in (4, 6, 8)]
	channel_inputs = [inputs for inputs, _ in channels]
	channel_features = [flat for _, flat in channels]
	# merge
	merged = concatenate(channel_features)
	# interpretation
	dense1 = Dense(10, activation='relu')(merged)
	outputs = Dense(1, activation='sigmoid')(dense1)
	model = Model(inputs=channel_inputs, outputs=outputs)
	# compile
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	# summarize: summary() prints by itself and returns None, so wrapping it
	# in print() only appended a stray "None" line to the output
	model.summary()
	#plot_model(model, show_shapes=True, to_file='multichannel.png')
	return model

In [78]:
# build and compile the model for the computed sequence length and
# vocabulary size (define_model also prints the layer summary)
model = define_model(length, vocab_size)

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
input_27 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
embedding_25 (Embedding)        (None, 1380, 100)    4427700     input_25[0][0]                   
____________________________________________________________________________________________

In [66]:
# NOTE(review): pydot is not referenced in this cell; presumably it was
# imported for the commented-out plot_model call in define_model — confirm,
# and move it to the imports cell if it is still needed
import pydot
# define model (rebuilds a fresh model, replacing any earlier `model`)
model = define_model(length, vocab_size)
# fit model: the same encoded array is supplied once per model input
# (the model is built with three inputs)
model.fit([trainX,trainX,trainX], array(trainLabels), epochs=10, batch_size=16)
# save the model to model.h5 so the evaluation cell can reload it
model.save('model.h5')

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 1380)         0                                            
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 1380, 100)    4427700     input_16[0][0]                   
____________________________________________________________________________________________

In [73]:
# Evaluation cell: reloads the data and the saved model, then reports accuracy.
# It relies on load_dataset, create_tokenizer, max_length and encode_text
# being defined by earlier cells; they are not redefined here.
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
 

 
# load datasets
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')
 
# create tokenizer (fitted on the training lines only)
tokenizer = create_tokenizer(trainLines)
# calculate max document length (from the training lines only)
length = max_length(trainLines)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# encode data: both splits use the training tokenizer and training length
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
print(trainX.shape, testX.shape)
 
# load the model saved by the training cell
model = load_model('model.h5')
 
# evaluate model on training dataset (same array passed once per model input)
loss, acc = model.evaluate([trainX,trainX,trainX], array(trainLabels), verbose=0)
print('Train Accuracy: %f' % (acc*100))
 
# evaluate model on test dataset
loss, acc = model.evaluate([testX,testX,testX],array(testLabels), verbose=0)
print('Test Accuracy: %f' % (acc*100))

Max document length: 1380
Vocabulary size: 44277
(1800, 1380) (200, 1380)
Train Accuracy: 100.000000
Test Accuracy: 86.500001
