In [5]:
import json
import os

import pandas as pd
import numpy as np

from keras.models import load_model, Model, Sequential
from keras.optimizers import Nadam
from keras.layers import Embedding, Conv1D, MaxPooling1D
from keras.layers.core import Dense, Activation, Dropout ,Flatten
from keras.layers.recurrent import LSTM
from keras.constraints import maxnorm
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Every encoded phrase is brought to exactly this many token ids by pad_sequences.
MAX_SEQUENCE_LENGTH = 60

# Load the tab-separated training set (first row is the header) and pull out
# the text column and its sentiment label column.
D = pd.read_csv('../../data/text_data/train.tsv', sep='\t', header=0)
lines, labels = D['Phrase'], D['Sentiment']

# Fit the vocabulary on all phrases, then encode each phrase as a sequence of
# integer word ids and pad it to MAX_SEQUENCE_LENGTH.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(lines)
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Hold out the first 20% of the rows for validation; the rest is training data.
# The rows are split in file order (no shuffling).
n = len(lines)
split_at = int(n*0.2)
X_val, X_train = data[:split_at], data[split_at:]

# One-hot encode the sentiment labels over 5 classes.
Y_val = to_categorical(labels[:split_at], 5)
Y_train = to_categorical(labels[split_at:], 5)

print(X_train[:5])

# Make sure the directory used for model checkpoints exists.
if not os.path.exists('weights'):
    os.makedirs('weights')

Found 15288 unique tokens.
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   13   34   26 3397 1808 7850
   360    1    2 1394]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0   34   26 3397 1808 7850
   360    1    2 1394]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0   26 3397 1808 7850
   360    1    2 1394]
 [   0    0    0    0    0    0    0    

In [6]:
# Word -> row-index lookup for the pre-trained embedding table, stored as JSON.
with open('dictionary.json') as f:
    dictionary = json.load(f)

# Load the saved 'word2vec' model and take its first weight array as the
# embedding table (one row per entry of `dictionary`).
# NOTE(review): the name `model` is rebound to the classifier in the
# model-definition cell; it is kept here so existing references keep working.
model = load_model('word2vec')
embeddings = model.get_weights()[0]

# Build a matrix aligned with the tokenizer's word ids: row i holds the
# pre-trained vector of the word whose tokenizer index is i. Rows that are
# never assigned (including words missing from `dictionary`) stay all-zero.
# The width is read from the loaded weights instead of being hard-coded to 128,
# so a word2vec model of any dimension works.
# NOTE(review): the classifier cell visible in this notebook builds its
# Embedding layer without these weights - confirm whether that is intended.
embedding_matrix = np.zeros((len(word_index) + 1, embeddings.shape[1]))
for word, i in word_index.items():
    idx = dictionary.get(word, None)
    if idx is not None:
        embedding_matrix[i] = embeddings[idx]


  return cls(**config)


In [7]:
# 1D-CNN sentiment classifier: a trainable 32-d embedding, two
# Conv1D + MaxPooling1D stages (with dropout in between), then a 5-way softmax.
model = Sequential()
model.add(Embedding(len(word_index) + 1, 32, input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(5, activation='softmax'))

model.summary()

# Training hyper-parameters.
epochs = 60
batch_size = 32 #32
# NOTE: despite the variable name, the optimizer is Nadam (name kept so any
# later cell that refers to `adam` keeps working).
adam = Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

# Callbacks, all driven by validation loss:
#  - checkpointer: writes the model to disk only when val_loss improves;
#  - reduce_lr: multiplies the learning rate by 0.8 after any epoch without
#    improvement (patience=0), never going below 1e-6;
#  - early_stopping: stops training after 6 epochs without improvement.
checkpoint_path = "weights/weights_25.hdf5"
checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True, monitor="val_loss")
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=0, verbose=1, mode='auto', cooldown=0, min_lr=1e-6)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=6, verbose=1)

model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=1,
          validation_data=(X_val, Y_val), callbacks=[reduce_lr, checkpointer, early_stopping])

# After fit() the model holds the weights of the LAST epoch, which (when early
# stopping fires) is several epochs past the best val_loss. Reload the best
# checkpoint so the reported accuracy matches the model that was saved.
model.load_weights(checkpoint_path)

scores = model.evaluate(X_val, Y_val, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 32)            489248    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 60, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 30, 32)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 32)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 30, 32)            2080      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 15, 32)            0         
___________________________________________________________