In [None]:
import os

import numpy as np
import pandas as pd
from keras import optimizers
from keras.layers import Input, Conv1D, Dense, Activation, MaxPooling1D, GlobalMaxPooling1D, Embedding, Dropout, BatchNormalization
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

In [None]:
# Location of the 50-dimensional GloVe vectors. Set the GLOVE_FILE environment
# variable to point at a different copy; the fallback is the original
# hard-coded location, so existing setups keep working unchanged.
GLOVE_FILE = os.environ.get('GLOVE_FILE', '/home/paperspace/Data/Glove/glove.6B.50d.txt')

In [None]:

# Load the essay training set: a tab-separated, Latin-1 encoded file whose
# first column is used as the DataFrame index.
df = pd.read_csv(
    'hewlett-essay-train.tsv',
    sep='\t',
    encoding='latin-1',
    index_col=0,
)

In [None]:
# Restrict the data to essay set 1, then split it into the essay text (model
# input) and the domain-1 score (target).
essay_set_1 = df[df['essay_set'] == 1]

X = essay_set_1['essay']
y = essay_set_1['domain1_score']

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X.values)
sequences = tokenizer.texts_to_sequences(X.values)
word_index = tokenizer.word_index

%matplotlib inline
import matplotlib.pyplot as plt

plt.hist(list(map(lambda seq: len(seq), sequences)))
plt.title("Sequence Length Histogram")
plt.xlabel("Length")
plt.ylabel("Frequency")
plt.show()


In [None]:
# Report how many distinct words the tokenizer has indexed.
vocabulary_size = len(tokenizer.word_index)
print('Set contains {} unique words'.format(vocabulary_size))

In [None]:
# One-hot encode the scores and pad the encoded essays into a single 2-D
# array (no maxlen is given, so the longest sequence sets the width).
scores = np.asarray(y.values)
labels = np_utils.to_categorical(scores)
data = pad_sequences(sequences)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# Width of each word vector; the embedding matrices below are allocated with
# this many columns, so it must match the vectors stored in GLOVE_FILE.
EMBEDDING_DIM = 50

# Derived from the prepared tensors rather than hard-coded (previously 783 and
# 13): the model's input width and output size must equal the shapes of `data`
# and `labels`, otherwise model.fit fails with a shape mismatch.
MAX_SEQUENCE_LENGTH = data.shape[1]
LABELS_COUNT = labels.shape[1]



In [None]:
def create_default_embedding():
    """Build a trainable embedding layer that does not use pre-trained vectors.

    The layer is initialised with the 'glorot_uniform' initializer. No explicit
    ``weights`` are passed: supplying an all-zero matrix (as an earlier version
    did) replaces the initializer's values, so every word would start from the
    same zero vector instead of a random one.

    Reads the module-level ``word_index``, ``EMBEDDING_DIM`` and
    ``MAX_SEQUENCE_LENGTH``.

    Returns:
        A Keras ``Embedding`` layer with ``len(word_index) + 1`` rows and
        ``EMBEDDING_DIM`` columns, with ``trainable=True``.
    """
    # One extra row so that index len(word_index) is a valid lookup.
    vocabulary_size = len(word_index) + 1

    return Embedding(vocabulary_size,
                     EMBEDDING_DIM,
                     embeddings_initializer='glorot_uniform',  # not using pre-trained embeddings
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=True)

In [None]:
def create_pretrained_embedding(train_embeddings=False):
    """Build an embedding layer initialised from the vectors in GLOVE_FILE.

    Each line of GLOVE_FILE is split on whitespace; the first token is the word
    and the remaining tokens are parsed as its float32 vector. Words in the
    module-level ``word_index`` that have no entry in the file keep an all-zero
    row.

    Args:
        train_embeddings: Passed to the layer as ``trainable``; when False the
            pre-trained vectors stay fixed during training.

    Returns:
        A Keras ``Embedding`` layer with ``len(word_index) + 1`` rows and
        ``EMBEDDING_DIM`` columns.

    Raises:
        OSError: If GLOVE_FILE cannot be opened.
        ValueError: If a vector in the file cannot be parsed as floats or does
            not have ``EMBEDDING_DIM`` components.
    """
    embeddings_index = {}
    # Explicit UTF-8: the vocabulary contains non-ASCII tokens, and relying on
    # the platform default encoding can fail or mis-decode them.
    with open(GLOVE_FILE, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return Embedding(len(word_index) + 1,
                     EMBEDDING_DIM,
                     weights=[embedding_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=train_embeddings)

In [None]:
# Start from the pre-trained vectors and let training fine-tune them.
embedding_layer = create_pretrained_embedding(train_embeddings=True)

In [None]:


# --- Model definition --------------------------------------------------------
# Integer word indices -> embedding -> two 1-D convolution blocks -> dense head.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Convolution block 1: conv -> batch norm -> ReLU -> max pooling -> dropout.
x = Conv1D(16, 3)(embedded_sequences)
x = BatchNormalization()(x)
x = Activation(activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Dropout(.5)(x)

# Convolution block 2: same pattern, but with global max pooling, which
# collapses the sequence axis so the result can feed the dense layers.
x = Conv1D(16, 3)(x)
x = BatchNormalization()(x)
x = Activation(activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(.5)(x)

# Classifier head: one hidden dense layer, then a softmax over the score classes.
x = Dense(128, activation='relu')(x)
x = Dropout(.5)(x)
preds = Dense(LABELS_COUNT, activation='softmax')(x)

model = Model(sequence_input, preds)

# --- Training ----------------------------------------------------------------
# The model is trained with Adam. An SGD optimizer object that used to be
# created here was never passed to compile(), so it has been removed.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# validation_split=.2 holds out 20% of the samples for validation.
history = model.fit(data, labels, validation_split=.2, epochs=50, batch_size=32)

print(history.history.keys())


# --- Training curves ---------------------------------------------------------
def plot_training_curve(history, metric, title, ylabel):
    """Plot one metric per epoch for the training and validation splits.

    Args:
        history: The object returned by ``model.fit``; its ``history`` dict is
            read at keys ``metric`` and ``'val_' + metric``.
        metric: Key of the training metric, e.g. ``'acc'`` or ``'loss'``.
        title: Figure title.
        ylabel: Label for the y axis.
    """
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    # The second curve comes from validation_split, not a separate test set.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


plot_training_curve(history, 'acc', 'model accuracy', 'accuracy')
plot_training_curve(history, 'loss', 'model loss', 'loss')