In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

import numpy as np
import pandas as pd

# Hyperparameters, grouped by the layer that consumes them.

# Embedding layer
embedding_dims = 20   # length of the vector learned for each token

# Conv1D layer
filters = 250         # number of convolution filters
kernel_size = 3       # consecutive tokens covered by each filter

# Dense hidden layer
hidden_dims = 250     # number of units

# Not referenced by the code visible in this notebook section.
# NOTE(review): max_features is presumably a vocabulary cap and batch_size the
# intended mini-batch size -- confirm whether they should be wired in.
max_features = 5000
batch_size = 32

# Load the labelled corpus (three columns: id, text, author) and take a first look.
train = pd.read_csv('train.csv')
print(train.head())
print("training data shape:", train.shape)

from keras.utils import to_categorical

# Author code -> class index. The indices are then one-hot encoded so that
# each row of `labels` has one column per author.
a2c = dict(EAP=0, HPL=1, MWS=2)
labels = to_categorical(np.array([a2c[author] for author in train["author"]]))
print("train labels: ", labels)

# Raw sentences, kept as a pandas Series for the tokenizer.
text = train["text"]
print("train text shape: ", text.shape)

from keras.preprocessing.text import Tokenizer

# Word-level tokenizer: lower-cases the text and splits on single spaces.
# filters="" disables character stripping, so punctuation stays attached to
# the word it touches.
tokenizer = Tokenizer(char_level=False, split=" ", lower=True, filters="")
tokenizer.fit_on_texts(text)
docs = tokenizer.texts_to_sequences(text)
print("text: ", text[0])
print("tokenized: ", docs[0])

# Token count of the longest document; every sequence is padded to this length.
maxlen = np.array([len(seq) for seq in docs]).max()
print("max doc length: ", maxlen)

# Turn the ragged list of index sequences into one rectangular 2-D array
# (pad_sequences is left on its default padding/truncation settings).
print('Pad sequences (samples x time)')
docs = sequence.pad_sequences(docs, maxlen=maxlen)
print('padded docs shape:', docs.shape)

from sklearn.model_selection import train_test_split

# Hold out 20% of the padded documents (and their one-hot labels) for
# validation. The split is shuffled, so a fixed random_state is supplied to
# make the partition identical on every Restart & Run All; without it the
# validation set changes between runs and scores are not comparable.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    docs, labels, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# Size of the embedding table: the largest token index present in the padded
# documents, plus one because index 0 (the padding value) also needs a row.
input_dim = np.max(docs) + 1

# Text CNN, declared as an ordered list of layers.
model = Sequential([
    # Map each token index to a dense vector of embedding_dims values.
    Embedding(input_dim, embedding_dims, input_length=maxlen),
    Dropout(0.2),

    # Learn `filters` detectors over windows of kernel_size consecutive tokens.
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    # Keep each filter's strongest response across the whole sequence.
    GlobalMaxPooling1D(),

    # Plain fully connected hidden layer.
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),

    # One output unit per author, normalised into class probabilities.
    Dense(3, activation='softmax'),
])

# The network ends in a 3-unit softmax and the targets are one-hot vectors, i.e.
# a single-label multi-class problem, so the matching loss is
# categorical_crossentropy. binary_crossentropy would score every output unit
# as an independent yes/no problem and, in Keras 2, makes the 'accuracy' metric
# resolve to binary accuracy, which overstates the real classification accuracy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


from keras.callbacks import EarlyStopping

# Train for at most 10 epochs, stopping early once val_loss has gone two
# epochs without improving.
# NOTE(review): the parameter section defines batch_size = 32, but 16 is
# hard-coded here -- confirm which value is intended.
model.fit(x_train, y_train,
          batch_size=16,
          validation_data=(x_test, y_test),
          epochs=10,
          callbacks=[EarlyStopping(patience=2, monitor='val_loss')])