In [28]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Flatten
from keras.optimizers import SGD
from keras.optimizers import Adam
from keras.constraints import maxnorm
from keras.callbacks import TensorBoard
from keras.callbacks import LearningRateScheduler
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Load the train and test CSV files, using their 'id' column as the index.
data_train, data_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# All texts from both splits in one flat list (train first, then test);
# this is the corpus the tokenizer is fitted on.
documents = [
    *data_train['text'].tolist(),
    *data_test['text'].tolist(),
]

In [4]:
# Vocabulary cap: passed to the tokenizer as num_words and reused later as the
# input size of the Embedding layer.
vocab_size = 2000 # upper bound on the word indices the tokenizer will emit
tokenizer = Tokenizer(num_words=vocab_size)
# Fitted on train and test texts together, so both splits share one word index.
tokenizer.fit_on_texts(documents)

In [5]:
# Turn every text into its list of integer word indices using the fitted tokenizer.
encoded_train_docs, encoded_test_docs = (
    tokenizer.texts_to_sequences(list(frame['text']))
    for frame in (data_train, data_test)
)

Adding additional columns to data.

In [6]:
# Store each encoded document next to its source row. Wrapping the lists in a
# Series that carries the frame's own index keeps rows aligned by 'id'.
data_train['tokens'], data_test['tokens'] = (
    pd.Series(encoded_docs, index=frame.index)
    for encoded_docs, frame in (
        (encoded_train_docs, data_train),
        (encoded_test_docs, data_test),
    )
)

In [7]:
# Number of tokens in each encoded document; used below to filter long texts.
data_train['tokens_len'] = data_train['tokens'].map(len)
data_test['tokens_len'] = data_test['tokens'].map(len)

Removing outliers (texts longer than `max_length` tokens). This is a quick-and-dirty filter for now.

In [8]:
# Longest token sequence that is kept; see the EDA for the exploration of this length.
max_length = 70

# Keep only rows whose token count does not exceed max_length.
# NOTE(review): the same filter is applied to data_test, so any prediction built
# from data_test afterwards would lack the dropped ids -- confirm this is intended.
data_train = data_train.loc[data_train['tokens_len'].le(max_length)]
data_test = data_test.loc[data_test['tokens_len'].le(max_length)]

Unifying all sequences to one length and preparing labels.

In [9]:
# Bring every training sequence to exactly max_length entries; padding='post'
# puts the padding at the end of each sequence.
# Rows longer than max_length were already filtered out of data_train, so
# nothing is truncated here.
padded_docs = pad_sequences(data_train['tokens'], maxlen=max_length, padding='post')

In [10]:
# Learn the set of author labels, then turn the 'author' column into an
# indicator matrix. The fitted encoder is kept so the mapping stays available.
encoder = LabelBinarizer().fit(data_train['author'])
labels = encoder.transform(data_train['author'])

In [11]:
# Sanity check: both arrays need the same number of rows, because the next
# cell joins them column-wise before shuffling.
print(padded_docs.shape, labels.shape)

(19411, 70) (19411, 3)


Shuffling and splitting the dataset into train/validation subsets.

In [12]:
# Shuffle labels and features as one matrix so every row keeps its own label.
#
# The shuffle uses a dedicated, seeded generator: the row order -- and with it
# the train/validation split taken from X and Y in the next cell -- is then
# identical on every run, and numpy's global random state is left untouched.
shuffle_seed = 42

# Number of label columns produced by the encoder. Derived from `labels`
# instead of being hard-coded, so the slicing below stays correct if the set
# of classes changes.
label_width = labels.shape[1]

full_dataset = np.concatenate((labels, padded_docs), axis=1)
np.random.RandomState(shuffle_seed).shuffle(full_dataset)  # in place, along rows

# Split the shuffled matrix back into targets (first columns) and inputs (rest).
Y = full_dataset[:, :label_width]
X = full_dataset[:, label_width:]

In [13]:
# The first 80% of the shuffled rows are used for training, the remaining 20%
# are held out. Later cells pass X_test / Y_test to fit() as validation data.
trainset_num = int(len(X) * 0.8)

X_train, X_test = np.split(X, [trainset_num])
Y_train, Y_test = np.split(Y, [trainset_num])

Building the LSTM model.

In [14]:
# Model hyper-parameters.
embeding_to_size = 200  # output dimension of the Embedding layer
# Taken from the encoded label matrix rather than hard-coded, so the size of
# the softmax layer always matches the number of label columns.
classes_num = labels.shape[1]
print(vocab_size, max_length, embeding_to_size)

2000 70 200


In [29]:
# Embedding -> one small LSTM (5 units, relu activation) -> softmax classifier.
# The Dropout(0.2) layers after the embedding and after the LSTM are left out
# in this variant.
model = Sequential([
    Embedding(vocab_size, embeding_to_size, input_length=max_length),
    LSTM(5, activation='relu'),
    Dense(classes_num, activation='softmax', kernel_constraint=maxnorm(2.)),
])

# One-hot targets, hence categorical cross-entropy; accuracy is tracked as well.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [22]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 70, 200)           400000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 5)                 4120      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 18        
Total params: 404,138
Trainable params: 404,138
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Training callbacks.
# TensorBoard: 'logs/tb.log' is passed as the log directory; weight histograms
# are written every epoch and gradient histograms are enabled.
tb = TensorBoard(
    log_dir='logs/tb.log',
    histogram_freq=1,
    write_grads=True,
)
# EarlyStopping: stop once the monitored quantity has not improved for 3 epochs.
es = EarlyStopping(patience=3)

In [34]:
# Train for up to 5 epochs in batches of 32. The held-out slice (X_test, Y_test)
# is used as validation data, and both callbacks (TensorBoard logging and
# early stopping) are attached.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=5, batch_size=32, callbacks=[tb, es])

Train on 15528 samples, validate on 3883 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa6713f58d0>

In [35]:
# Save the model in its current state to 'last_model'.
model.save('last_model')

In [25]:
# Call fit() again on the same model object: 2 epochs, batch size 64,
# no callbacks this time.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=2, batch_size=64)

Train on 15528 samples, validate on 3883 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa672ec5860>

In [26]:
# Save a named snapshot of the model.
# NOTE(review): the file name presumably encodes accuracy figures and the
# configuration (relu, 5 LSTM units, without dropout) -- confirm, and consider
# stating that in the notebook text, since a re-run overwrites this file with
# whatever the model scores then.
model.save('m_77_72_r5_wodo')

In [27]:
# Another 2 epochs with batch size 64 and no callbacks.
# NOTE(review): this repeats an earlier fit() call verbatim, and no save follows
# it in the visible part of the notebook -- confirm whether this cell is still needed.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=2, batch_size=64)

Train on 15528 samples, validate on 3883 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa672d46780>