In this notebook we'll do further processing of the input text (in the spirit of hyperparameter tuning rather than cleaning, etc.). We'll then build and train a simple RNN classifier.

In [52]:
from __future__ import unicode_literals, print_function

import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, GRU, Bidirectional, Activation, Dropout
from keras.layers.core import Dropout
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

import utils

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
%autoreload
import utils

### Read in data

In [2]:
%%time

filepath = '../data/data-False-3.pkl'

df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 33.9 s, sys: 596 ms, total: 34.5 s
Wall time: 34.6 s


### Limit vocab size, pad sequences, and split data

In [20]:
# Max number of tokens in input sequence
maxlen = 100

# Keep only the top n words in vocabulary.  None means "all"
topn = 10000

# Fraction of samples to keep out of training
test_size = 0.1

In [4]:
%%time

# Filter out uncommon words.

df['encoded_text'] = utils.filter_top_words(list(df['encoded_text']), topn)

CPU times: user 1.67 s, sys: 48.7 ms, total: 1.72 s
Wall time: 1.72 s


In [22]:
%%time

# Pad sequences, using topn as the index for "no token"

X_train = pad_sequences(df['encoded_text'], maxlen=maxlen, value=topn, 
                        padding='post', truncating='post')

Y_train = df['encoded_label'].tolist()

CPU times: user 367 ms, sys: 61.7 ms, total: 428 ms
Wall time: 422 ms


In [23]:
# Hold out `test_size` of the samples for the final evaluation.
#
# NOTE: this cell overwrites X_train / Y_train with the training portion, so
# re-running it on its own would split the already-split data a second time;
# re-run the padding cell first.
split_seed = 42  # fixed seed so the hold-out set is identical on every run

X_train, X_test, Y_train, Y_test = train_test_split(
    X_train, Y_train, test_size=test_size, random_state=split_seed)

X_train.shape, X_test.shape

((37725, 100), (4192, 100))

### Define model

This is an important part, so I'll be explicit here rather than hiding things in `utils` :)

In [24]:
# --- architecture ---
embed_dim = 64          # embedding dimension for word vecs
gru_dim = 64            # dimension of GRU layers
bidirectional = False   # whether to use bidirectional
dense_dim = 32          # dimensionality of dense layer
dropout = 0.3           # dropout ratio

# --- training ---
batch_size = 64         # batch size
validation_split = 0.15 # Fraction of samples to keep out for validation
max_epochs = 20         # maximum number of epochs to run for

In [25]:
# About 43% of articles are conservative, so let's weight samples accordingly:
# each label gets the other label's share as its weight.

class_weight = {
    0: 0.57,
    1: 0.43,
}

In [26]:
# Save the best checkpoint so far (judged by val_acc), and stop training once
# val_acc has not improved for 3 consecutive epochs.

model_dir = 'models'

# Make sure the checkpoint directory exists before training starts; on a fresh
# checkout it is missing and the first checkpoint save would otherwise fail.
# (isdir + makedirs rather than exist_ok=True, since this notebook still
# carries Python 2 __future__ imports.)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# Encode the hyperparameters in the file name so runs are distinguishable.
basename = 'gru-{}_{}_{}_{}_{}_{}_{}_{}'.format(maxlen, topn, embed_dim, gru_dim, bidirectional, dense_dim, 
                                                dropout, batch_size) 

# The {epoch}/{val_loss}/{val_acc} fields are left unformatted here (see the
# printed path); they are filled in when each checkpoint is written.
filepath = os.path.join(model_dir, basename + '_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}.h5')

print('saving checkpoints to: {}'.format(filepath))

model_checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, 
                                   save_weights_only=False, mode='auto', period=1)

early_stopping = EarlyStopping(monitor='val_acc', patience=3)

saving checkpoints to: models/gru-100_10000_64_64_False_32_0.3_64_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}.h5


We'll store each layer as a variable so we can easily access it later.

In [27]:
# Build the named layers first (each kept in its own variable so it can be
# inspected later), then stack everything in order.

# topn + 1 rows: word indices plus the extra index `topn` used for padding.
emb_layer = Embedding(input_dim=topn + 1, output_dim=embed_dim, input_length=maxlen)

recurrent = GRU(units=gru_dim, dropout=dropout, recurrent_dropout=dropout)
gru_layer = Bidirectional(recurrent) if bidirectional else recurrent

dense_layer = Dense(dense_dim, activation='relu')
output_layer = Dense(1, activation='sigmoid')

# Embedding -> GRU -> dropout -> dense(relu) -> dropout -> sigmoid output
stack = (
    emb_layer,
    gru_layer,
    Dropout(dropout),
    dense_layer,
    Dropout(dropout),
    output_layer,
)

model = Sequential()
for layer in stack:
    model.add(layer)

In [28]:
# Compile the model: binary cross-entropy loss for the single sigmoid output,
# Adam optimizer, and accuracy tracked as 'acc' (the name the val_acc
# callbacks rely on).
#
# No `_ = ...` assignment: binding `_` in the notebook namespace stops IPython
# from updating its own "last output" variable, and compile() is not expected
# to return anything worth capturing.

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 64)           640064    
_________________________________________________________________
gru_4 (GRU)                  (None, 64)                24768     
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_8 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 33        
Total params: 666,945
Trainable params: 666,945
Non-trainable params: 0
_________________________________________________________________


### Train!

In [29]:
# Train with the class weights, validation hold-out and callbacks set up in
# the earlier cells; the returned history object is kept in `hist`.
fit_kwargs = dict(
    epochs=max_epochs,
    batch_size=batch_size,
    class_weight=class_weight,
    validation_split=validation_split,
    shuffle=True,
    callbacks=[model_checkpoint, early_stopping],
)

hist = model.fit(X_train, Y_train, **fit_kwargs)

Train on 32066 samples, validate on 5659 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


### Simple evaluation metrics on hold-out test set

In [64]:
# Accuracy

# scores[1] is the accuracy because the model was compiled with metrics=['acc'].
scores = model.evaluate(X_test, Y_test, verbose=0)

# Baseline: percentage of test samples whose label is 1, i.e. the accuracy of
# always predicting label 1.
n_label_one = np.count_nonzero(np.array(Y_test) == 1)
benchmark = 100. * n_label_one / len(Y_test)

model_accuracy = scores[1] * 100.
print('Accuracy:  {:2.1f}%'.format(model_accuracy))
print('Benchmark: {:2.1f}% (guessing all label 1)'.format(benchmark))

Accuracy:  76.0%
Benchmark: 55.5% (guessing all label 1)


In [68]:
# F1 score

# Hard labels from the sigmoid output: threshold the predicted probability at
# 0.5. This replaces Sequential.predict_classes, which is deprecated and absent
# from newer Keras releases; for a single sigmoid unit it is the same
# computation.
Y_pred = (model.predict(X_test, verbose=0) > 0.5).astype('int32').squeeze().tolist()

# Baseline: predict label 1 for every test sample.
benchmark = np.ones_like(Y_test)

print('f1 score:  {:2.2f}'.format(f1_score(Y_test, Y_pred)))
print('Benchmark: {:2.2f} (guessing all label 1): '.format(f1_score(Y_test, benchmark)))

f1 score:  0.77
Benchmark: 0.71 (guessing all label 1): 


In [70]:
# logloss

# Predicted probabilities straight from the sigmoid output, flattened to a list.
Y_pred = model.predict(X_test, verbose=0).squeeze().tolist()

# Baseline: a constant probability of 0.57 for every test sample.
benchmark = np.full(len(Y_test), 0.57)

model_logloss = log_loss(Y_test, Y_pred)
benchmark_logloss = log_loss(Y_test, benchmark)

print('Log loss:  {:2.2f}'.format(model_logloss))
print('Benchmark: {:2.2f} (guessing all probabilities 0.57): '.format(benchmark_logloss))

Log loss:  0.55
Benchmark: 0.69 (guessing all probabilities 0.57): 


We're definitely doing better than random! `:)`