In this notebook we'll do further processing of the input text (in the spirit of hyperparameter tuning rather than cleaning, etc.). We'll then build and train a simple RNN classifier.

In [1]:
from __future__ import unicode_literals, print_function

import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import utils
import rnn

%load_ext autoreload

Using TensorFlow backend.


In [2]:
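# Development helper: uncomment both lines and run this cell to pick up edits
# made to the local `rnn` module without restarting the kernel.
# %autoreload
# import rnn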

In [3]:
# Seed NumPy's global random generator so that code drawing from it gives the
# same results on every top-to-bottom run of the notebook.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

### Read in data

In [4]:
%%time

# Location of the pre-processed dataset, relative to the notebook's working directory.
# NOTE(review): "False-3" in the file name presumably mirrors keep_stops / min_sents — confirm.
# NOTE: `filepath` is reassigned further down, where it holds the checkpoint path template.
filepath = '../data/data-False-3.pkl'

# read_dataset hands back six values. Of these, only `df` is used by the cells
# below: its 'encoded_text' column becomes the model input and 'encoded_label'
# the target.
# NOTE(review): if read_dataset unpickles this file, only point it at trusted data.
df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 14.3 s, sys: 376 ms, total: 14.7 s
Wall time: 14.7 s


### Limit vocab size, pad sequences, and split data

In [5]:
# --- Vocabulary ---
# Keep only the top n words in the vocabulary. None means "all".
# NOTE: later cells also use `topn` as the padding index and pass it to the
# model builder, so switching to None needs extra handling there.
topn = 10000

# --- Sequence shaping ---
# Maximum number of tokens in an input sequence.
maxlen = 200
# Fraction of tokens dropped at random from each input sequence.
frac_drop = 0.2

# --- Hold-out ---
# Fraction of samples kept out of training as the test set.
test_size = 0.1

In [6]:
%%time

# Restrict the encoded text to the `topn` most common words; the filtered
# sequences replace the original column.
# NOTE(review): how filtered-out words are represented is decided inside
# utils.filter_top_words and is not visible from this notebook.

text_col = 'encoded_text'
df[text_col] = utils.filter_top_words(df[text_col].tolist(), topn)

CPU times: user 840 ms, sys: 32 ms, total: 872 ms
Wall time: 855 ms


In [7]:
%%time

# Randomly drop a fraction (`frac_drop`) of the tokens from every input sequence.
# NOTE(review): this overwrites the column it reads from, so running the cell a
# second time applies the dropout again to already-thinned sequences.
# NOTE(review): it runs before the train/test split, so the held-out test
# sequences have tokens dropped as well — confirm that this is intended.

text_col = 'encoded_text'
df[text_col] = utils.dropout_tokens(df[text_col].tolist(), frac_drop=frac_drop)

CPU times: user 29.4 s, sys: 20 ms, total: 29.4 s
Wall time: 29.5 s


In [8]:
%%time

# Bring every sequence to exactly `maxlen` tokens: longer ones are cut at the
# end, shorter ones are filled at the end. `topn` is used as the fill value,
# i.e. as the index that stands for "no token".

pad_options = dict(maxlen=maxlen, value=topn, padding='post', truncating='post')
X_train = pad_sequences(df['encoded_text'], **pad_options)

# Targets, as a plain Python list in the same row order as X_train.
Y_train = df['encoded_label'].tolist()

CPU times: user 288 ms, sys: 4 ms, total: 292 ms
Wall time: 284 ms


In [9]:
# Hold out `test_size` of the samples as the test set.
# No random_state is passed, so the shuffle draws from NumPy's global RNG (seeded
# near the top of the notebook); the exact split therefore depends on every
# random draw made before this cell.
# NOTE(review): the cell overwrites X_train / Y_train with subsets of themselves.
# Re-running it on its own splits the already-reduced training set again, so
# re-run the padding cell first.
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=test_size)

# Show the resulting (samples, maxlen) shapes as the cell output.
X_train.shape, X_test.shape

((37725, 200), (4192, 200))

In [10]:
# Delete the dataframe, we are done with it for now! Everything the remaining
# cells use was already copied out into X_train / X_test / Y_train / Y_test.
# Re-run the data-loading cells if `df` is needed again.
del df

### Define model

This is an important part, so I'll be explicit here rather than hiding things in `utils` :)

In [11]:
# --- Architecture ---
embed_dim = 64            # embedding dimension for the word vectors
gru_dim = 64              # dimension of the GRU layer(s)
dense_dim = 32            # dimensionality of the dense layer
bidirectional = False     # whether to use a bidirectional recurrent layer

# --- Regularisation ---
dropout = 0.1             # dropout ratio

# --- Training loop ---
batch_size = 64           # samples per gradient update
max_epochs = 20           # upper bound on the number of training epochs
validation_split = 0.15   # fraction of the training samples held out for validation

In [12]:
# About 43% of the articles are conservative, so weight the two classes to
# offset that imbalance.
# NOTE(review): the evaluation cells report label 1 as the ~57% class and call
# it "liberal", which makes label 0 the conservative one — confirm the encoding.

class_weight = {
    0: 0.57,
    1: 0.43,
}

In [13]:
# Save the best checkpoint seen so far, and stop training once val_acc has gone
# 3 epochs without improving.

model_dir = 'models'

# Make sure the checkpoint directory exists before training starts. The
# checkpoint callback is not guaranteed to create it, and a missing directory
# would otherwise only surface at the first save, after a full epoch of training.
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# The main hyperparameters are baked into the file name so that checkpoints from
# differently-configured runs can be told apart.
basename = '{}_{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(gru_dim, embed_dim, dense_dim, dropout, bidirectional,
                                               maxlen, topn, test_size, batch_size, frac_drop)
# The {epoch}/{val_loss}/{val_acc} placeholders are left unformatted on purpose:
# the checkpoint callback fills them in each time it saves.
filepath = os.path.join(model_dir, basename + '_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}.h5')

print('saving checkpoints to: {}'.format(filepath))

# Write a full model file (not just weights) only when val_acc improves.
model_checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True,
                                   save_weights_only=False, mode='auto', period=1)

early_stopping = EarlyStopping(monitor='val_acc', patience=3)

saving checkpoints to: models/64_64_32_0.1_False_200_10000_0.1_64_0.2_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}.h5


In [14]:
# Build the network with the helper in the local `rnn` module. All arguments are
# passed positionally, so their order must match get_training_model's signature.
model = rnn.get_training_model(topn, embed_dim, dense_dim, gru_dim, maxlen, dropout, bidirectional)
# Print the layer stack and parameter counts as a sanity check. With the settings
# above this is Embedding -> GRU -> Dropout -> Dense -> Dropout -> Dense(1).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 64)           640064    
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                24768     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 666,945
Trainable params: 666,945
Non-trainable params: 0
_________________________________________________________________


### Train!

In [15]:
# Train for up to `max_epochs`; the callbacks save the best checkpoints and
# end the run early once val_acc stops improving. `validation_split` carves the
# validation samples out of the training data passed in here.
callbacks = [model_checkpoint, early_stopping]

hist = model.fit(
    X_train, Y_train,
    batch_size=batch_size,
    epochs=max_epochs,
    validation_split=validation_split,
    class_weight=class_weight,
    callbacks=callbacks,
    shuffle=True,
)

Train on 32066 samples, validate on 5659 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


### Simple evaluation metrics on hold-out test set

In [17]:
# Load the best checkpoint
# NOTE: this file name is hard-coded from one particular training run; the epoch,
# val_loss and val_acc of that run are part of the name. After retraining, point
# it at the checkpoint the new run actually saved, otherwise this cell either
# fails or silently evaluates a stale model.

model_name = 'models/64_64_32_0.1_False_200_10000_0.1_64_0.2_002_0.21382_0.80350.h5'
model = load_model(model_name)

In [18]:
# Accuracy on the held-out test set, next to a trivial baseline that always
# predicts label 1.

scores = model.evaluate(X_test, Y_test, verbose=0)

# Baseline accuracy = percentage of test samples whose true label is 1.
n_label_1 = np.count_nonzero(np.array(Y_test) == 1)
benchmark = 100. * n_label_1 / len(Y_test)

# NOTE(review): scores[1] is taken to be accuracy, i.e. the first metric the
# model was compiled with — confirm in the rnn module.
print('Accuracy:  {:2.1f}%'.format(scores[1] * 100.))
print('Benchmark: {:2.1f}% (guessing all label 1)'.format(benchmark))

Accuracy:  80.2%
Benchmark: 56.7% (guessing all label 1)


In [19]:
# F1 score on the held-out test set, next to the same all-ones baseline.

# Hard class predictions as a flat Python list.
Y_pred = model.predict_classes(X_test, verbose=0).squeeze().tolist()
# Baseline: predict label 1 for every test sample.
benchmark = np.ones_like(Y_test)

model_f1 = f1_score(Y_test, Y_pred)
benchmark_f1 = f1_score(Y_test, benchmark)

print('f1 score:  {:2.2f}'.format(model_f1))
print('Benchmark: {:2.2f} (guessing all label 1): '.format(benchmark_f1))

f1 score:  0.82
Benchmark: 0.72 (guessing all label 1): 


In [20]:
# Log loss on the held-out test set, next to a constant-probability baseline.

# Predicted probabilities as a flat Python list. Note that this reuses the name
# Y_pred, which held hard class labels in the F1 cell.
Y_pred = model.predict(X_test, verbose=0).squeeze().tolist()
# Baseline: the same probability of 0.57 for every test sample.
benchmark = 0.57 * np.ones_like(Y_test)

model_logloss = log_loss(Y_test, Y_pred)
benchmark_logloss = log_loss(Y_test, benchmark)

print('Log loss:  {:2.2f}'.format(model_logloss))
print('Benchmark: {:2.2f} (guessing 0.57 probability "liberal"): '.format(benchmark_logloss))

Log loss:  0.42
Benchmark: 0.68 (guessing 0.57 probability "liberal"): 


Definitely better than random! `:)`