In this notebook we'll do further processing of the input text (in the spirit of hyperparameter tuning, rather than cleaning etc).  We'll then build and train a simple RNN classifier.

In [1]:
from __future__ import unicode_literals, print_function

import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import utils
import rnn

%load_ext autoreload

Using TensorFlow backend.


In [13]:
%autoreload
import rnn

In [14]:
np.random.seed(42)

### Read in data

In [3]:
%%time

#filepath = '../data/data-False-3.pkl'
filepath = '../data/data-True-0.pkl'

df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 1.54 s, sys: 36 ms, total: 1.58 s
Wall time: 1.58 s


### Limit vocab size, pad sequences, and split data

In [4]:
maxlen = 20      # Max number of tokens in input sequence
frac_drop = 0.0  # Fraction of tokens to randomly drop from input sequences
topn = len(vocab)    # Keep only the top n words in vocabulary
test_size = 0.15   # Fraction of samples to keep out of training

In [5]:
%%time

# Filter out uncommon words.

df['encoded_text'] = utils.filter_top_words(df['encoded_text'].tolist(), topn)

CPU times: user 116 ms, sys: 8 ms, total: 124 ms
Wall time: 115 ms


In [6]:
df_train, df_valid = train_test_split(df, test_size=test_size)
df_valid, df_test = train_test_split(df_valid, test_size=0.5)

df_train.shape, df_valid.shape, df_test.shape

((56139, 7), (4953, 7), (4954, 7))

In [7]:
# Delete the dataframe, we are done with it for now!
del df

### Define model

This is an important part, so I'll be explicit here rather than hiding things in `utils :)`

In [15]:
embed_dim = 64         # embedding dimension for word vecs
gru_dim = 64           # dimension of GRU layers
num_gru = 3
bidirectional = False    # whether to use bidirectional
dense_dim = 64         # dimensionality of dense layer
dropout = 0.5       # dropout ratio
batch_size = 64         # batch size
validation_split = 0.15 # Fraction of samples to keep out for validation
max_epochs = 100         # maximum number of epochs to run for

In [16]:
# About 43% of articles are conservative, so let's weight samples accordingly

class_weight = {0: 0.57, 1: 0.43}

In [17]:
# Let's save our best current checkpoints, and stop if we haven't improved in 3 iterations w.r.t. val_acc.

model_dir = 'models'
basename = '{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_titles'.format(gru_dim, embed_dim, dense_dim, dropout, bidirectional,
                                               maxlen, topn, test_size, batch_size, frac_drop, num_gru) 
filepath = os.path.join(model_dir, basename + '_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5')

print('saving checkpoints to: {}'.format(filepath))

model_checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, 
                                   save_weights_only=False, mode='auto', period=1)

early_stopping = EarlyStopping(monitor='val_acc', patience=max_epochs)

saving checkpoints to: models/64_64_64_0.5_False_20_34414_0.15_64_0.0_3_titles_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5


In [18]:
model = rnn.get_training_model(topn, embed_dim, dense_dim, gru_dim, num_gru, maxlen, dropout, bidirectional)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 64)            2202560   
_________________________________________________________________
gru_4 (GRU)                  (None, 20, 64)            24768     
_________________________________________________________________
gru_5 (GRU)                  (None, 20, 64)            24768     
_________________________________________________________________
gru_6 (GRU)                  (None, 64)                24768     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
__________

### Train!

In [19]:
hist = model.fit_generator(rnn.data_gen(df_train, batch_size, frac_drop, maxlen, topn), 
                epochs=max_epochs, class_weight=class_weight, 
                steps_per_epoch=int(1.*df_train.shape[0]/batch_size),
                validation_data=rnn.data_gen(df_valid, batch_size, frac_drop, maxlen, topn, validation=True),
                validation_steps=int(1.*df_valid.shape[0]/batch_size),
                callbacks=[model_checkpoint, early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
165/877 [====>.........................] - ETA: 24s - loss: 0.1742 - acc: 0.8527

KeyboardInterrupt: 

### Simple evaluation metrics on hold-out test set

In [20]:
# Load the best checkpoint

model_name = 'models/64_64_64_0.5_False_20_34414_0.15_64_0.0_3_titles_002_0.52099_0.74574_titles.h5'
model = load_model(model_name)

In [21]:
X_test, Y_test = rnn.data_getter(df_test, maxlen, topn)
X_test.shape, Y_test.shape

((4954, 20), (4954,))

In [22]:
# Accuracy

scores = model.evaluate(X_test, Y_test, verbose=0)
benchmark = 100. * len(np.where(np.array(Y_test) == 1)[0]) / len(Y_test)

print('Accuracy:  {:2.1f}%'.format(scores[1] * 100.))
print('Benchmark: {:2.1f}% (guessing all label 1)'.format(benchmark))

Accuracy:  73.2%
Benchmark: 56.4% (guessing all label 1)


In [23]:
# F1 score

Y_pred = model.predict_classes(X_test, verbose=0).squeeze().tolist()
benchmark = np.ones_like(Y_test)

print('f1 score:  {:2.2f}'.format(f1_score(Y_test, Y_pred)))
print('Benchmark: {:2.2f} (guessing all label 1): '.format(f1_score(Y_test, benchmark)))

f1 score:  0.77
Benchmark: 0.72 (guessing all label 1): 


In [24]:
# logloss

Y_pred = model.predict(X_test, verbose=0).squeeze().tolist()
benchmark = 0.57 * np.ones_like(Y_test)

print('Log loss:  {:2.2f}'.format(log_loss(Y_test, Y_pred)))
print('Benchmark: {:2.2f} (guessing 0.57 probability "liberal"): '.format(log_loss(Y_test, benchmark)))

Log loss:  0.54
Benchmark: 0.68 (guessing 0.57 probability "liberal"): 


Definitely better than random! `:)`