In this notebook we'll do further processing of the input text (in the spirit of hyperparameter tuning rather than cleaning). We'll then build and train a simple RNN classifier.

In [1]:
from __future__ import unicode_literals, print_function

import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import utils
import rnn

%load_ext autoreload

Using TensorFlow backend.



In [13]:
# Development helper: re-run this cell after editing the local `rnn` module to pick up
# the changes without restarting the kernel (hence the out-of-order execution count).
%autoreload
import rnn

In [2]:
# Seed NumPy's global random state so that runs executed top-to-bottom are repeatable.
np.random.seed(seed=42)

### Read in data

In [3]:
%%time

# Alternative preprocessed dataset, kept for quick switching:
#filepath = '../data/data-False-3.pkl'
filepath = '../data/data-True-0.pkl'

# Load the dataframe together with its preprocessing metadata and vocabulary objects.
# NOTE(review): presumably the "True"/"0" in the file name correspond to
# keep_stops/min_sents, and w2i/i2w are word<->index lookups -- confirm in utils.
# NOTE(review): the .pkl extension suggests a pickle; only load files from a trusted source.
df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 3.39 s, sys: 78.2 ms, total: 3.47 s
Wall time: 3.47 s


### Limit vocab size, pad sequences, and split data

In [4]:
# Preprocessing and data-split settings used by the cells below.
maxlen = 20      # Max number of tokens in input sequence (sequences are padded/truncated to this length)
frac_drop = 0.0  # Fraction of tokens to randomly drop from input sequences (only passed to the commented-out generator path and recorded in the checkpoint name)
topn = 5000    # Keep only the top n words in vocabulary; also used as the padding value
test_size = 0.1   # Fraction of samples to keep out of training

In [5]:
%%time

# Filter out uncommon words.
# Overwrites the 'encoded_text' column in place with the filtered sequences.
# NOTE(review): presumably tokens outside the `topn` most frequent are dropped or
# remapped -- confirm in utils.filter_top_words.

df['encoded_text'] = utils.filter_top_words(df['encoded_text'].tolist(), topn)

CPU times: user 121 ms, sys: 7.12 ms, total: 128 ms
Wall time: 124 ms


In [6]:
# Earlier dataframe-level train/valid/test split, kept for reference only.
# The active split is performed on the padded arrays with a single train_test_split call.
# df_train, df_valid = train_test_split(df, test_size=test_size)
# df_valid, df_test = train_test_split(df_valid, test_size=0.5)

# df_train.shape, df_valid.shape, df_test.shape

In [7]:
# Pad (or truncate) every encoded sequence at its end to exactly `maxlen` tokens,
# using `topn` as the padding value.
# NOTE(review): this assumes real word ids are all < topn after filtering -- confirm
# in utils.filter_top_words.
X_all = pad_sequences(df['encoded_text'], maxlen=maxlen, value=topn,
                      padding='post', truncating='post')

# Hold out `test_size` of the samples for the final evaluation.  The split is pinned
# with an explicit random_state so it does not depend on how much of NumPy's global
# random state earlier cells consumed (i.e. on cell execution order); otherwise a
# checkpoint loaded in a later session could be scored on samples it was trained on.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_all, df['encoded_label'], test_size=test_size, random_state=42)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((59441, 20), (6605, 20), (59441,), (6605,))

In [8]:
# Delete the dataframe, we are done with it for now!
# After this cell, any cell that reads `df` requires re-running the data-loading cell first.
del df

### Define model

This is an important part, so I'll be explicit here rather than hiding things in `utils` :)

In [9]:
# Model architecture and training hyperparameters.
embed_dim = 128         # embedding dimension for word vecs
gru_dim = 16           # dimension of GRU layers (output width of each GRU)
num_gru = 5            # number of stacked GRU layers
bidirectional = False    # whether to use bidirectional
dense_dim = 64         # dimensionality of dense layer
dropout = 0.5       # dropout ratio
batch_size = 64         # batch size
validation_split = 0.1 # Fraction of samples to keep out for validation
max_epochs = 10         # maximum number of epochs to run for

In [10]:
# About 43% of articles are conservative, so let's weight samples accordingly:
# each class is weighted by the other class's share, so the rarer class counts more.
# NOTE(review): presumably label 0 = conservative (~43%) and label 1 = liberal (~57%),
# which matches the 57% "label 1" benchmark in the evaluation cells -- confirm against
# the label encoding.

class_weight = {0: 0.57, 1: 0.43}

In [11]:
# Save the best checkpoint seen so far (w.r.t. val_acc) and stop early once val_acc has
# not improved for `patience` consecutive epochs.

model_dir = 'models'
patience = 3  # epochs without val_acc improvement before stopping

# The checkpoint callback writes straight to this path, so the directory has to exist
# before training starts (os.path.isdir/makedirs keeps this Python 2 compatible).
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# Encode the hyperparameters in the file name so checkpoints from different
# configurations do not overwrite each other.
basename = '{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_titles'.format(
    gru_dim, embed_dim, dense_dim, dropout, bidirectional,
    maxlen, topn, test_size, batch_size, frac_drop, num_gru)

# The {epoch}/{val_loss}/{val_acc} placeholders are left unformatted on purpose:
# Keras fills them in each time a checkpoint is written.
filepath = os.path.join(model_dir, basename + '_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5')

print('saving checkpoints to: {}'.format(filepath))

model_checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True,
                                   save_weights_only=False, mode='auto', period=1)

# Previously patience was set to max_epochs, which meant early stopping could never
# trigger within a run; use the intended 3-epoch patience instead.
early_stopping = EarlyStopping(monitor='val_acc', patience=patience)

saving checkpoints to: models/16_128_64_0.5_False_20_5000_0.1_64_0.0_5_titles_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5


In [12]:
# Build the network from the hyperparameters defined above; the architecture itself
# lives in the local `rnn` module.
# NOTE(review): the summary shows an embedding with 640128 params, i.e. topn + 1 rows of
# embed_dim -- presumably the extra row is for the padding id `topn`; confirm in
# rnn.get_training_model.
model = rnn.get_training_model(topn, embed_dim, dense_dim, gru_dim, num_gru, maxlen, dropout, bidirectional)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 128)           640128    
_________________________________________________________________
gru_1 (GRU)                  (None, 20, 16)            6960      
_________________________________________________________________
gru_2 (GRU)                  (None, 20, 16)            1584      
_________________________________________________________________
gru_3 (GRU)                  (None, 20, 16)            1584      
_________________________________________________________________
gru_4 (GRU)                  (None, 20, 16)            1584      
_________________________________________________________________
gru_5 (GRU)                  (None, 16)                1584      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
__________

### Train!

In [14]:
# Legacy generator-based training path (expects the dataframe-level split), kept for reference:
# hist = model.fit_generator(rnn.data_gen(df_train, batch_size, frac_drop, maxlen, topn), 
#                 epochs=max_epochs, class_weight=class_weight, 
#                 steps_per_epoch=int(1.*df_train.shape[0]/batch_size),
#                 validation_data=rnn.data_gen(df_valid, batch_size, frac_drop, maxlen, topn, validation=True),
#                 validation_steps=int(1.*df_valid.shape[0]/batch_size),
#                 callbacks=[model_checkpoint, early_stopping])

# Train on the padded arrays.  The configured batch_size, validation_split and
# class_weight are passed explicitly: previously they were defined but never handed to
# fit(), so training ran with Keras's default batch size (while the checkpoint name
# recorded the configured one) and the class weights were silently ignored.
hist = model.fit(X_train, Y_train,
                 epochs=max_epochs,
                 batch_size=batch_size,
                 validation_split=validation_split,
                 class_weight=class_weight,
                 callbacks=[model_checkpoint, early_stopping])

Train on 53496 samples, validate on 5945 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

KeyboardInterrupt: 

### Simple evaluation metrics on hold-out test set

In [37]:
# Load the best checkpoint
# NOTE(review): this hard-coded file name encodes different hyperparameters
# (64/64/64 dims, vocabulary 34414, 1 GRU layer) than the configuration cells in this
# notebook (16/128/64 dims, topn 5000, 5 GRU layers), so it was not produced by the
# training cell as currently configured -- confirm that X_test is preprocessed the way
# this model expects before trusting the metrics.

model_name = 'models/64_64_64_0.5_False_20_34414_0.1_64_0.0_1_titles_001_0.52713_0.73474_titles.h5'
model = load_model(model_name)

In [21]:
# Leftover from the generator-based pipeline, kept for reference only; X_test and Y_test
# are produced by the train_test_split cell instead.
# X_test, Y_test = rnn.data_getter(df_test, maxlen, topn)
# X_test.shape, Y_test.shape

((4954, 20), (4954,))

In [38]:
# Accuracy on the hold-out set, compared with the trivial baseline of always
# predicting label 1.  (scores[1] is reported as accuracy; presumably the model was
# compiled with accuracy as its first metric.)

scores = model.evaluate(X_test, Y_test, verbose=0)

# Baseline accuracy (in percent) == share of samples whose true label is 1.
benchmark = 100. * np.count_nonzero(np.asarray(Y_test) == 1) / len(Y_test)

print('Accuracy:  {:2.1f}%'.format(scores[1] * 100.))
print('Benchmark: {:2.1f}% (guessing all label 1)'.format(benchmark))

Accuracy:  73.3%
Benchmark: 57.0% (guessing all label 1)


In [39]:
# F1 score of the hard class predictions, compared with the trivial baseline of
# always predicting label 1.

Y_pred = np.squeeze(model.predict_classes(X_test, verbose=0)).tolist()
benchmark = np.ones_like(Y_test)

model_f1 = f1_score(Y_test, Y_pred)
benchmark_f1 = f1_score(Y_test, benchmark)

print('f1 score:  {:2.2f}'.format(model_f1))
print('Benchmark: {:2.2f} (guessing all label 1): '.format(benchmark_f1))

f1 score:  0.78
Benchmark: 0.73 (guessing all label 1): 


In [40]:
# Log loss of the predicted probabilities, compared with a constant prediction of
# 0.57 for every sample.

Y_pred = np.squeeze(model.predict(X_test, verbose=0)).tolist()
benchmark = np.ones_like(Y_test) * 0.57

model_logloss = log_loss(Y_test, Y_pred)
benchmark_logloss = log_loss(Y_test, benchmark)

print('Log loss:  {:2.2f}'.format(model_logloss))
print('Benchmark: {:2.2f} (guessing 0.57 probability "liberal"): '.format(benchmark_logloss))

Log loss:  0.53
Benchmark: 0.68 (guessing 0.57 probability "liberal"): 


Definitely better than random! `:)`