In this notebook we'll do further processing of the input text (in the spirit of hyperparameter tuning, rather than cleaning etc).  We'll then build and train a simple RNN classifier.

In [1]:
from __future__ import unicode_literals, print_function

import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
from sklearn.preprocessing import LabelBinarizer
import numpy as np

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import utils
import rnn

%load_ext autoreload

Using TensorFlow backend.


In [2]:
#%autoreload
#import rnn

In [3]:
np.random.seed(42)

### Read in data

In [4]:
%%time

#filepath = '../data/data-False-3.pkl'
filepath = '../data/data-True-0.pkl'

df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 1.53 s, sys: 40 ms, total: 1.57 s
Wall time: 1.57 s


### Limit vocab size, pad sequences, and split data

In [5]:
maxlen = 20      # Max number of tokens in input sequence
frac_drop = 0.0  # Fraction of tokens to randomly drop from input sequences
topn = 20000     # Keep only the top n words in vocabulary
test_size = 0.0   # Fraction of samples to keep out of training

In [6]:
%%time

# Filter out uncommon words.

df['encoded_text'] = utils.filter_top_words(df['encoded_text'].tolist(), topn)

CPU times: user 156 ms, sys: 16 ms, total: 172 ms
Wall time: 155 ms


In [7]:
# df_train, df_valid = train_test_split(df, test_size=test_size)
# df_valid, df_test = train_test_split(df_valid, test_size=0.5)

# df_train.shape, df_valid.shape, df_test.shape

In [8]:
X_train = pad_sequences(df['encoded_text'], maxlen=maxlen, value=topn,  
                        padding='post', truncating='post') 

X_train, X_test, Y_train, Y_test = train_test_split(X_train, df['encoded_label'], test_size=test_size)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((66659, 20), (0, 20), (66659,), (0,))

In [9]:
lb = LabelBinarizer()
Y_train = lb.fit_transform(Y_train)
#Y_test = lb.transform(Y_test)

Y_train.shape #Y_test.shape

(66659, 32)

In [10]:
# Delete the dataframe, we are done with it for now!
del df

### Define model

This is an important part, so I'll be explicit here rather than hiding things in `utils :)`

In [11]:
embed_dim = 128         # embedding dimension for word vecs
gru_dim = 64           # dimension of GRU layers
num_gru = 1
bidirectional = False    # whether to use bidirectional
dense_dim = 64         # dimensionality of dense layer
dropout = 0.5       # dropout ratio
batch_size = 64         # batch size
validation_split = 0.1 # Fraction of samples to keep out for validation
max_epochs = 50         # maximum number of epochs to run for

In [12]:
# About 43% of articles are conservative, so let's weight samples accordingly

weights = []
for i in lb.classes_:
    weights.append(len(np.where(np.argmax(Y_train, axis=1) == i)[0]))
    
max_weight = max(weights)
weights = [1.*x/max_weight for x in weights]
class_weight = {k: w for k, w in zip(lb.classes_, weights)}

class_weight

{0: 0.09752891692954785,
 1: 0.11054153522607782,
 2: 0.4914563617245005,
 3: 0.014327024185068349,
 4: 0.601472134595163,
 5: 0.41863827549947424,
 6: 1.0,
 7: 0.2020241850683491,
 8: 0.32018927444794953,
 9: 0.09844900105152471,
 10: 0.5774185068349106,
 11: 0.15956887486855942,
 12: 0.11251314405888538,
 13: 0.06440588853838065,
 14: 0.10147213459516298,
 15: 0.3325446898002103,
 16: 0.15036803364879076,
 17: 0.4877760252365931,
 18: 0.2631440588853838,
 19: 0.1922975814931651,
 20: 0.3096740273396425,
 21: 0.08845951629863302,
 22: 0.16548370136698212,
 23: 0.10672975814931651,
 24: 0.10134069400630914,
 25: 0.6330178759200841,
 26: 0.26813880126182965,
 27: 0.5297055730809674,
 28: 0.22502628811777076,
 29: 0.19663512092534174,
 30: 0.2576235541535226,
 31: 0.08372765509989485}

In [13]:
# Let's save our best current checkpoints, and stop if we haven't improved in 3 iterations w.r.t. val_acc.

model_dir = 'models'
basename = '{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_titles'.format(gru_dim, embed_dim, dense_dim, dropout, bidirectional,
                                               maxlen, topn, test_size, batch_size, frac_drop, num_gru, len(lb.classes_)) 
filepath = os.path.join(model_dir, basename + '_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5')

print('saving checkpoints to: {}'.format(filepath))

model_checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, 
                                   save_weights_only=False, mode='auto', period=1)

early_stopping = EarlyStopping(monitor='val_loss', patience=5)

saving checkpoints to: models/64_128_64_0.5_False_20_20000_0.0_64_0.0_1_32_titles_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5


In [14]:
model = rnn.get_training_model(topn, embed_dim, dense_dim, gru_dim, num_gru, maxlen, dropout,
                               output_size=len(lb.classes_), bidirectional=bidirectional)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 128)           2560128   
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                37056     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
Total params: 2,603,424
Trainable params: 2,603,424
Non-trainable params: 0
_________________________________________________________________


### Train!

In [15]:
hist = model.fit(X_train, Y_train, epochs=max_epochs, validation_split=0.2,
                 callbacks=[model_checkpoint, early_stopping],
                 class_weight=class_weight)

Train on 53327 samples, validate on 13332 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
