In [1]:
from __future__ import unicode_literals, print_function

import os
import pickle
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
from sklearn.preprocessing import LabelBinarizer
import numpy as np
import pandas as pd

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import utils
import rnn

%load_ext autoreload

Using TensorFlow backend.


In [2]:
# Fix NumPy's global random state so NumPy-driven randomness repeats between runs.
SEED = 42
np.random.seed(SEED)

In [3]:
%%time

# Location of the preprocessed dataset, relative to the notebook's working directory.
# NOTE(review): the .pkl suffix suggests a pickle file - only load it from a trusted
# source; confirm how utils.read_dataset deserialises it.
filepath = '../data/title-1-True-clean.pkl'

# read_dataset hands back six values, unpacked here. Of these, only `df` is used by the
# cells shown below; presumably w2i / i2w are word<->index lookups for `vocab` - TODO
# confirm in utils.
df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 1.49 s, sys: 36 ms, total: 1.52 s
Wall time: 1.52 s


In [4]:
# Input-shaping knobs:
#   maxlen - max number of tokens in an input sequence
#   topn   - keep only the top n words in the vocabulary
maxlen, topn = 20, 6747

In [5]:
%%time

# Filter out uncommon words.

df['encoded_text'] = utils.filter_top_words(df['encoded_text'].tolist(), topn)

CPU times: user 116 ms, sys: 8 ms, total: 124 ms
Wall time: 115 ms


In [6]:
# Force every encoded sequence to exactly `maxlen` tokens: longer sequences lose their
# tail, shorter ones are right-padded with the index `topn`.
X = pad_sequences(
    df['encoded_text'],
    maxlen=maxlen,
    padding='post',
    truncating='post',
    value=topn,
)

In [7]:
# Training target: the label column of the dataframe.
y = df['encoded_label']

# Displayed as a sanity check on the number of labels.
y.shape

(66658,)

In [8]:
# Delete the dataframe, we are done with it for now!
# Only the name `df` is removed here; `X` and `y` were built from it above and remain usable.
del df

In [9]:
# ---- Architecture ----
# Embedding dimension for the word vectors.
embed_dim = 256
# Number of GRU layers used in serial, the dimension of each, and their activation.
num_gru = 1
gru_dim = 256
gru_activation = 'tanh'
# Whether the recurrent layers are bidirectional.
bidirectional = False
# Dimensionality of the dense layer.
dense_dim = 256
# Dropout ratio.
dropout = 0.5

# ---- Training ----
# Batch size.
batch_size = 64
# Fraction of samples to keep out for validation.
validation_split = 0.1
# Maximum number of epochs to run for.
max_epochs = 50

In [10]:
# Save a checkpoint whenever val_loss reaches a new best, and stop training once
# val_loss has gone 3 epochs without improving.

model_dir = 'models'
# Older Keras ModelCheckpoint versions fail at save time when the target directory is
# missing, so create it up front (written without exist_ok to stay Python 2 compatible).
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# The hyperparameters are baked into the file name so each checkpoint records its settings.
# NOTE(review): the literal 1 matches the value passed to rnn.get_training_model in the
# model-building cell (presumably the output size) - confirm and consider naming it.
basename = '{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(gru_dim, num_gru, embed_dim, dense_dim,
                dropout, bidirectional, maxlen, topn, batch_size, 1, gru_activation)
# The {epoch} / {val_loss} / {val_acc} placeholders are left in the template for
# ModelCheckpoint to fill in when it saves.
filepath = os.path.join(model_dir, basename + '_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_cleaned.h5')

print('saving checkpoints to: {}'.format(filepath))

# Keep only checkpoints that improve on the best val_loss seen so far (full model, not
# just weights).
model_checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True,
                                   save_weights_only=False, mode='auto', period=1)

# Abort training after 3 consecutive epochs without a val_loss improvement.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

saving checkpoints to: models/256_1_256_256_0.5_False_20_6747_64_1_tanh_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_cleaned.h5


In [11]:
# Build the network from the hyperparameters defined earlier. The arguments are passed
# positionally, so their order has to match rnn.get_training_model's signature.
model = rnn.get_training_model(
    topn,
    embed_dim,
    dense_dim,
    gru_dim,
    num_gru,
    maxlen,
    dropout,
    bidirectional,
    1,
    gru_activation,
)
# Show the layer-by-layer summary of the resulting model.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 256)           1727488   
_________________________________________________________________
gru_1 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 2,187,521
Trainable params: 2,187,521
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train with the configured batch size. Without an explicit batch_size Keras falls back
# to its default of 32, which would contradict the value recorded in the checkpoint
# file name.
# NOTE: validation_split holds out the *last* fraction of (X, y) as given, before any
# shuffling, so the validation set depends on the row order of the dataset.
hist = model.fit(X, y, batch_size=batch_size, epochs=max_epochs,
                 validation_split=validation_split,
                 callbacks=[model_checkpoint, early_stopping])

Train on 59992 samples, validate on 6666 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
