In [1]:
from __future__ import print_function

import pickle

import numpy as np
import pandas as pd
import keras
from keras.preprocessing import sequence
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import utils

%matplotlib inline
%load_ext autoreload

np.random.seed(7)

Using Theano backend.


# Load data and split for training

In [2]:
# Hyper-parameters for preprocessing and training, gathered in one place.
# The whole dict is handed to utils.create_rnn_model_for_training and is
# pickled next to the saved model at the end of the notebook.

p = dict(
  top_words=10000,     # Vocabulary cut-off: keep only this many top-ranked words
  max_length=100,      # Max number of words in an input sequence (pad/truncate length)
  embedding_dim=128,   # Size of each word-embedding vector
  lstm_dim=128,        # Recurrent layer size (presumably units per direction -- confirm in utils)
  dropout_U=0.0,       # Recurrent-layer dropout prob
  dropout_W=0.0,       # Recurrent-layer dropout prob
  keep_prob=0.0,       # Dense output layer dropout prob
                       # NOTE(review): named "keep_prob" but described as a dropout
                       # probability -- confirm how utils interprets it
  batch_size=128,
  nb_epoch=1,
  bidirectional=True,
)

In [3]:
# Read in the preprocessed dataframe and the vocabulary look-up tables.
# `tag` selects which preprocessing variant to load; both file names are
# derived from it so the two files always match.

tag = 'no-stops'

# Read dataframe. The path previously had the tag hard-coded, so changing
# `tag` silently kept loading the 'no-stops' frame.
df = pd.read_pickle('../data/final-dataframe-{}.pkl'.format(tag))

# Read in vocabulary dicts. Pickle streams are binary, so open with 'rb'.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('../data/final-vocab-{}.pkl'.format(tag), 'rb') as f:
  vocab = pickle.load(f)

# Use a named temporary rather than `_`, which IPython reserves for the
# last cell output.
word2idx = vocab['word2idx']
idx2word = vocab['idx2word']
ranked_list = vocab['ranked_list']
del vocab

In [4]:
# Partition the row positions 0..n-1 into 90% train / 10% test.
# Keras later takes the validation data out of the training portion.
# random_state is pinned so the same split is produced on every run.

n_rows = len(df)
train_idx, test_idx = train_test_split(range(n_rows), train_size=0.9, random_state=42)
len(train_idx), len(test_idx)

(21106, 2346)

In [5]:
# Materialise the train and test sets (validation set will be done by keras).
#
# train_idx / test_idx are row *positions* (drawn from range(n_rows)), so the
# rows are selected with .iloc. The former `df.ix` accessor is deprecated and
# removed from pandas, and would interpret the integers as labels on an
# integer index.
# utils.filter_top_words is a project helper; presumably it restricts the
# encoded words to the top `top_words` entries -- confirm in utils.

encoded_text = df['encoded_text']
encoded_label = df['encoded_label']

X_train = encoded_text.iloc[train_idx].tolist()
y_train = encoded_label.iloc[train_idx].tolist()
X_train = utils.filter_top_words(X_train, p['top_words'])

X_test = encoded_text.iloc[test_idx].tolist()
y_test = encoded_label.iloc[test_idx].tolist()
X_test = utils.filter_top_words(X_test, p['top_words'])

In [6]:
# Bring every sequence to exactly max_length tokens: shorter ones are padded
# at the end, longer ones are truncated at the end. The pad value is
# p['top_words'].

pad_kwargs = dict(
  maxlen=p['max_length'],
  value=p['top_words'],
  padding='post',
  truncating='post',
)
X_train = sequence.pad_sequences(X_train, **pad_kwargs)
X_test = sequence.pad_sequences(X_test, **pad_kwargs)

X_train.shape, X_test.shape

((21106, 100), (2346, 100))

# Compile, train, and evaluate RNN

In [7]:
# Create/compile the keras model from the parameter dict.

model = utils.create_rnn_model_for_training(p)

# summary() prints the layer table itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 100, 128)      1280128     embedding_input_1[0][0]          
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 256)           197376      embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 256)           0           bidirectional_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1)             257         dropout_1[0][0]                  
Total params: 1,477,761
Trainable params: 1,477,761
Non-trainable params: 0
_______________

In [8]:
# Train the model. The validation fraction is chosen so that the validation
# set has as many rows as the test set, i.e. 10% of the original data.

val_fraction = 1. * X_test.shape[0] / X_train.shape[0]
_ = model.fit(
  X_train,
  y_train,
  validation_split=val_fraction,
  nb_epoch=p['nb_epoch'],
  batch_size=p['batch_size']
)

Train on 18760 samples, validate on 2346 samples
Epoch 1/1


In [9]:
# Metrics on the held-out test set.

# scores[1] is reported as accuracy -- assumes the model was compiled with
# accuracy as its first metric; confirm in utils.
scores = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: {:2.2f}%'.format(scores[1] * 100.))

# Hard class predictions flattened to a plain list to match y_test.
y_test_pred = model.predict_classes(X_test).squeeze().tolist()
print('\nf1 score: {:2.2f}'.format(f1_score(y_test, y_test_pred)))

# Baseline: predict the positive class for every sample. f1_score takes
# (y_true, y_pred); the arguments were previously swapped. Binary F1 is
# symmetric, so the value is unchanged, but the order now matches the call above.
y_all_ones = np.ones_like(y_test)
print('Baseline f1 score (guessing all 1\'s): {:2.2f}'.format(f1_score(y_test, y_all_ones)))

Accuracy: 80.52%

f1 score: 0.85
Baseline f1 score (guessing all 1's): 0.75


In [13]:
# Save the trained model and the parameters it was built with, both named
# after the preprocessing tag.

# model.save returns nothing, so its result is not assigned.
model.save('gru-{}.model'.format(tag))

# Pickle writes bytes: the file must be opened in binary mode ('wb').
# Text mode fails on Python 3 and can corrupt the file on Windows.
with open('gru-{}.pkl'.format(tag), 'wb') as f:
  pickle.dump(p, f)