In [4]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
import ktrain
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
def pad_or_cut(x, max_len):
    """
    Either pads or cuts a document's embedding matrix so that it has
    exactly `max_len` rows.

    Args:
        x -> numpy array whose first axis is the one being padded/cut
             (for a document's embedding matrix: one row per embedding).
        max_len -> int.  Number of rows in the returned array.

    Returns:
        Array with `max_len` rows and the same trailing shape and dtype
        as `x`.  Longer inputs are truncated; shorter inputs are padded
        with zeros at the end.
    """
    # Cut to the maximum length; cheaper than testing
    x = x[:max_len]

    # Pad with zeros.  The padding is created with the dtype of `x` so that
    # padded documents are not silently upcast to float64 while cut documents
    # keep their original dtype.
    len_diff = max_len - len(x)
    if len_diff > 0:
        padding = np.zeros((len_diff,) + x.shape[1:], dtype=x.dtype)
        x = np.concatenate((x, padding))

    return x

In [6]:
def build_embedded_text(data_path, max_len='max',
                        val_split=0.1, random_state=42):
    """
    Returns training, validation, and test sets.

    The pickled DataFrame at `data_path` is read for the columns
    `doc_use`, `docid`, `embeddings` and `label`.  Rows whose `doc_use`
    is 'train' form the training set; all other rows form the test set.
    The `embeddings` of all rows sharing a `docid` are stacked into one
    matrix per document, and the first (non-null) `label` of each
    document is used as its target.

    Args:
        data_path -> path of the pickle file to load.
        max_len -> either str or int.  If str, must be 'max'.
               'max' indicates that the documents should be
               padded to the max document length in the training
               set.  If int, documents are padded or cut to that
               length.
        val_split -> either float or bool.  If bool, must be
                     False. If False, there is no true train/val
                     split.  Rather, the test set is returned as
                     the val set for consistency.
        random_state -> seed forwarded to train_test_split.

    Returns:
        x_train, y_train, x_val, y_val, x_test, y_test

    Raises:
        ValueError if max_len is 'max' and there are no training
        documents.
    """
    # Load the data.
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only point this at files from a trusted source.
    with open(data_path, 'rb') as fp:
        df = pickle.load(fp)

    # Break into train and test
    train_mask = df.doc_use == 'train'
    train = df[train_mask]
    test = df[~train_mask]

    # Group rows by document.  sort=False keeps documents in order of first
    # appearance, and the SAME grouping is used for embeddings and labels so
    # that x[i] and y[i] always refer to the same docid (a sorted groupby for
    # the labels would misalign them whenever docids are not already sorted).
    train_docs = train.groupby('docid', sort=False)
    test_docs = test.groupby('docid', sort=False)

    # Stack documents
    train_embeddings = [np.stack(rows.tolist()) for _, rows in train_docs['embeddings']]
    test_embeddings = [np.stack(rows.tolist()) for _, rows in test_docs['embeddings']]

    y_train = train_docs['label'].first().values
    y_test = test_docs['label'].first().values

    # Pad documents
    if max_len == 'max':
        if not train_embeddings:
            raise ValueError("Cannot infer max_len='max': no training documents found.")
        max_len = max(len(doc) for doc in train_embeddings)

    x_train = np.stack([pad_or_cut(doc, max_len) for doc in train_embeddings])
    x_test = np.stack([pad_or_cut(doc, max_len) for doc in test_embeddings])

    # Build a validation set from the training set
    if val_split is False:
        return x_train, y_train, x_test, y_test, x_test, y_test

    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=val_split,
                                                      random_state=random_state)

    return x_train, y_train, x_val, y_val, x_test, y_test

In [7]:
def evaluate_model(model, x_train, y_train,
                   x_val, y_val,
                   x_test, y_test,
                   optimizer, loss, metrics,
                   lr, eval_lr=False):
    """
    Compiles `model`, trains it through a ktrain learner, and prints a
    classification report for the test set.

    Args:
        model -> model to compile and train.
        x_train, y_train -> training inputs and targets.
        x_val, y_val -> validation inputs and targets handed to ktrain.
        x_test, y_test -> inputs and targets used for the final report.
        optimizer, loss, metrics -> forwarded to model.compile.
        lr -> learning rate passed to learner.autofit.
        eval_lr -> bool.  If True, run ktrain's learning-rate finder
                   and show its plot before training.

    Returns:
        None.  The classification report is printed.
    """
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    training_pair = (x_train, y_train)
    validation_pair = (x_val, y_val)
    learner = ktrain.get_learner(model,
                                 train_data=training_pair,
                                 val_data=validation_pair)
    # Start from fresh weights so repeated calls do not continue a previous fit.
    learner.reset_weights()

    # Optional learning-rate sweep, plotted before the real training run.
    if eval_lr:
        learner.lr_find(show_plot=True)
        plt.show()

    learner.autofit(lr, reduce_on_plateau=5, early_stopping=10)

    # Threshold the flattened model outputs at 0.5 to get boolean predictions.
    probabilities = learner.model.predict(x_test).flatten()
    predicted_labels = probabilities > 0.5

    print(classification_report(y_test, predicted_labels))

In [9]:
# val_split=False: no separate validation split is carved out of the training
# data; build_embedded_text returns the test set in the validation slots.
# NOTE(review): any early stopping driven by (x_val, y_val) is then monitoring
# the test data - confirm this is intended.
(x_train, y_train,
 x_val, y_val,
 x_test, y_test) = build_embedded_text(
    '../data/baseBert_embeddings_olap_200.pkl',
    val_split=False,
)

In [10]:
# Input: one padded embedding matrix per document, with the sequence length
# and embedding width taken from the training array.
embeddings = tf.keras.layers.Input(shape=(x_train.shape[1], x_train.shape[2]))

# Parallel convolution branches over the same input, one per kernel width
# (1 through 5), each reduced to its maximum activation over the sequence.
# GlobalMaxPool1D takes the max over the whole sequence axis and drops that
# axis, so every branch yields (batch, 32) without hand-computing a pool size
# per kernel width and without leaving a singleton time axis that would
# propagate to the model output.
pooled_branches = []
for kernel_size in range(1, 6):
    conv = tf.keras.layers.Conv1D(filters=32,
                                  kernel_size=kernel_size,
                                  activation='relu')(embeddings)
    pooled_branches.append(tf.keras.layers.GlobalMaxPool1D()(conv))

concat = tf.keras.layers.concatenate(pooled_branches)

# Classification head: dense layer, light dropout, single sigmoid unit.
dense1 = tf.keras.layers.Dense(64, activation='relu')(concat)
dropout = tf.keras.layers.Dropout(0.1)(dense1)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dropout)

In [3]:
# Tie the input tensor and the sigmoid output tensor together into a Model.
model = tf.keras.Model(
    inputs=embeddings,
    outputs=outputs,
)

NameError: name 'tf' is not defined

In [9]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 109, 768)]   0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 109, 32)      24608       input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 108, 32)      49184       input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 107, 32)      73760       input_2[0][0]                    
______________________________________________________________________________________________

In [10]:
# Train with Adam / binary cross-entropy at a max learning rate of 1e-5 and
# print the classification report for the test set; the LR finder is skipped.
evaluate_model(
    model,
    x_train, y_train,
    x_val, y_val,
    x_test, y_test,
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
    lr=1e-5,
    eval_lr=False,
)

Model weights have been reset.


begin training using triangular learning rate policy with max lr of 1e-05...
Train on 641 samples, validate on 162 samples
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 7/1024
Epoch 8/1024
Epoch 9/1024
Epoch 10/1024
Epoch 11/1024
Epoch 12/1024
Epoch 13/1024
Epoch 14/1024
Epoch 15/1024
Epoch 16/1024
Epoch 17/1024
Epoch 18/1024
Epoch 19/1024
Epoch 20/1024
Epoch 21/1024
Epoch 22/1024
Epoch 23/1024
Epoch 24/1024
Epoch 25/1024
Epoch 26/1024
Epoch 27/1024
Epoch 28/1024
Epoch 29/1024
Epoch 30/1024
Epoch 31/1024
Epoch 32/1024
Epoch 33/1024
Epoch 34/1024
Epoch 35/1024
Epoch 36/1024
Epoch 37/1024
Epoch 38/1024
Epoch 39/1024
Epoch 00039: Reducing Max LR on Plateau: new max lr will be 5e-06 (if not early_stopping).
Epoch 40/1024
Epoch 41/1024
Epoch 42/1024
Epoch 43/1024
Epoch 44/1024
Epoch 00044: Reducing Max LR on Plateau: new max lr will be 2.5e-06 (if not early_stopping).
Restoring model weights from the end of the best epoc

  'precision', 'predicted', average, warn_for)
