In [1]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
import ktrain
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
def pad_or_cut(x, max_len):
    """
    Either pads or cuts a document's embedding matrix to `max_len` rows.

    Documents with more than `max_len` rows are truncated; documents with
    fewer rows are extended at the end with all-zero rows.

    Args:
        x -> 2-D numpy array; axis 0 is the axis that is cut or padded.
        max_len -> int. Number of rows in the returned matrix.

    Returns:
        Array of shape (max_len, x.shape[1]) with the same dtype as `x`.
    """
    # Cut to the maximum length; slicing is a no-op for short documents,
    # so this is cheaper than testing the length first.
    x = x[:max_len, :]

    # Pad with zeros of the input's own dtype. np.zeros defaults to
    # float64, which would silently promote e.g. float32 embeddings.
    len_diff = max_len - len(x)
    if len_diff > 0:
        padding = np.zeros((len_diff, x.shape[1]), dtype=x.dtype)
        x = np.concatenate((x, padding))

    return x

In [76]:
def _stack_documents(frame):
    """
    Collect one embedding matrix and one label per document in `frame`.

    Both outputs are derived from the same groupby, so position i of the
    returned list and position i of the returned label array always refer
    to the same docid.
    """
    # sort=False keeps the documents in order of first appearance.
    by_doc = frame.groupby('docid', sort=False)
    embeddings = [np.stack(list(rows)) for _, rows in by_doc['embeddings']]
    labels = by_doc['label'].first().values
    return embeddings, labels


def build_embedded_text(data_path, max_len='max',
                        val_split=0.1, random_state=42):
    """
    Returns training, validation, and test sets.

    Loads a pickled DataFrame, groups its rows into documents by `docid`,
    stacks each document's embeddings into one matrix, and pads or cuts
    every document to a common length.

    Args:
        data_path -> path to a pickled DataFrame with the columns
                     'doc_use', 'docid', 'embeddings', and 'label'.
                     Rows whose doc_use is 'train' form the training
                     set; every other row goes to the test set.
        max_len -> either str or int.  If str, must be 'max'.
                   'max' indicates that the documents should be
                   padded to the max document length in the training
                   set.
        val_split -> either float or bool.  If bool, must be
                     False. If False, there is no true train/val
                     split.  Rather, the test set is returned as
                     the val set for consistency.
        random_state -> seed passed to train_test_split.

    Returns:
        x_train, y_train, x_val, y_val, x_test, y_test.  Each x array
        has shape (n_documents, max_len, embedding_width) and each y
        array holds one label per document, in the same document order
        as the matching x array.
    """
    # Load the data.
    # NOTE: pickle.load can execute arbitrary code; only point this at
    # files from a trusted source.
    with open(data_path, 'rb') as fp:
        df = pickle.load(fp)

    # Break into train and test, then stack documents.  Embeddings and
    # labels come from one groupby per split so they cannot drift out of
    # order (docid.unique() is appearance-ordered while a default
    # groupby is key-sorted, which previously could misalign x and y).
    train_mask = df.doc_use == 'train'
    train_embeddings, y_train = _stack_documents(df[train_mask])
    test_embeddings, y_test = _stack_documents(df[~train_mask])

    # Pad documents
    if max_len == 'max':
        max_len = max(len(doc) for doc in train_embeddings)

    x_train = np.stack([pad_or_cut(doc, max_len) for doc in train_embeddings])
    x_test = np.stack([pad_or_cut(doc, max_len) for doc in test_embeddings])

    # Build a validation set from the training set
    if val_split is False:
        return x_train, y_train, x_test, y_test, x_test, y_test

    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                      test_size=val_split,
                                                      random_state=random_state)

    return x_train, y_train, x_val, y_val, x_test, y_test

In [77]:
def evaluate_model(model, x_train, y_train,
                   x_val, y_val,
                   x_test, y_test,
                   optimizer, loss, metrics,
                   lr, eval_lr=False):
    """
    Compile, train, and score a model, printing a classification report.

    The model is compiled with the supplied settings, wrapped in a ktrain
    learner together with the train/val data, has its weights reset, and
    is then fit with `learner.autofit`.  Predictions on the test inputs
    are thresholded at 0.5 and compared against `y_test`.

    Args:
        model -> the model to compile and train.
        x_train, y_train -> training data handed to the learner.
        x_val, y_val -> validation data handed to the learner.
        x_test, y_test -> data used for the final classification report.
        optimizer, loss, metrics -> forwarded to model.compile.
        lr -> learning rate forwarded to learner.autofit.
        eval_lr -> if truthy, run the learner's lr_find and show its
                   plot before training.

    Returns:
        None.  The classification report is printed.
    """
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    training_pair = (x_train, y_train)
    validation_pair = (x_val, y_val)
    learner = ktrain.get_learner(model,
                                 train_data=training_pair,
                                 val_data=validation_pair)
    # Start every evaluation from freshly reset weights.
    learner.reset_weights()

    # Optional learning-rate range plot before the real training run.
    if eval_lr:
        learner.lr_find(show_plot=True)
        plt.show()

    learner.autofit(lr, reduce_on_plateau=5, early_stopping=10)

    # Flatten the model output to one score per sample, then threshold.
    scores = learner.model.predict(x_test).flatten()
    predicted_positive = scores > 0.5

    print(classification_report(y_test, predicted_positive))

In [78]:
# val_split=False: no validation split is carved out of the training data;
# build_embedded_text returns the test set in the validation slots too.
# NOTE(review): evaluate_model hands the validation data to the ktrain
# learner, so early stopping here is presumably driven by the test set --
# confirm this is intended before trusting the reported test metrics.
(x_train, y_train,
 x_val, y_val,
 x_test, y_test) = build_embedded_text(
    'data/baseBert_embeddings_olap_200.pkl',
    val_split=False,
)

In [79]:
def conv_pool_branch(inputs, seq_len, kernel_size, filters=32):
    """
    Build one convolution branch: a Conv1D over `inputs` followed by a
    max-pool that spans the whole convolved sequence.

    Args:
        inputs -> tensor to convolve over.
        seq_len -> int. Number of steps along axis 1 of `inputs`.
        kernel_size -> int. Width of the convolution window.
        filters -> int. Number of convolution filters.

    Returns:
        (conv, pooled) -> the Conv1D output and the pooled output.
    """
    conv = tf.keras.layers.Conv1D(filters=filters,
                                  kernel_size=kernel_size,
                                  activation='relu')(inputs)
    # The convolution output is kernel_size - 1 steps shorter than its
    # input, so pooling over seq_len - kernel_size + 1 steps covers the
    # entire convolved sequence in a single window.
    pooled = tf.keras.layers.MaxPool1D(pool_size=seq_len - kernel_size + 1)(conv)
    return conv, pooled


# Input: one (sequence length, embedding width) matrix per document.
seq_len = x_train.shape[1]
embeddings = tf.keras.layers.Input(shape=(seq_len, x_train.shape[2]))

# Parallel branches that differ only in kernel size (2 through 5).
conv2, maxpool2 = conv_pool_branch(embeddings, seq_len, kernel_size=2)
conv3, maxpool3 = conv_pool_branch(embeddings, seq_len, kernel_size=3)
conv4, maxpool4 = conv_pool_branch(embeddings, seq_len, kernel_size=4)
conv5, maxpool5 = conv_pool_branch(embeddings, seq_len, kernel_size=5)

concat = tf.keras.layers.concatenate([maxpool2, maxpool3, maxpool4, maxpool5])

# Dense head ending in a single sigmoid unit.
dense1 = tf.keras.layers.Dense(64, activation='relu')(concat)
dropout = tf.keras.layers.Dropout(0.1)(dense1)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dropout)

In [80]:
# Wrap the functional graph (from the `embeddings` input to the sigmoid
# `outputs`) in a Keras Model.
model = tf.keras.Model(inputs=embeddings, outputs=outputs)

In [81]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, 109, 768)]   0                                            
__________________________________________________________________________________________________
conv1d_44 (Conv1D)              (None, 108, 32)      49184       input_12[0][0]                   
__________________________________________________________________________________________________
conv1d_45 (Conv1D)              (None, 107, 32)      73760       input_12[0][0]                   
__________________________________________________________________________________________________
conv1d_46 (Conv1D)              (None, 106, 32)      98336       input_12[0][0]                   
___________________________________________________________________________________________

In [82]:
# Train the CNN and print its classification report on the test split.
evaluate_model(
    model,
    x_train, y_train,
    x_val, y_val,
    x_test, y_test,
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
    lr=1e-5,  # max learning rate of ktrain's triangular policy
    eval_lr=False,
)

Model weights have been reset.


begin training using triangular learning rate policy with max lr of 1e-05...
Train on 641 samples, validate on 162 samples
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 00006: Reducing Max LR on Plateau: new max lr will be 5e-06 (if not early_stopping).
Epoch 7/1024
Epoch 8/1024
Epoch 9/1024
Epoch 10/1024
Epoch 11/1024
Epoch 00011: Reducing Max LR on Plateau: new max lr will be 2.5e-06 (if not early_stopping).
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
Weights from best epoch have been loaded into model.
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        65
           1       0.60      1.00      0.75        97

    accuracy                           0.60       162
   macro avg       0.30      0.50      0.37       162
weighted avg       0.36      0.60      0.45       162



  'precision', 'predicted', average, warn_for)
