In [1]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
#import ktrain

In [2]:
def pad_or_cut(x, max_len):
    """Force a 2-D array to have exactly ``max_len`` rows.

    Rows beyond ``max_len`` are dropped. If the array has fewer than
    ``max_len`` rows, all-zero rows are appended at the end. The number of
    columns is never changed.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array indexed as ``x[row, column]``.
    max_len : int
        Number of rows the result must have. Must be non-negative.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, x.shape[1])`` with the same dtype as ``x``.

    Raises
    ------
    ValueError
        If ``max_len`` is negative (a negative bound would silently drop
        trailing rows instead of producing the requested length).
    """
    if max_len < 0:
        raise ValueError('max_len must be non-negative, got {}'.format(max_len))

    # Slice unconditionally: it is a no-op for short inputs and cheaper than
    # testing the length first.
    x = x[:max_len, :]

    len_diff = max_len - len(x)
    if len_diff > 0:
        # Build the padding with the input's dtype so that e.g. float32
        # embeddings are not silently upcast to float64 by the concatenation.
        padding = np.zeros((len_diff, x.shape[1]), dtype=x.dtype)
        x = np.concatenate((x, padding))

    return x

In [10]:
def _stack_documents(frame):
    """Return per-document stacked embeddings and labels in matching order."""
    # sort=False keeps documents in order of first appearance and, crucially,
    # guarantees the embeddings and the labels are produced in the SAME order.
    by_doc = frame.groupby('docid', sort=False)
    embeddings = [np.stack(list(rows)) for _, rows in by_doc['embeddings']]
    labels = by_doc['label'].first().values
    return embeddings, labels


def build_embedding_list(data_path, max_len='max'):
    """Load pickled embeddings and build padded train/test arrays with labels.

    The pickle at ``data_path`` must contain a DataFrame with the columns
    ``doc_use``, ``docid``, ``embeddings`` and ``label``. Rows whose
    ``doc_use`` equals ``'train'`` form the training set; every other row goes
    to the test set. Rows sharing a ``docid`` are stacked into one array per
    document, and each document is cut or zero-padded to ``max_len`` rows.

    Parameters
    ----------
    data_path : str
        Path of the pickle file to load.
    max_len : int or 'max', optional
        Number of rows per document after padding/cutting. The default
        ``'max'`` uses the row count of the longest training document.

    Returns
    -------
    x_train, y_train, x_test, y_test
        ``x_*`` are the stacked, padded documents; ``y_*`` hold the first
        ``label`` value of each document, in the same document order as the
        corresponding ``x_*`` array (order of first appearance of ``docid``).

    Notes
    -----
    Assumes every ``embeddings`` cell is a 1-D vector of equal length, so that
    each stacked document is 2-D as ``pad_or_cut`` requires -- TODO confirm
    against the code that writes the pickle.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only point this at files from a trusted source.
    with open(data_path, 'rb') as fp:
        df = pickle.load(fp)

    # Break into train and test
    train_mask = df.doc_use == 'train'
    train_embeddings, y_train = _stack_documents(df[train_mask])
    test_embeddings, y_test = _stack_documents(df[~train_mask])

    # The padding length is derived from the training documents only.
    if max_len == 'max':
        max_len = max(len(doc) for doc in train_embeddings)

    x_train = np.stack([pad_or_cut(doc, max_len) for doc in train_embeddings])
    x_test = np.stack([pad_or_cut(doc, max_len) for doc in test_embeddings])

    return x_train, y_train, x_test, y_test

In [13]:
# Load the pickled embeddings and split them into padded train/test arrays
# plus their labels (max_len is left at its default).
(x_train, y_train, x_test, y_test) = build_embedding_list(
    'data/distilBert_embeddings_stack_200.pkl'
)

In [14]:
# Small 1-D convolutional classifier over the padded documents, ending in a
# single sigmoid unit.
model = tf.keras.Sequential([
    # Take (rows per document, embedding width) straight from the data rather
    # than hard-coding the width, so the model follows whatever was loaded.
    tf.keras.layers.Input(shape=x_train.shape[1:]),
    tf.keras.layers.Conv1D(filters=5, kernel_size=3, padding='same'),
    # NOTE(review): pool_size=1 leaves the sequence length unchanged, so this
    # layer currently does no down-sampling -- confirm that is intended.
    tf.keras.layers.MaxPool1D(pool_size=1),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [15]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 82, 5)             11525     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 82, 5)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 410)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 411       
Total params: 11,936
Trainable params: 11,936
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Configure training: binary cross-entropy loss minimised with Adam.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
)

In [17]:
# Fit the model on the training arrays. No epochs, batch size or validation
# data are passed, so the library defaults are used.
# NOTE(review): presumably the default is a single epoch -- confirm that this
# is intended before reading much into the test accuracy below.
model.fit(x=x_train, y=y_train)

Train on 641 samples


<tensorflow.python.keras.callbacks.History at 0x7fd317443110>

In [18]:
# `Sequential.predict_classes` is deprecated and absent from newer TensorFlow
# releases; thresholding the sigmoid output at 0.5 produces the same labels
# and works on both old and new versions.
preds = (model.predict(x_test) > 0.5).astype('int32')

# preds has one column per sample; flatten it so it lines up with y_test.
accuracy = float(np.mean(preds.ravel() == np.asarray(y_test).ravel()))
print('accuracy on test set for 200 stacked = ', accuracy)

accuracy on test set for 200 stacked =  0.4444444444444444
