In [32]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
#import ktrain

In [33]:
def pad_or_cut(x, max_len):
    # Cut to the maximum length; computationally cheaper than testing
    x = x[0:max_len,:]
    
    # Pad with zeros
    len_diff = max_len - len(x)
    if len_diff > 0:
        x = np.concatenate((x, np.zeros((len_diff, x.shape[1]))))
        
    return x

In [38]:
def build_embedding_list(data_path, max_len='max'):
    # Load the data
    fname = data_path
    with open(fname, 'rb') as fp:
        df = pickle.load(fp)
    
    # Break into train and test
    train_mask = df.doc_use == 'train'
    train = df[train_mask]
    test = df[~train_mask]
    
    # Stack documents
    train_embeddings = [np.stack(train.embeddings[train.docid == ID]) for ID in train.docid.unique()]
    test_embeddings = [np.stack(test.embeddings[test.docid == ID]) for ID in test.docid.unique()]
    
    # Pad documents
    if max_len == 'max':
        max_len = max([len(doc) for doc in train_embeddings])
    
    x_train = [pad_or_cut(doc, max_len) for doc in train_embeddings]
    x_train = np.stack(x_train)
    
    x_test = [pad_or_cut(doc, max_len) for doc in test_embeddings]
    x_test = np.stack(x_test)
    
    return df, x_train, x_test

In [89]:
df, x_train, x_test = build_embedding_list('data/distilBert_embeddings_stack_512.pkl')

In [90]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(x_train.shape[1], 768)),
    tf.keras.layers.Conv1D(filters=5, kernel_size=3, padding='same'),
    tf.keras.layers.MaxPool1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [91]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 32, 5)             11525     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 16, 5)             0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 80)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 81        
Total params: 11,606
Trainable params: 11,606
Non-trainable params: 0
_________________________________________________________________


In [92]:
y_train = np.ones(641)
y_test = np.ones(162)

In [93]:
model.compile(optimizer='adam', loss='binary_crossentropy')

In [94]:
model.fit(x=x_train, y=y_train)

Train on 641 samples


<tensorflow.python.keras.callbacks.History at 0x7fa1c565c450>

In [97]:
test_preds = model.predict_classes(x_test)
test_y = df[df['doc_use'] == 'test'].groupby('docid').first()['label'].to_list()
len(test_preds)
print('accuracy on test set for 200 stacked = ', 
      sum(1 for x,y in zip(test_preds,test_y) if x == y) / len(test_preds))

162

In [96]:
print('accuracy on test set for 200 stacked = ', 
      sum(1 for x,y in zip(test_preds,test_y) if x == y) / len(test_preds))

accuracy on test set for 200 stacked =  0.5987654320987654
