In [None]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
import ktrain

In [None]:
def pad_or_cut(x, max_len):
    """Force a 2-D array to exactly ``max_len`` rows.

    Rows past ``max_len`` are dropped. If fewer rows are present, all-zero
    rows are appended at the end. The number of columns is left untouched.

    Args:
        x: 2-D NumPy array (the slice below indexes two axes).
        max_len: Target number of rows; expected to be a non-negative integer.

    Returns:
        Array of shape ``(max_len, x.shape[1])`` with the same dtype as ``x``.
    """
    # Slice unconditionally: it is a no-op for short inputs, so no length
    # test is needed before cutting.
    x = x[0:max_len, :]

    len_diff = max_len - len(x)
    if len_diff > 0:
        # Build the padding in x's own dtype. np.zeros defaults to float64,
        # which would make np.concatenate upcast e.g. float32 embeddings and
        # give padded documents a different dtype from unpadded ones.
        padding = np.zeros((len_diff, x.shape[1]), dtype=x.dtype)
        x = np.concatenate((x, padding))

    return x

In [None]:
def build_embedding_list(data_path, max_len='max'):
    """Load pickled embeddings and return padded train/test arrays.

    The pickle at ``data_path`` is expected to hold a DataFrame with the
    columns ``doc_use``, ``docid`` and ``embeddings``. Rows whose ``doc_use``
    equals ``'train'`` form the training split; every other row goes to the
    test split. Rows sharing a ``docid`` are stacked into one array per
    document, and each document is then padded or cut to ``max_len`` rows
    with ``pad_or_cut``.

    Args:
        data_path: Path of the pickle file to read.
        max_len: Number of rows per document. The default ``'max'`` uses the
            length of the longest training document.

    Returns:
        Tuple ``(x_train, x_test)``, each the ``np.stack`` of its split's
        padded documents.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at files from a trusted source.
    with open(data_path, 'rb') as handle:
        frame = pickle.load(handle)

    is_train = frame.doc_use == 'train'

    def stack_per_document(split):
        # One stacked array per document, in order of first docid appearance.
        return [
            np.stack(split.embeddings[split.docid == doc_id])
            for doc_id in split.docid.unique()
        ]

    train_docs = stack_per_document(frame[is_train])
    test_docs = stack_per_document(frame[~is_train])

    # The default length is taken from the training split only.
    if max_len == 'max':
        max_len = max(len(doc) for doc in train_docs)

    def to_batch(docs):
        # Equalise document lengths, then stack along a new leading axis.
        return np.stack([pad_or_cut(doc, max_len) for doc in docs])

    x_train = to_batch(train_docs)
    x_test = to_batch(test_docs)
    return x_train, x_test

In [None]:
# Build the padded train/test embedding arrays from the pickled DataFrame;
# max_len is left at its default ('max').
x_train, x_test = build_embedding_list(
    './w266_project/data/distilBert_embeddings_stack512.pkl'
)

In [None]:
# Display the dimensions of the padded training array.
x_train.shape

In [None]:
# Display the dimensions of the padded test array.
x_test.shape

In [None]:
# Small 1-D CNN with a single sigmoid output unit.
# The per-document input shape is read from the data instead of being
# hard-coded as (32, 768): build_embedding_list pads to the longest training
# document by default, so the shape depends on the contents of the pickle.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=x_train.shape[1:]),
    tf.keras.layers.Conv1D(filters=5, kernel_size=3, padding='same'),
    tf.keras.layers.MaxPool1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Show the model's layer-by-layer summary.
model.summary()

In [None]:
# Placeholder labels: every document is labelled 1.
# NOTE(review): presumably a smoke test of the pipeline; swap in real labels.
# Sizes come from the arrays themselves rather than hard-coded counts, so
# they stay in sync with whatever build_embedding_list returned.
y_train = np.ones(len(x_train))
y_test = np.ones(len(x_test))

In [None]:
# Configure training with the Adam optimizer and binary cross-entropy loss.
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
# Fit on the training split. No epochs, batch_size or validation_data are
# passed, so Keras' defaults for fit() apply.
model.fit(x=x_train, y=y_train)