In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class Cnn1DLinear(nn.Module):
    """1D-CNN sequence classifier with an extra head for masked-token pre-training.

    Pipeline: token embedding -> parallel Conv1d filters with global max pooling
    -> fully connected core (``ffnn``) -> one of two heads:

    * ``forward``  : ``fcOutput`` producing ``numClasses`` logits per sample.
    * ``pretrain`` : ``preTrainLayers`` producing ``vocabSize`` logits per sample.

    Args:
        vocabSize: number of rows in the embedding table; required.
        embeddingDim: embedding width (also the Conv1d input channel count).
        paddingIdx: index passed to ``nn.Embedding`` as ``padding_idx``.
        filterSizes: kernel size of each parallel convolution.
        numFilters: output channels of each convolution; must have the same
            length as ``filterSizes``.
        batchNormConv: if True, each convolution is followed by ``BatchNorm1d``.
        convPadding: padding applied by every convolution.
        hiddenNeurons: widths of the core fully connected layers. May be empty,
            in which case the pooled convolution features feed the heads directly.
        batchNormFFNN: if True, add ``BatchNorm1d`` after every core layer except the last.
        dropout: dropout probability; a falsy value disables dropout.
        pretrainLayers: widths of the hidden layers of the pre-training head.
            May be empty, leaving a single linear projection to ``vocabSize``.
        numClasses: number of outputs of the classification head.

    Raises:
        ValueError: if ``vocabSize`` is missing or ``filterSizes`` and
            ``numFilters`` differ in length.
    """

    def __init__(self, 
                # embedding params
                vocabSize = None,
                embeddingDim = 64,
                paddingIdx = 0,
                # conv params
                filterSizes = [2, 3, 4, 5],
                numFilters = [128, 128, 128, 128],
                batchNormConv = False,
                convPadding = 0, # default
                # ffnn params
                hiddenNeurons = [512, 256],
                batchNormFFNN = False,
                dropout = 0.5,
                # pretrain layers
                pretrainLayers = [512, 1024],
                numClasses = 1): # binary classification
        super().__init__()
        if vocabSize is None:
            raise ValueError("vocabSize must be provided")
        if len(filterSizes) != len(numFilters):
            raise ValueError(
                "filterSizes and numFilters must have the same length, got "
                f"{len(filterSizes)} and {len(numFilters)}")

        self.__name__ = "Cnn1DLinear"
        # embedding
        self.embedding = nn.Embedding(vocabSize, 
                                  embeddingDim, 
                                  padding_idx=paddingIdx)

        # convolutions: one module per kernel size; convPadding applies in both
        # variants (it used to be ignored when batchNormConv was enabled)
        self.conv1dModule = nn.ModuleList()
        for kernelSize, outChannels in zip(filterSizes, numFilters):
            conv = nn.Conv1d(in_channels=embeddingDim,
                             out_channels=outChannels,
                             kernel_size=kernelSize,
                             padding=convPadding)
            if batchNormConv:
                module = nn.Sequential(conv, nn.BatchNorm1d(outChannels))
            else:
                module = conv
            self.conv1dModule.append(module)
        # plain Python int: the pooled outputs of all convolutions are concatenated
        convOut = int(sum(numFilters))

        # core ffnn
        ffnnBlocks = []
        inDim = convOut
        for i, h in enumerate(hiddenNeurons):
            block = [nn.Linear(inDim, h)]

            # add BatchNorm to every layer except last
            if batchNormFFNN and i < len(hiddenNeurons) - 1:
                block.append(nn.BatchNorm1d(h))

            block.append(nn.ReLU())

            if dropout:
                block.append(nn.Dropout(dropout))

            ffnnBlocks.append(nn.Sequential(*block))
            inDim = h
        self.ffnn = nn.Sequential(*ffnnBlocks)
        # width of the core output; equals convOut when hiddenNeurons is empty
        coreOut = inDim

        # classification output
        self.fcOutput = nn.Linear(coreOut, numClasses)

        # pretrain layers: hidden blocks followed by a projection onto the vocabulary
        preTrainBlocks = []
        inDim = coreOut
        for h in pretrainLayers:
            block = [nn.Linear(inDim, h), nn.ReLU()]
            if dropout:
                block.append(nn.Dropout(dropout))
            preTrainBlocks.append(nn.Sequential(*block))
            inDim = h
        preTrainBlocks.append(nn.Linear(inDim, vocabSize))
        self.preTrainLayers = nn.Sequential(*preTrainBlocks)

    @staticmethod
    def convAndMaxPool(x, conv):
        """Apply ``conv`` to ``x`` and take the global max over the sequence axis.

        ``x`` is (batch_size, embeddingDim, sequence_length) as produced in ``core``;
        ``conv(x)`` is (batch_size, num_filters, conv_length), so reducing over
        dim 2 yields (batch_size, num_filters). ReLU is applied after pooling.
        """
        return F.relu(conv(x).max(dim=2)[0])

    def core(self, inputs):
        """Shared trunk: embed ``inputs``, convolve, max-pool and run the core ffnn.

        ``inputs`` holds token indices of shape (batch_size, sequence_length).
        Returns the output of ``self.ffnn`` for the concatenated pooled features.
        """
        # Conv1d expects channels first: (batch_size, embeddingDim, sequence_length)
        embedded = self.embedding(inputs).permute(0, 2, 1)
        # x_conv: list of tensors of shape (batch_size, num_filters)
        x_conv = [self.convAndMaxPool(embedded, conv1d) for conv1d in self.conv1dModule]
        # torch.cat(x_conv, dim=1) is of shape (batch_size, sum(numFilters))
        x_fc = self.ffnn(torch.cat(x_conv, dim=1))
        return x_fc

    def pretrain(self, inputs):
        """Return vocabulary logits of shape (batch_size, vocabSize) from the pre-training head."""
        x_core = self.core(inputs)
        return self.preTrainLayers(x_core)

    def forward(self, inputs):
        """Return classification logits of shape (batch_size, numClasses)."""
        x_core = self.core(inputs)
        return self.fcOutput(x_core)


In [2]:
import logging
import os
import pickle
import sys
from collections import defaultdict
from pathlib import Path

import numpy as np
from pandas import DataFrame
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# make the local `nebula` package importable before importing from it
sys.path.append('../..')
from nebula.evaluation import get_tpr_at_fpr
from nebula.pretraining import maskSequenceArr

def pretrain(U_masked, U_target, model, batchSize, pretrinEpochs, device, verbosityBatches=50):
    """Pre-train ``model`` in place on masked sequences via ``model.pretrain``.

    Args:
        U_masked: numpy array of masked token-index sequences; cast to int64
            before being fed to the model.
        U_target: numpy array of targets; cast to float and passed to
            ``nn.CrossEntropyLoss`` alongside the ``model.pretrain`` output.
            Presumably a per-sample distribution over the vocabulary -- TODO
            confirm against ``maskSequenceArr``.
        model: module exposing ``pretrain(inputs)``; already on ``device``.
        batchSize: mini-batch size.
        pretrinEpochs: number of passes over the data.
        device: device the batches are moved to.
        verbosityBatches: print the loss every this many batches.

    Returns:
        None. The model parameters are updated in place.
    """
    # make a loader from U_masked and U_target
    preTrainLoader = torch.utils.data.DataLoader(
        # token indices are cast to long, consistent with evaluate()
        torch.utils.data.TensorDataset(
            torch.from_numpy(U_masked).long(),
            torch.from_numpy(U_target)),
        batch_size=batchSize,
        shuffle=True
    )

    lossFunction = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # make sure dropout / batch norm are in training mode even if the caller
    # previously switched the model to eval mode
    model.train()

    # pre-train model
    for epoch in range(1, pretrinEpochs+1):
        for batch_idx, (data, target) in enumerate(preTrainLoader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            # forward pass
            pred_masked_vocab = model.pretrain(data)
            loss = lossFunction(pred_masked_vocab, target.float())

            loss.backward()
            optimizer.step()

            if batch_idx % verbosityBatches == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(preTrainLoader.dataset),
                    100. * batch_idx / len(preTrainLoader), loss.item()))


def downstream(L_x, L_y, model, batchSize, downstreamEpochs, device, verbosityBatches=50):
    """Train ``model`` in place on the labeled binary classification task.

    Args:
        L_x: numpy array of token-index sequences; cast to int64 before use.
        L_y: numpy array of labels; reshaped to a column and cast to float
            for ``nn.BCEWithLogitsLoss``.
        model: module whose ``forward`` returns one logit per sample; already on ``device``.
        batchSize: mini-batch size.
        downstreamEpochs: number of passes over the data.
        device: device the batches are moved to.
        verbosityBatches: print loss and macro F1 of the current batch every
            this many batches.

    Returns:
        None. The model parameters are updated in place.
    """
    # make a loader from L_x and L_y
    downstreamLoader = torch.utils.data.DataLoader(
        # token indices are cast to long, consistent with evaluate()
        torch.utils.data.TensorDataset(
            torch.from_numpy(L_x).long(),
            torch.from_numpy(L_y)),
        batch_size=batchSize,
        shuffle=True
    )

    # train model
    logging.warning('Training model...')
    lossFunction = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # make sure dropout / batch norm are in training mode even if the caller
    # previously switched the model to eval mode
    model.train()

    for epoch in range(1, downstreamEpochs+1):
        for batch_idx, (data, target) in enumerate(downstreamLoader):
            data, target = data.to(device), target.to(device).reshape(-1, 1)
            optimizer.zero_grad()

            # forward pass
            logits = model(data)
            loss = lossFunction(logits, target.float())

            loss.backward()
            optimizer.step()

            # F1 is only reported, so compute it (and copy to CPU) only when printing
            if batch_idx % verbosityBatches == 0:
                y_pred_probs = torch.sigmoid(logits).clone().detach().cpu().numpy()
                f1 = f1_score(target.cpu().detach().numpy(), y_pred_probs > 0.5, average='macro')
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tF1: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(downstreamLoader.dataset),
                    100. * batch_idx / len(downstreamLoader), loss.item(), f1))

def evaluate(model, x_test, y_test, fprs, device, batchSize):
    """Score ``model`` on the test set, batch by batch, at each requested FPR.

    For every batch and every value in ``fprs`` the TPR and threshold are
    obtained from ``get_tpr_at_fpr``, F1 is computed at that threshold, and the
    batch BCE-with-logits loss is recorded. The per-batch values are then
    reduced to their mean and standard deviation.

    NOTE(review): the loader shuffles, so the per-batch statistics depend on the
    global torch RNG state.

    Returns:
        Mapping ``fpr -> {"tpr", "f1", "loss", "tpr_std", "f1_std", "loss_std"}``.
        The model is left in eval mode.
    """
    features = torch.from_numpy(x_test).long()
    labels = torch.from_numpy(y_test).float()
    testLoader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels),
        batch_size=batchSize,
        shuffle=True,
    )
    model.eval()

    criterion = nn.BCEWithLogitsLoss()
    metrics = defaultdict(lambda: defaultdict(list))
    for batch_x, batch_y in tqdm(testLoader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        with torch.no_grad():
            logits = model(batch_x)

        batch_loss = criterion(logits, batch_y.float().reshape(-1, 1))
        probs = torch.sigmoid(logits).clone().detach().cpu().numpy()
        y_true = batch_y.clone().detach().cpu().numpy().reshape(-1, 1)

        for fpr in fprs:
            tpr_at_fpr, threshold_at_fpr = get_tpr_at_fpr(y_true, probs, fpr)
            f1_at_fpr = f1_score(y_true, probs >= threshold_at_fpr)
            record = metrics[fpr]
            record["tpr"].append(tpr_at_fpr)
            record["f1"].append(f1_at_fpr)
            record["loss"].append(batch_loss.item())

    # collapse the per-batch lists into mean and std for every FPR
    for fpr in metrics:
        frame = DataFrame(metrics[fpr])
        for column in ("tpr", "f1", "loss"):
            # the std is broadcast to a constant column, so the mean below keeps it
            frame[column + "_std"] = frame[column].std()
        metrics[fpr] = frame.mean(axis=0).to_dict()
    return metrics


def cef_ssl(x, y, x_test, y_test, vocab, modelClass, modelConfig, fprs=[0.0001, 0.001, 0.01, 0.1], test_size=0.2, random_state=42, batchSize=32, pretrinEpochs=3, downstreamEpochs=3, device='cpu', verbosityBatches=50, mask_probability=0.15):
    """Compare a masked-pre-trained model against two baselines on the same test set.

    Three models are built from ``modelClass(**modelConfig)``:

    1. pre-trained on the masked unlabeled share, then trained on the labeled share;
    2. trained on the labeled share only;
    3. trained on the full ``x``/``y``.

    Args:
        x, y: training sequences and labels (numpy arrays).
        x_test, y_test: held-out test sequences and labels.
        vocab: vocabulary passed to ``maskSequenceArr``.
        modelClass, modelConfig: model constructor and its keyword arguments.
        fprs: false positive rates at which ``evaluate`` reports metrics.
        test_size: fraction of ``x`` kept as the labeled downstream set.
        random_state: seed for the train/labeled split.
        batchSize, pretrinEpochs, downstreamEpochs, device, verbosityBatches:
            forwarded to ``pretrain`` / ``downstream`` / ``evaluate``.
        mask_probability: masking probability forwarded to ``maskSequenceArr``.

    Returns:
        Tuple ``(metrics_Pretrained, metrics_nonPretrained, metrics_full)`` of
        ``evaluate`` results.
    """
    # U gets the (1 - test_size) share and is used without labels for pre-training;
    # L_x / L_y get the test_size share and form the labeled downstream set
    U, L_x, _, L_y = train_test_split(x, y, test_size=test_size, random_state=random_state)

    # for each sequence in U, mask it
    logging.warning('Masking sequences...')
    # honour the caller's mask_probability (previously hard-coded to 0.15)
    U_masked, U_target = maskSequenceArr(U, vocab, mask_probability=mask_probability, random_state=None)

    # pre-train model
    logging.warning('Pre-training model...')
    model_Pretrained = modelClass(**modelConfig)
    model_Pretrained.to(device)
    pretrain(U_masked, U_target, model_Pretrained, batchSize, pretrinEpochs, device, verbosityBatches)

    # downstream task for pretrained model
    logging.warning('Training pre-trained model on downstream task...')
    downstream(L_x, L_y, model_Pretrained, batchSize, downstreamEpochs, device, verbosityBatches)

    # downstream task for new model
    logging.warning('Training new model on downstream task...')
    model_NonPretrained = modelClass(**modelConfig)
    model_NonPretrained.to(device)
    downstream(L_x, L_y, model_NonPretrained, batchSize, downstreamEpochs, device, verbosityBatches)

    # downstream task for new model on full dataset suitable for benchmarking
    logging.warning('Training new model on downstream task on full dataset...')
    model_Full = modelClass(**modelConfig)
    model_Full.to(device)
    downstream(x, y, model_Full, batchSize, downstreamEpochs, device, verbosityBatches)

    logging.warning('Evaluating all models on test set...')
    # get fpr and f1 on test set on pretrained model
    metrics_Pretrained = evaluate(model_Pretrained, x_test, y_test, fprs, device, batchSize)
    # get performance metrics on fresh model
    metrics_nonPretrained = evaluate(model_NonPretrained, x_test, y_test, fprs, device, batchSize)
    # get performance metrics on full model
    metrics_full = evaluate(model_Full, x_test, y_test, fprs, device, batchSize)

    return metrics_Pretrained, metrics_nonPretrained, metrics_full

# --- configuration -----------------------------------------------------------
train_limit = 5000  # number of training samples kept for this experiment
SEED = 42           # seeds the global RNGs so reruns are comparable

# Root of the preprocessed speakeasy data. Defaults to the original location;
# set the NEBULA_DATA_DIR environment variable to run on another machine.
DATA_DIR = Path(os.environ.get(
    "NEBULA_DATA_DIR",
    r"C:\Users\dtrizna\Code\nebula\data\data_filtered"))
trainDir = DATA_DIR / "speakeasy_trainset_WithAPIargs"
testDir = DATA_DIR / "speakeasy_testset_WithAPIargs"

torch.manual_seed(SEED)
np.random.seed(SEED)

# --- data --------------------------------------------------------------------
xTrainFile = trainDir / "speakeasy_VocabSize_10000_maxLen_2048_x.npy"
xTrain = np.load(xTrainFile)[:train_limit]
yTrainFile = trainDir / "speakeasy_y.npy"
yTrain = np.load(yTrainFile)[:train_limit]
xTestFile = testDir / "speakeasy_VocabSize_10000_maxLen_2048_x.npy"
xTest = np.load(xTestFile)
yTestFile = testDir / "speakeasy_y.npy"
yTest = np.load(yTestFile)

# fail early if features and labels do not line up
assert len(xTrain) == len(yTrain), "train x/y length mismatch"
assert len(xTest) == len(yTest), "test x/y length mismatch"

vocabFile = trainDir / "speakeasy_VocabSize_10000.pkl"
# NOTE: pickle.load can execute arbitrary code -- only load vocab files you trust.
with open(vocabFile, 'rb') as f:
    vocab = pickle.load(f)

# --- experiment --------------------------------------------------------------
modelConfig = {
    "vocabSize": len(vocab)
}
modelClass = Cnn1DLinear
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

metrics_Pretrained, metrics_nonPretrained, metrics_full = cef_ssl(
    xTrain, yTrain, xTest, yTest, vocab, modelClass, modelConfig,
    device=device,
    pretrinEpochs=1,
    downstreamEpochs=1,
    verbosityBatches=100,
    batchSize=256,
)

100%|██████████| 4000/4000 [00:01<00:00, 2835.67it/s]
















100%|██████████| 68/68 [00:06<00:00, 11.30it/s]
100%|██████████| 68/68 [00:06<00:00, 11.33it/s]
100%|██████████| 68/68 [00:06<00:00, 11.30it/s]


In [3]:
# Label each evaluated model, then print its TPR (mean +- std over test batches) per FPR.
metrics = {
    'pretrined with MLM': metrics_Pretrained,
    'non pretrained, same data': metrics_nonPretrained,
    'full dataset': metrics_full,
}
for label, per_fpr in metrics.items():
    for fpr, stats in per_fpr.items():
        print("FPR: ", fpr)
        print(f"{stats['tpr']:.2f} +- {stats['tpr_std']:.2f} -- {label}")


0.29 +- 0.06 -- pretrined with MLM
0.14 +- 0.05 -- non pretrained, same data
0.00 +- 0.00 -- full dataset
