<font size="6">Imports</font>

In [34]:
import numpy as np
import pandas as pd
import torch
import math
import torch.nn as nn
import sklearn
import transformers
import warnings
from transformers import AutoTokenizer
from transformers import AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

<font size="6">Parameters</font>

In [5]:
# --- Text-representation model -------------------------------------------
model_name = 'distilbert-base-uncased'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- CVAE architecture ----------------------------------------------------
input_dim = 768                          # width of the vectors the CVAE reconstructs
hidden_dim = int(math.sqrt(input_dim))   # integer part of sqrt(input_dim)
latent_dim = 5

# --- CVAE optimisation ----------------------------------------------------
batch_size = 40
cvae_epochs = 200
lr = 5e-3

In [8]:
# Load the pretrained tokenizer and model identified by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Move the model to the selected device. As the last expression of the cell,
# this call's return value (the module itself) is what the notebook displays.
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

<font size="6">Stage-1: Text Representation</font>

In [22]:
def get_dataset(dataset, tokenizer, model):
    """Embed every sentence of a dataset CSV and return embeddings with labels.

    Reads ``/kaggle/input/data-sbrp/{dataset}.csv``, takes the text from the
    ``sentences`` column and the targets from the ``Security`` column, and
    encodes the sentences one at a time. For each sentence, the first element
    of the model output is averaged over dimension 1.

    Args:
        dataset: CSV file name without the ``.csv`` extension.
        tokenizer: Callable applied to each sentence; must return a mapping
            of tensors (it is called with ``return_tensors='pt'``).
        model: Module called as ``model(**encoded)`` under ``torch.no_grad()``.

    Returns:
        ``(embeddings, labels)`` on the global ``device``. ``embeddings`` is
        the stack of per-sentence vectors with dimension 1 squeezed out;
        ``labels`` is an int64 tensor built from the ``Security`` column.
    """
    frame = pd.read_csv(f'/kaggle/input/data-sbrp/{dataset}.csv')
    sentences = frame["sentences"]
    targets = frame["Security"]

    pooled_vectors = []
    for sentence in sentences:
        encoded = tokenizer(sentence, return_tensors='pt', add_special_tokens=True,
                            truncation=True, padding=True)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Inference only: no autograd graph is recorded for the model call.
        with torch.no_grad():
            model_output = model(**encoded)

        # Collapse dimension 1 of the first model output by averaging.
        pooled_vectors.append(torch.mean(model_output[0], dim=1))

    embeddings_tensor = torch.stack(pooled_vectors).squeeze(1).to(device)
    targets_tensor = torch.tensor(targets.values, dtype=torch.int64, device=device)
    return embeddings_tensor, targets_tensor

<font size="6">Stage-2: CVAE Training</font>

In [10]:
class Encoder(nn.Module):
  """Conditional encoder: maps (features, label) to a latent mean and log-variance.

  The label is one-hot encoded and concatenated onto the features before the
  first linear layer, so ``layer_dims[0]`` must already equal
  ``feature_width + num_labels``.

  Args:
    layer_dims: Widths of the fully connected stack; consecutive pairs become
      ``Linear`` layers, each followed by ``ReLU``.
    latent_dim: Width of the ``mean`` and ``log_var`` output heads.
    num_labels: Number of classes used for the one-hot encoding. Defaults to 2,
      the value that used to be hard-coded in ``forward``.
  """
  def __init__(self, layer_dims, latent_dim, num_labels=2):
    super(Encoder, self).__init__()
    self.num_labels = num_labels

    self.encoder = nn.Sequential()
    for i in range(len(layer_dims)-1):
      self.encoder.add_module(name = f'linear_{i}', module = nn.Linear(layer_dims[i], layer_dims[i+1]))
      self.encoder.add_module(name = f'relu_{i}', module = nn.ReLU())

    # Two parallel heads on top of the last hidden layer.
    self.mean = nn.Linear(layer_dims[-1], latent_dim)
    self.log_var = nn.Linear(layer_dims[-1], latent_dim)

  def forward(self, x, labels):
    """Return ``(mean, log_var)`` for features ``x`` conditioned on integer ``labels``."""
    # one_hot yields an integer tensor; cast it to the feature dtype explicitly
    # instead of relying on implicit type promotion inside torch.cat.
    one_hot = nn.functional.one_hot(labels, num_classes=self.num_labels).to(dtype=x.dtype)
    x = torch.cat((x, one_hot), dim=1)

    x = self.encoder(x)
    mean = self.mean(x)
    log_var = self.log_var(x)

    return mean, log_var

In [11]:
class Decoder(nn.Module):
  """Conditional decoder: maps (latent vector, label) to a reconstruction.

  The label is one-hot encoded and concatenated onto the latent vector before
  the first linear layer, so ``input_dim`` must already equal
  ``latent_width + num_labels``.

  Args:
    layer_dims: Layer widths after the input; needs at least two entries. The
      last entry is the output width.
    input_dim: Width of the concatenated (latent + one-hot label) input.
    num_labels: Number of classes used for the one-hot encoding. Defaults to 2,
      the value that used to be hard-coded in ``forward``.

  NOTE(review): the final ``Sigmoid`` bounds every output to (0, 1), so targets
  outside that range cannot be reconstructed exactly. Confirm the training
  targets are scaled accordingly.
  """
  def __init__(self, layer_dims, input_dim, num_labels=2):
    super(Decoder, self).__init__()
    self.num_labels = num_labels

    self.decoder = nn.Sequential()
    self.decoder.add_module(name = f'linear_{0}', module = nn.Linear(input_dim, layer_dims[0]))
    self.decoder.add_module(name = f'relu_{0}', module = nn.ReLU())
    # Hidden layers between the first and the final projection (none when
    # layer_dims has exactly two entries).
    for i in range(len(layer_dims)-2):
      self.decoder.add_module(name = f'linear_{i+1}', module = nn.Linear(layer_dims[i], layer_dims[i+1]))
      self.decoder.add_module(name = f'relu_{i+1}', module = nn.ReLU())
    self.decoder.add_module(name = 'final', module=nn.Linear(layer_dims[-2], layer_dims[-1]))
    self.decoder.add_module(name = 'sigmoid', module=nn.Sigmoid())

  def forward(self, x, labels):
    """Decode latent ``x`` conditioned on integer ``labels``."""
    # one_hot yields an integer tensor; cast it to the latent dtype explicitly
    # instead of relying on implicit type promotion inside torch.cat.
    one_hot = nn.functional.one_hot(labels, num_classes=self.num_labels).to(dtype=x.dtype)
    x = torch.cat((x, one_hot), dim=1)

    x = self.decoder(x)

    return x

In [12]:
class CVAE(nn.Module):
  """Conditional variational autoencoder built from ``Encoder`` and ``Decoder``.

  Args:
    encoder_layer_dims: Encoder layer widths; the first entry is the raw feature
      width (the label width is added internally). Not modified.
    decoder_layer_dims: Decoder layer widths, passed through unchanged.
    latent_dim: Width of the latent space.
    num_labels: Width of the one-hot label that is appended to both the encoder
      input and the decoder input.

  NOTE(review): ``Encoder`` and ``Decoder`` are constructed without a class
  count, so they fall back to their own 2-class one-hot encoding. A
  ``num_labels`` other than 2 would produce a width mismatch in ``forward``.
  """
  def __init__(self, encoder_layer_dims, decoder_layer_dims, latent_dim, num_labels):
    super(CVAE, self).__init__()
    # Build a fresh list instead of `encoder_layer_dims[0] += num_labels`, which
    # modified the caller's list in place (a reused list grew on every call).
    encoder_layer_dims = [encoder_layer_dims[0] + num_labels, *encoder_layer_dims[1:]]
    decoder_input_dim = latent_dim + num_labels

    self.encoder = Encoder(encoder_layer_dims, latent_dim)
    self.decoder = Decoder(decoder_layer_dims, decoder_input_dim)

  def forward(self, x, labels):
    """Encode, sample a latent vector, decode; return ``(mean, log_var, output)``."""
    mean, log_var = self.encoder(x, labels)
    z = self.sampling(mean, log_var)
    output = self.decoder(z, labels)

    return mean, log_var, output

  def sampling(self, mean, log_var):
    """Reparameterised sample ``mean + exp(log_var / 2) * eps`` with ``eps ~ N(0, I)``."""
    # randn_like matches the shape, dtype and device of `mean`, so the method is
    # not restricted to 2-D input.
    epsilon = torch.randn_like(mean)

    return mean + torch.exp(log_var/2) * epsilon

  def generate(self, z, labels):
    """Decode the given latent vectors ``z`` for the given ``labels``."""
    return self.decoder(z, labels)

In [13]:
def calc_loss(output, x, mean, log_var):
    """Per-sample VAE loss: summed squared error plus KL term, divided by batch size.

    Args:
        output: Reconstruction produced by the decoder.
        x: Reconstruction target; ``x.size(0)`` is used as the batch size.
        mean: Latent mean from the encoder.
        log_var: Latent log-variance from the encoder.

    Returns:
        Scalar tensor ``(sum-MSE + KL) / x.size(0)``.
    """
    n_samples = x.size(0)

    recon_term = nn.functional.mse_loss(output, x, reduction='sum')

    # KL divergence between N(mean, exp(log_var)) and the standard normal.
    kl_inner = 1 + log_var - mean.pow(2) - torch.exp(log_var)
    kl_term = -0.5 * torch.sum(kl_inner)

    total = recon_term + kl_term
    return (total/n_samples)

In [27]:
def train_cvae(cvae_model, cvae_optim, cvae_dataloader):
    """Train ``cvae_model`` in place for ``cvae_epochs`` epochs.

    Each batch is moved to the global ``device``, passed through the model,
    scored with ``calc_loss`` and used for one optimizer step. The mean batch
    loss is printed for every 100th epoch (starting with the first) and for
    the final epoch.

    Args:
        cvae_model: Module whose call returns ``(mean, log_var, output)``.
        cvae_optim: Optimizer holding the model's parameters.
        cvae_dataloader: Iterable yielding ``(features, labels)`` batches.
    """
    print("Started training CVAE")

    for epoch in range(cvae_epochs):
        cvae_model.train()
        epoch_loss_total = 0.0
        n_batches = 0

        for features, targets in cvae_dataloader:
            features = features.to(device)
            targets = targets.to(device)

            mean, log_var, reconstruction = cvae_model(features, targets)
            batch_loss = calc_loss(reconstruction, features, mean, log_var)

            # Standard step: clear stale gradients, backpropagate, update.
            cvae_optim.zero_grad()
            batch_loss.backward()
            cvae_optim.step()

            epoch_loss_total += batch_loss.item()
            n_batches += 1

        is_last_epoch = epoch == cvae_epochs-1
        if epoch % 100 == 0 or is_last_epoch:
            print(f'Epoch: {epoch + 1}/{cvae_epochs}, Training Loss: {epoch_loss_total/n_batches:.4f}')

    print("Completed Training CVAE")

<font size="6">Stage-3: Synthesis</font>

In [31]:
def generate_data(cvae_model, diff_num, text_tensor, labels_tensor):
    """Append ``diff_num`` generated label-1 samples to the given data.

    Latent vectors are drawn from a standard normal of width ``latent_dim``
    and decoded with ``cvae_model.generate`` under label 1.

    Args:
        cvae_model: Model exposing ``generate(z, labels)``.
        diff_num: Number of samples to generate.
        text_tensor: Existing feature tensor; generated rows are appended along dim 0.
        labels_tensor: Existing label tensor; the generated labels (all 1) are appended.

    Returns:
        ``(final_text, final_labels)`` as NumPy arrays on the CPU.
    """
    with torch.no_grad():
        synthetic_labels = torch.ones(diff_num, dtype=torch.long, device=device)
        latent_samples = torch.randn([diff_num, latent_dim]).to(device)
        synthetic_text = cvae_model.generate(latent_samples, synthetic_labels)

    combined_text = torch.cat((text_tensor, synthetic_text), dim=0)
    combined_labels = torch.cat((labels_tensor, synthetic_labels), dim=0)

    final_text = np.array(combined_text.to('cpu'))
    final_labels = np.array(combined_labels.to('cpu'))
    return final_text, final_labels

<font size="6">Stage-4: Prediction</font>

In [35]:
def predict(final_text, final_labels):
    """Evaluate logistic regression with stratified 5-fold CV and print the metrics.

    For every fold the confusion matrix yields
    ``pd = TP / (TP + FN)`` and ``pf = FP / (FP + TN)``; the g-measure is the
    harmonic mean of ``pd`` and ``1 - pf``. The fold averages are printed as
    percentages rounded to two decimals.

    Args:
        final_text: 2-D array of features, indexable by integer index arrays.
        final_labels: 1-D array of binary labels (0 / 1).

    NOTE(review): folds are drawn from whatever ``final_text`` contains; if it
    includes synthetic rows, those rows also land in the test folds. Confirm
    this is the intended evaluation protocol.
    """
    classifier = LogisticRegression()

    skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    pd_list, pf_list, g_measure_list = [], [], []

    # Suppress warnings only for the duration of this call; a bare
    # warnings.filterwarnings("ignore") would silence the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for train_index, test_index in skfold.split(final_text, final_labels):
            x_train_fold = final_text[train_index]
            y_train_fold = final_labels[train_index]
            x_test_fold = final_text[test_index]
            y_test_fold = final_labels[test_index]

            classifier.fit(x_train_fold, y_train_fold)
            y_pred_lr = classifier.predict(x_test_fold)

            # labels=[0, 1] pins the matrix to 2x2 even if a fold happens to
            # contain (or predict) only one class, so the unpacking is safe.
            tn, fp, fn, tp = confusion_matrix(y_test_fold, y_pred_lr, labels=[0, 1]).ravel()
            pd_lr = tp/(tp+fn)
            pf_lr = fp/(fp+tn)
            g_measure_lr = (2*pd_lr*(1-pf_lr))/(pd_lr+(1-pf_lr))
            pd_list.append(pd_lr)
            pf_list.append(pf_lr)
            g_measure_list.append(g_measure_lr)

    # Scale to percent first and round last: rounding before multiplying by
    # 100 re-introduces float noise such as 98.42999999999999.
    print(f'g-measure: {round(np.average(g_measure_list)*100, 2)}')
    print(f'pd: {round(np.average(pd_list)*100, 2)}')
    print(f'pf: {round(np.average(pf_list)*100, 2)}')

<font size="6">Results</font>

In [36]:
def get_result(dataset, seed=None):
    """Run the four-stage pipeline on one dataset and print its metrics.

    Stages: (1) embed the dataset, (2) train a fresh CVAE on the embeddings,
    (3) generate as many label-1 samples as the difference between the label-0
    and label-1 counts, (4) cross-validate a classifier on the combined data.

    Args:
        dataset: Dataset name, forwarded to ``get_dataset``.
        seed: Optional integer passed to ``torch.manual_seed`` before any
            stochastic step (CVAE weight initialisation, batch shuffling,
            latent sampling) so that a run can be repeated. ``None`` (the
            default) leaves the random state untouched.
    """
    print(f'For {dataset} dataset:')

    if seed is not None:
        torch.manual_seed(seed)

    #Stage-1: Text Representation
    text_tensor, labels_tensor = get_dataset(dataset, tokenizer, model)
    # Number of label-1 samples needed to match the label-0 count.
    diff_num = int(torch.sum(labels_tensor == 0) - torch.sum(labels_tensor == 1))
    cvae_dataset = torch.utils.data.TensorDataset(text_tensor, labels_tensor)
    cvae_dataloader = torch.utils.data.DataLoader(cvae_dataset, batch_size=batch_size, shuffle=True)

    #Stage-2: CVAE Training
    cvae = CVAE(encoder_layer_dims = [input_dim, hidden_dim],
                decoder_layer_dims = [hidden_dim, input_dim],
                latent_dim=latent_dim, num_labels = 2)
    cvae.to(device)
    cvae_optim = torch.optim.Adam(cvae.parameters(), lr=lr)
    train_cvae(cvae, cvae_optim, cvae_dataloader)

    #Stage-3: Synthesize
    final_text, final_labels = generate_data(cvae, diff_num, text_tensor, labels_tensor)

    #Stage-4: Prediction
    predict(final_text, final_labels)

In [37]:
# Run the full pipeline (embed, train CVAE, synthesize, evaluate) on the Ambari dataset.
get_result(dataset="Ambari")

For Ambari dataset:
Started training CVAE
Epoch: 1/200, Training Loss: 143.0759
Epoch: 101/200, Training Loss: 46.9731
Epoch: 200/200, Training Loss: 46.9481
Completed Training CVAE
g-measure: 98.42999999999999
pd: 97.11999999999999
pf: 0.21


In [38]:
# Run the full pipeline (embed, train CVAE, synthesize, evaluate) on the Camel dataset.
get_result(dataset="Camel")

For Camel dataset:
Started training CVAE
Epoch: 1/200, Training Loss: 139.6729
Epoch: 101/200, Training Loss: 43.7113
Epoch: 200/200, Training Loss: 43.6520
Completed Training CVAE
g-measure: 98.27
pd: 96.7
pf: 0.1


In [41]:
# Run the full pipeline (embed, train CVAE, synthesize, evaluate) on the Chromium dataset.
get_result(dataset="Chromium")

For Chromium dataset:
Started training CVAE
Epoch: 1/200, Training Loss: 43.2202
Epoch: 101/200, Training Loss: 40.1016
Epoch: 200/200, Training Loss: 40.1011
Completed Training CVAE
g-measure: 99.77000000000001
pd: 99.53999999999999
pf: 0.0


In [39]:
# Run the full pipeline (embed, train CVAE, synthesize, evaluate) on the Derby dataset.
get_result(dataset="Derby")

For Derby dataset:
Started training CVAE
Epoch: 1/200, Training Loss: 139.1097
Epoch: 101/200, Training Loss: 43.6019
Epoch: 200/200, Training Loss: 43.5409
Completed Training CVAE
g-measure: 95.74000000000001
pd: 92.54
pf: 0.77


In [40]:
# Run the full pipeline (embed, train CVAE, synthesize, evaluate) on the Wicket dataset.
get_result(dataset="Wicket")

For Wicket dataset:
Started training CVAE
Epoch: 1/200, Training Loss: 156.0859
Epoch: 101/200, Training Loss: 43.6146
Epoch: 200/200, Training Loss: 43.5827
Completed Training CVAE
g-measure: 99.49
pd: 98.99
pf: 0.0
