In [None]:
!pip install tokenizers
!pip install --upgrade gensim



In [None]:
!pip install numpy



In [None]:
import math
import sys
import time
import datetime
import copy

from gensim.models import KeyedVectors, Word2Vec

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from datetime import datetime as dt
from openpyxl import load_workbook
from openpyxl.styles import Font

from tokenizers import Tokenizer

In [None]:
# Suffix of the results workbook file name; train() appends it to
# "binary_classification_results_" when opening and saving the .xlsx file.
excel_name = 'results'

In [None]:
torch.cuda.is_available()

False

In [None]:
# Vocabulary ids reserved for padding and unknown tokens; main() builds the
# vocabulary as ["<pad>", "<unk>"] + pretrained words, which matches these ids.
PAD_IDX = 0
UNK_IDX = 1

# NOTE(review): a `global` statement at module scope has no effect; the names
# are actually assigned inside main(), which declares them global itself.
global word2index
global embedding

In [None]:
def pad_sents(sents, pad_idx):
    """
    Right-pad every sentence with `pad_idx` up to the length of the longest one.

    sents: list of sentences, each a list of items (token ids in this notebook).
    pad_idx: filler value appended to the shorter sentences.
    Returns a new outer list; sentences that already have the maximum length
    are reused as-is rather than copied.
    """
    longest = max(len(sent) for sent in sents)
    return [
        sent + [pad_idx] * (longest - len(sent)) if len(sent) < longest else sent
        for sent in sents
    ]


def word2indices(sents, word2index):
    """
    Map every token of every tokenized sentence to its vocabulary id.

    Tokens that are not present in `word2index` are replaced by UNK_IDX.
    Returns a list of id lists with the same nesting as `sents`.
    """
    indexed_sents = []
    for sent in sents:
        ids = []
        for token in sent:
            ids.append(word2index[token] if token in word2index else UNK_IDX)
        indexed_sents.append(ids)
    return indexed_sents


#     return [[get_token_idx(word2index, token, UNK_IDX) for token in sent] for sent in sents]

def indices2word(sents, index2word):
    """
    Map every token id of every sentence back to its word.

    `index2word` is indexed directly with each id, so an id it does not
    contain raises whatever error the container raises.
    """
    decoded = []
    for sent in sents:
        decoded.append([index2word[token] for token in sent])
    return decoded


def to_tensor(sents, pad_idx=0, device=torch.device("cuda")):
    """
    Turn a batch of tokenized sentences into one padded tensor of token ids.

    Tokens are looked up in the module-level `word2index`, shorter sentences
    are padded with `pad_idx`, and the result is created on `device`.
    Returns a LongTensor of shape (batch_size, max_seq_len).
    """
    padded = pad_sents(word2indices(sents, word2index), pad_idx)
    return torch.tensor(padded, dtype=torch.long, device=device)  # (batch_size, max_seq_len)


def generate_sent_masks(sents, lengths, device):
    """
    Build the padding mask for a batch from the sentence lengths.

    Assumes `lengths` is sorted in descending order, so lengths[0] is the
    longest sentence. Returns a bool tensor of shape (batch, lengths[0]) on
    `device` that is True at padded positions and False at real tokens.
    """
    longest = lengths[0]
    n_rows = sents.shape[0]
    positions = torch.arange(longest).expand(n_rows, longest)
    is_token = positions < lengths.unsqueeze(1)
    return ~is_token.bool().to(device)


def batch_iter(data, bs, shuffle=False):
    """
    Yield (sents, labels) batches of at most `bs` samples.

    Each sample is a (sentence, label) pair. When `shuffle` is set, the sample
    order is permuted with np.random before batching. Within every batch the
    samples are sorted from the longest sentence to the shortest.
    """
    n_batches = math.ceil(len(data) / bs)
    order = list(range(len(data)))

    if shuffle:
        np.random.shuffle(order)

    for batch_no in range(n_batches):
        lo = batch_no * bs
        chunk = [data[idx] for idx in order[lo: lo + bs]]
        chunk.sort(key=lambda sample: len(sample[0]), reverse=True)
        yield [sample[0] for sample in chunk], [sample[1] for sample in chunk]


def load_wv(filename, fasttext=True, limit=None):
    """
    Load pretrained word vectors from `filename`.

    fasttext=True: read a word2vec-format file with KeyedVectors; the file is
        treated as binary when its name ends with '.bin', and `limit` is
        forwarded to the loader.
    fasttext=False: load a saved gensim Word2Vec model (`limit` is ignored).
    """
    if not fasttext:
        return Word2Vec.load(filename)

    binary = filename.endswith('.bin')
    print(filename, "binary: ", binary)
    return KeyedVectors.load_word2vec_format(filename,
                                             binary=binary,
                                             limit=limit,
                                             unicode_errors='ignore')


def masked_pooling(hidden_states, masks):
    """
    Max- and mean-pool the hidden states over the sequence axis, ignoring padding.

    hidden_states: output of the RNN-like model, indexed (batch, seq, hidden).
    masks: bool padding mask (batch, seq), True where a position is padding.
    Returns (max_pool, avg_pool), each of shape (batch, hidden).
    """
    pad = masks[:, :, None]
    seq_len = hidden_states.size(1)

    # Mean over all positions with padding zeroed, then rescaled so that the
    # divisor is the number of real tokens instead of the padded length.
    avg_pool = hidden_states.masked_fill(pad, 0).mean(dim=1)
    n_pad = masks.type(avg_pool.dtype).sum(dim=1)
    avg_pool *= seq_len / (seq_len - n_pad)[:, None]

    # Padding is set to -inf so it can never win the max.
    max_pool, _ = hidden_states.masked_fill(pad, -float('inf')).max(dim=1)

    return max_pool, avg_pool


def bn_drop_linear(n_in, n_out, bn=True, p=0.0, activation=None):
    """
    Build the layer list for one classifier stage.

    The list contains, in order: BatchNorm1d(n_in) if `bn` is True, Dropout(p)
    if `p` is non-zero, Linear(n_in, n_out), and `activation` if it is given.

    Returns a plain Python list of modules (not an nn.Sequential), so callers
    can concatenate several stages before wrapping them.
    """
    # Start from an empty list when batch norm is disabled; the previous `None`
    # made every later `.append` fail for bn=False.
    layers = [nn.BatchNorm1d(n_in)] if bn else []
    if p != 0:
        layers.append(nn.Dropout(p))
    layers.append(nn.Linear(n_in, n_out))
    if activation is not None:
        layers.append(activation)
    return layers


In [None]:
class ClassifierModel(nn.Module):
    """
    LSTM sentence classifier with concat pooling.

    A batch of tokenized sentences is embedded, run through an LSTM, and the
    final hidden state of the last layer is concatenated with the masked max
    and mean pools of the LSTM outputs. That feature vector feeds a stack of
    BatchNorm/Dropout/Linear stages built by bn_drop_linear.

    embeddings: embedding module; its `embedding_dim` sets the LSTM input size.
    hidden_size: LSTM hidden size per direction.
    out_size: number of output logits.
    linears: hidden widths of the classifier head (default [64]).
    drops: dropout probability per classifier stage (default 0.3 each).
    activations: activation per classifier stage (default ReLU, none on the last).
    device: device on which forward() creates the input tensors.
    """

    def __init__(self, embeddings, hidden_size, out_size, num_layers=1,
                 bidirectional=True, dropout_p=0.1, linears=None, drops=None,
                 activations=None, pad_idx=0, device="cpu"):
        super(ClassifierModel, self).__init__()

        self.device = device
        self.embeddings = embeddings
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout_p = dropout_p
        self.pad_idx = pad_idx

        self.dropout = nn.Dropout(p=dropout_p)
        # Size the LSTM from the embedding layer that was passed in, not from a
        # module-level global, so load() also works in a fresh kernel.
        self.lstm = nn.LSTM(embeddings.embedding_dim, hidden_size,
                            num_layers=num_layers, bidirectional=bidirectional,
                            batch_first=True)

        if bidirectional:
            hidden_size *= 2
        if linears is None:
            linears = [64]
        # Keep the hidden widths so save() can persist a non-default head.
        self.linears = list(linears)

        class_layers = []
        linears = [hidden_size * 3] + self.linears + [out_size]  # hidden_size*3 bcs of concat pooling.
        if drops is None:
            drops = [0.3] * (len(linears) - 1)
        if activations is None:
            activations = [nn.ReLU(inplace=True)] * (len(linears) - 2) + [None]

        if len(linears) - 1 != len(drops):
            raise ValueError("Number of layers and dropout values don't match.")
        self.drops = list(drops)

        for n_in, n_out, p, actn in zip(linears[:-1], linears[1:], drops, activations):
            class_layers += bn_drop_linear(n_in, n_out, p=p, activation=actn)

        self.classifier = nn.Sequential(*class_layers)

    def forward(self, sents):
        """
        Compute logits for a batch of tokenized sentences.

        sents: list of token lists, sorted from longest to shortest (required
            by pack_padded_sequence and generate_sent_masks).
        Returns a tensor of shape (batch_size, out_size).
        """
        sent_lens = torch.tensor([len(sent) for sent in sents])
        sents_tensor = to_tensor(sents, device=self.device)  # (bs, max_seq_len)
        sent_masks = generate_sent_masks(sents_tensor, sent_lens, device=self.device)

        X = self.embeddings(sents_tensor)
        X = self.dropout(X)
        X = pack_padded_sequence(X, sent_lens, batch_first=True)

        hidden_outputs, (h_n, c_n) = self.lstm(X)
        hidden_outputs, _ = pad_packed_sequence(hidden_outputs, batch_first=True)

        max_pool, avg_pool = masked_pooling(hidden_outputs, sent_masks)

        # The last layer contributes two final states (forward and backward)
        # only when the LSTM is bidirectional; a unidirectional LSTM has one.
        # This keeps the feature width at hidden_size*3 in both cases.
        if self.bidirectional:
            last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            last_hidden = h_n[-1]
        features = torch.cat((last_hidden, max_pool, avg_pool), dim=1)

        X = self.dropout(features)
        X = self.classifier(X)
        return X

    def save(self, path):
        """
        Write the constructor arguments, the embedding module and the
        state dict to `path` with torch.save.

        Custom `activations` are not persisted; load() rebuilds the default
        activation stack.
        """
        params = {
            "args": dict(hidden_size=self.hidden_size,
                         out_size=self.out_size,
                         num_layers=self.num_layers,
                         bidirectional=self.bidirectional,
                         dropout_p=self.dropout_p,
                         linears=self.linears,
                         drops=self.drops,
                         pad_idx=self.pad_idx),
            "embeddings": self.embeddings,
            "state_dict": self.state_dict(),
        }
        torch.save(params, path)

    @staticmethod
    def load(path):
        """
        Rebuild a model from a file written by save(); tensors are mapped to CPU.

        Files saved without "linears"/"drops" in their args still load and use
        the constructor defaults.
        """
        # NOTE(review): the checkpoint pickles a whole embedding module; recent
        # PyTorch releases presumably need weights_only=False for that — confirm
        # against the installed version.
        params = torch.load(path, map_location=lambda storage, loc: storage)
        args = params["args"]
        embeddings = params["embeddings"]
        model = ClassifierModel(embeddings, **args)
        model.load_state_dict(params["state_dict"])
        return model


In [None]:

def generate_prediction_mask(predictions, threshold, other_idx=29):
    """
    Binarize per-class scores and keep the "other" class exclusive.

    predictions: 2-D array-like of scores, one row per sample.
    threshold: a score strictly greater than this counts as a positive.
    other_idx: column of the catch-all class; it is cleared in every row that
        also has at least one other positive. Defaults to 29, the column the
        original code hard-coded.
    Returns a uint8 array of 0/1 with the same shape as `predictions`.
    """
    predictions = np.asarray(predictions)
    mask = np.zeros_like(predictions)
    mask[predictions > threshold] = 1

    positives_per_row = np.sum(mask, axis=1)
    redundant_other = np.logical_and(mask[:, other_idx] == 1, positives_per_row > 1)
    mask[redundant_other, other_idx] = 0
    return mask.astype(np.uint8)


def calculate_f1(y_true, y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`."""
    return metrics.f1_score(y_true, y_pred, average='macro')


def f1_threshold(y_true, logits):
    """
    Sweep 21 evenly spaced thresholds in [0, 1] and report the best macro F1.

    For each threshold the scores are binarized with generate_prediction_mask
    and scored against `y_true`; scores are rounded to 4 decimals. The best
    threshold/score pair and the full score list are printed.
    Returns the best (rounded) macro F1.
    """
    thresholds = np.linspace(0, 1, 21)
    f1_scores = [
        round(metrics.f1_score(y_true,
                               generate_prediction_mask(logits, threshold=candidate),
                               average='macro'), 4)
        for candidate in thresholds
    ]

    best = np.argmax(f1_scores)
    max_f1_score = f1_scores[best]

    print("Best threshold/score pair: {}/{}".format(thresholds[best], max_f1_score))
    print(f1_scores)
    return max_f1_score


def train_step(model, loss_fn, optimizer, data, scheduler=None, bs=32):
    """
    Run one training pass over `data` in shuffled batches of size `bs`.

    Each batch: forward, loss on CPU, backward, gradient-norm clipping at 1.0,
    optimizer step, and a scheduler step when a scheduler is given. Progress is
    printed every 500 batches.
    Returns the summed batch loss divided by the number of batches.
    """
    running_loss = 0.0
    model.train()
    started = time.time()
    n_batches = math.ceil(len(data) / bs)

    for step, (sents, labels) in enumerate(batch_iter(data, bs, shuffle=True)):

        if step != 0 and step % 500 == 0:
            elapsed = time.time() - started
            print("Batch {}/{}\tElapsed since: {}".format(step, n_batches,
                                                          str(datetime.timedelta(seconds=round(elapsed)))))

        targets = torch.from_numpy(np.vstack(labels)).float()
        optimizer.zero_grad()
        logits = model(sents)
        loss = loss_fn(logits.cpu(), targets)
        running_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

    return running_loss / n_batches


def eval_step(model, loss_fn, data, bs=32):
    """
    Evaluate `model` on `data` without gradient tracking.

    Batches are taken in order (no shuffling). The loss is computed on the raw
    logits; the sigmoid of the logits is what gets passed to get_metrics.
    Returns (average loss per batch, metrics dict from get_metrics).
    """
    running_loss = 0.0
    model.eval()
    n_batches = math.ceil(len(data) / bs)
    gold, probs = [], []

    with torch.no_grad():
        for sents, labels in batch_iter(data, bs):
            targets = torch.from_numpy(np.vstack(labels)).float()
            logits = model(sents)
            running_loss += loss_fn(logits.cpu(), targets).item()

            gold.extend(targets.numpy().tolist())
            probs.extend(torch.sigmoid(logits).cpu().detach().numpy().tolist())

    average_eval_loss = running_loss / n_batches
    scores = get_metrics(gold, probs)
    return average_eval_loss, scores


def get_metrics(y_true, y_preds):
    """
    Compute the validation metrics for binary predictions.

    y_true: ground-truth labels.
    y_preds: predicted probabilities; they are rounded to 0/1 for every metric
        except ROC AUC, which uses the raw probabilities.
    Returns a dict with keys "f1", "f2", "roc_auc", "precision", "recall",
    "accuracy_score" and "classification_report". Precision, recall and F1 are
    macro-averaged; F2 is derived from the macro precision and recall.
    """
    # Round once instead of once per metric.
    y_hard = np.round(y_preds)
    precision = metrics.precision_score(y_true, y_hard, average='macro')
    recall = metrics.recall_score(y_true, y_hard, average='macro')

    # F2 = 5PR / (4P + R). Report 0.0 when precision and recall are both zero
    # instead of dividing by zero.
    f2_denominator = 4 * precision + recall
    f2 = (5 * precision * recall) / f2_denominator if f2_denominator > 0 else 0.0

    return {
        "f1": metrics.f1_score(y_true, y_hard, average="macro"),
        "f2": f2,
        "roc_auc": metrics.roc_auc_score(y_true, y_preds),
        "precision": precision,
        "recall": recall,
        "accuracy_score": metrics.accuracy_score(y_true, y_hard),
        "classification_report": metrics.classification_report(y_true, y_hard, labels=[0, 1])
    }


In [None]:
global excel_name
def train(model, loss_fn, optimizer, train_data, valid_data=None, scheduler=None, n_epochs=5, bs=32, model_kwargs=None):
    """
    Train `model` for up to `n_epochs` epochs with periodic validation.

    Roughly twice per epoch (every `valid_niter` batches), when `valid_data` is
    given, the model is evaluated and the results are appended to the results
    workbook named by the module-level `excel_name`. After each validation:
      * the model with the best validation F1 so far is deep-copied;
      * if the validation loss is the lowest so far, the model and optimizer
        state are saved to disk;
      * otherwise patience grows; after 5 misses in a row the learning rate is
        halved, and after 5 such decays training stops early.

    model_kwargs: only written into the workbook header for bookkeeping.

    Returns (best_f1_model, train_losses, valid_losses) on both the normal and
    the early-stop path. best_f1_model is None when no validation ran;
    valid_losses is [] when `valid_data` is None.
    """
    train_losses = []
    valid_losses = []
    hist_valid_scores = []
    patience = 0
    total_patience = 5
    num_trial = 0
    max_num_trial = 5
    lr_decay = 0.5
    model_save_path = f"drive/MyDrive/code-security/assets/models/{dt.now().strftime('%Y-%m-%d-%H-%M-%S')}_{n_epochs}_model.bin"
    # Clamp to 1 so a dataset smaller than two batches does not hit `step % 0`.
    valid_niter = max(1, int((len(train_data) / bs) / 2))
    best_f1_model = None
    best_f1_all = 0.0

    for epoch in range(n_epochs):

        total_loss = 0.0
        model.train()
        start_time = time.time()
        total_batch = math.ceil(len(train_data) / bs)

        for step, batch in enumerate(batch_iter(train_data, bs, shuffle=True), start=1):

            if step % 500 == 0:
                elapsed_since = time.time() - start_time
                print("Batch {}/{}\tElapsed since: {}".format(step, total_batch,
                                                              str(datetime.timedelta(seconds=round(elapsed_since)))))
            sents, labels = batch
            labels = torch.from_numpy(np.vstack(labels)).float()
            optimizer.zero_grad()
            logits = model(sents)
            train_loss = loss_fn(logits.cpu(), labels)
            total_loss += train_loss.item()
            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()

            if step % valid_niter == 0 and valid_data is not None:
                results_path = "drive/MyDrive/code-security/assets/results/binary/binary_classification_results_"+str(excel_name)+".xlsx"
                result_excel = load_workbook(results_path)
                result_sheet = result_excel.active

                # Header row describing this run.
                row = result_sheet.max_row + 1
                result_sheet["A" + str(row)] = str(model_kwargs) + " \n " + "bs : " + str(bs) + " \n" + "epoch : " + str(n_epochs)
                result_sheet["A" + str(row)].font = Font(bold=True)
                row += 1

                valid_loss, valid_metrics = eval_step(model, loss_fn, valid_data, bs=128)
                valid_losses.append(valid_loss)

                result_sheet["A" + str(row)] = "Valid_loss"
                result_sheet["B" + str(row)] = str(valid_loss)
                # Move past the loss row before writing metrics; previously the
                # first metric overwrote the "Valid_loss" row.
                row += 1

                for metric, score in valid_metrics.items():
                    result_sheet["A" + str(row)] = metric
                    result_sheet["B" + str(row)] = score
                    row += 1

                result_excel.save(results_path)

                is_better = len(hist_valid_scores) == 0 or valid_loss < min(hist_valid_scores)
                hist_valid_scores.append(valid_loss)
                if valid_metrics["f1"] > best_f1_all:
                    best_f1_all = valid_metrics["f1"]
                    best_f1_model = copy.deepcopy(model)

                if is_better:
                    patience = 0
                    model.save(model_save_path)
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')

                elif patience < total_patience:
                    patience += 1
                    print("Hit patience %d" % patience, file=sys.stderr)

                    if patience == total_patience:
                        num_trial += 1
                        print("Hit #%d trial" % num_trial, file=sys.stderr)

                        if num_trial == max_num_trial:
                            print("EARLY STOP!", file=sys.stderr)
                            # Same 3-tuple as the normal return, so callers can
                            # unpack the result on either path.
                            return best_f1_model, train_losses, valid_losses

                        lr = optimizer.param_groups[0]['lr'] * lr_decay

                        for param_group in optimizer.param_groups:
                            param_group["lr"] = lr

                        patience = 0

                # eval_step switched the model to eval mode.
                model.train()

            if scheduler is not None:
                scheduler.step()

        train_loss = total_loss / total_batch
        train_losses.append(train_loss)

        elapsed_time = time.time() - start_time
        print("Epoch {}/{} is done. Took: {} Loss: {:.5f}".format(epoch + 1,
                                                                  n_epochs,
                                                                  str(datetime.timedelta(seconds=round(elapsed_time))),
                                                                  train_loss))

    return best_f1_model, train_losses, valid_losses if valid_data is not None else []


In [None]:
def main():
    """
    End-to-end training run for the binary classifier.

    Loads the BPE tokenizer and the train/validation CSVs, tokenizes the "code"
    column, builds the vocabulary and embedding layer from the pretrained
    Word2Vec model (with two extra zero rows for <pad> and <unk>), trains a
    ClassifierModel, and saves the model with the best validation F1.

    Side effects: assigns the module-level `word2index` and `embedding`, and
    reads from / writes to paths under drive/MyDrive/code-security.
    """
    DEBUG = False
    tokenizer = Tokenizer.from_file("drive/MyDrive/code-security/datasets/word_embedding_model/notmarked/bpe_30k_no_mark_all_types.json")

    train_data = pd.read_csv("drive/MyDrive/code-security/datasets/binary_classification/train/notmarked/notmarked_sard_vdisc_train.csv")
    valid_data = pd.read_csv("drive/MyDrive/code-security/datasets/binary_classification/validation/notmarked_sard_and_vdisc_validate.csv")

    if DEBUG:
        train_data = train_data.sample(1000, random_state=42)
        valid_data = valid_data.sample(1000, random_state=42)

    global word2index
    global embedding

    # encode_batch is documented to take a list of inputs, so pass plain lists
    # rather than pandas Series.
    train_sents = tokenizer.encode_batch(train_data["code"].tolist())
    train_sents = [item.tokens for item in train_sents]

    train_labels = train_data["Label"].values

    valid_sents = tokenizer.encode_batch(valid_data["code"].tolist())
    valid_sents = [item.tokens for item in valid_sents]

    valid_labels = valid_data["Label"].values

    wv_fn = "drive/MyDrive/code-security/datasets/word_embedding_model/notmarked/all_data_git_no_mark_sg_e_10_vec_300.model"  # Vec file word2vec
    w2v_model = load_wv(wv_fn, fasttext=False)

    # Two zero vectors are prepended for <pad> (index 0) and <unk> (index 1).
    additional_vectors = np.zeros(shape=(2, 300))
    index2word = ["<pad>", "<unk>"] + w2v_model.wv.index_to_key
    word2index = {word: index for index, word in enumerate(index2word)}
    weights = np.concatenate((additional_vectors, w2v_model.wv.vectors))

    weights = torch.from_numpy(weights).float()
    embedding = nn.Embedding.from_pretrained(weights, padding_idx=0)

    train_samples = list(zip(train_sents, train_labels))
    valid_samples = list(zip(valid_sents, valid_labels))

    HIDDEN_SIZE = 128
    NUM_LAYERS = 2
    BIDIRECTIONAL = True
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model_kwargs = {
        "hidden_size": HIDDEN_SIZE,
        "out_size": 1,
        "dropout_p": 0.3,
        "num_layers": NUM_LAYERS,
        "bidirectional": BIDIRECTIONAL,
        #     "linears": [64],
        #     "drops":,
        #     "drops": [0.3],
        "device": device
    }

    model = ClassifierModel(embedding, **model_kwargs)
    model.to(device)

    uniform_init = 0.1
    print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
    for p in model.parameters():
        # Skip frozen parameters: from_pretrained() freezes the embedding
        # weights, and re-initializing them would throw away the pretrained
        # vectors (and the zero padding row) that were just loaded.
        if p.requires_grad:
            p.data.uniform_(-uniform_init, uniform_init)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    best_f1_model, train_losses, valid_losses = train(model, criterion, optimizer, train_samples,
                                                      valid_data=valid_samples, scheduler=None,
                                                      n_epochs=5, bs=32, model_kwargs=model_kwargs)

    if best_f1_model is not None:
        best_f1_model.save(f"drive/MyDrive/code-security/assets/best_f1_models/{dt.now().strftime('%Y-%m-%d-%H-%M-%S')}_best_f1_model.bin")
    

In [None]:
import random
import os 
def seed_everything(seed=42):
    """
    Seed every random number generator this notebook uses.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices), sets
    PYTHONHASHSEED in the environment, and puts cuDNN into deterministic mode
    with autotuning disabled so that convolution algorithm selection does not
    vary between runs.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may pick different algorithms from run to run.
    torch.backends.cudnn.benchmark = False

# Seed the RNGs with the default seed, then launch the full training run.
seed_everything()
main()

Label counts for the training data:
0    950214
1    112301
Name: Label, dtype: int64
Label counts for the validation data:
0    118868
1     14982
Name: Label, dtype: int64


uniformly initialize parameters [-0.100000, +0.100000]


...Training for 5 epochs...
Number of training samples:  1062515
Batch 500/33204	Elapsed since: 1:01:25
Batch 1000/33204	Elapsed since: 1:55:22
Batch 1500/33204	Elapsed since: 2:49:33
Batch 2000/33204	Elapsed since: 3:41:23
Batch 2500/33204	Elapsed since: 4:36:34
Batch 3000/33204	Elapsed since: 5:28:55
Batch 3500/33204	Elapsed since: 6:22:12
Batch 4000/33204	Elapsed since: 7:14:36
Batch 4500/33204	Elapsed since: 8:18:56
Batch 5000/33204	Elapsed since: 9:28:34
Batch 5500/33204	Elapsed since: 10:40:12


In [None]:
# NOTE(review): main() reads and writes paths under drive/MyDrive, yet this
# mount cell sits after the cell that calls main() — presumably it has to be
# run first; confirm and consider moving it to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# result_excel = load_workbook("drive/MyDrive/code-security/assets/results/binary/binary_classification_results.xlsx")
# result_sheet = result_excel.active
# excel_sheet_index = result_sheet.max_row + 1
# print(excel_sheet_index)
# # result_sheet["A" + str(excel_sheet_index)] =  "dsfsdf"
# # result_excel.save("drive/MyDrive/code-security/assets/results/binary/binary_classification_results.xlsx")

# last_empty_row = len(list(result_sheet.rows))
# last_empty_row
# len(result_sheet['A'])

In [None]:
# NOTE(review): in the visible notebook `valid_data` is only assigned inside
# main(), so this cell presumably relies on leftover kernel state and would
# fail after Restart & Run All — confirm or remove.
valid_data[valid_data['CWE-469'] == True]