In [1]:
# General imports
import os
import random
import math
import itertools
import pandas as pd
from tqdm import tqdm
import numpy as np
from datasets import load_metric

# pytorch imports
import torch
import pytorch_lightning as pl
import torchmetrics
from torch.utils.data import Dataset, DataLoader

# Transformer tokenizer imports
from transformers import BertTokenizerFast

# Transformers Bert model
from transformers import BertModel, BertForPreTraining, Trainer, TrainingArguments, EarlyStoppingCallback, BertConfig

MAX_SEQ_LEN = 512

`fused_weight_gradient_mlp_cuda` module not found. gradient accumulation fusion with weight gradient computation disabled.
2022-06-09 09:13:39.860317: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# GPU settings
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["NVIDIA_VISIBLE_DEVICES"] = "1"

import random
from datetime import datetime
random.seed(datetime.now())

In [3]:
def load_tokenizer(tokenizer_path):
    # load tokenizer from dict
    tokenizer =  BertTokenizerFast.from_pretrained(tokenizer_path)
    return tokenizer

In [4]:
class SimDataset(torch.utils.data.Dataset):

    def __init__(self, dataset_path, tokenizer):
        
        self.data_store = []
        df = pd.read_csv(dataset_path, sep="\t").fillna('')
        #self.samples = df.iloc[:, [2, 5, 6]]
        self.samples = df[["ot_func1", "ot_func2", "label"]]
        self.tokenizer = tokenizer
            
        self.__init_structures()
         
    def __init_structures(self):
        
        for first_asm, second_asm, label in tqdm(self.samples.values):
            first_asm_example = self.tokenizer(text=" ".join(first_asm.split(" NEXT_I ")), truncation=True, max_length=MAX_SEQ_LEN)
            second_asm_example = self.tokenizer(text=" ".join(second_asm.split(" NEXT_I ")), truncation=True, max_length=MAX_SEQ_LEN)
            
            example = {"first": first_asm_example,
                      "second": second_asm_example,
                      "label": label}
            
            self.data_store.append(example)
            
        random.shuffle(self.data_store)
                
    def __len__(self) -> int:
        return len(self.data_store)

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.data_store[idx]
    
    def save_to_file(self,save_file):
        torch.save(self.data_store, save_file)

In [5]:
class AsmDataModule(pl.LightningDataModule):

    def __init__(self, train_path, val_path, test_path, batch_size, tokenizer):
        
        super().__init__()
        
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.tokenizer = tokenizer
        
        self.batch_size = batch_size

        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def setup(self, stage=None):
        
        if stage == 'fit':
            self.train_dataset = SimDataset(self.train_path, self.tokenizer,)
            self.val_dataset   = SimDataset(self.val_path, self.tokenizer)
            
        elif stage == 'test':
            self.test_dataset = SimDataset(self.test_path, self.tokenizer)
            
    def __pad(self, samples):
        
        all_tokens_ids = []
        all_masks = []
        
        max_tok_batch = max([len(block["input_ids"]) for block in samples])
        
        for block in samples:
             
            num_pad_inst = max_tok_batch - len(block["input_ids"])
            all_tokens_ids.append(block["input_ids"] + [self.tokenizer.pad_token_id] * num_pad_inst)
            all_masks.append(block["attention_mask"] + [0] * num_pad_inst)
            
        return {"input_ids": torch.tensor(all_tokens_ids, device="cuda"),
               "attention_mask": torch.tensor(all_masks, device="cuda")}
        

    def collate_with_padding(self, batch):
        
        first_p  = [elem["first"] for elem in batch]
        second_p = [elem["second"] for elem in batch]
        batch_labels = [elem["label"] if elem["label"] else -1 for elem in batch]
        
        first_p = self.__pad(first_p)
        second_p = self.__pad(second_p)
        
        batch_result = {"first": first_p,
                        "second": second_p,
                        "label": torch.tensor(batch_labels, device="cuda")}
        
        return batch_result

    def train_dataloader(self, *args, **kwargs):    
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size, 
                          collate_fn=self.collate_with_padding)

    def val_dataloader(self, *args, **kwargs):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size, 
                          collate_fn=self.collate_with_padding)

    def test_dataloader(self, *args, **kwargs):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size, 
                          collate_fn=self.collate_with_padding)

In [6]:
base_path = "../../"
prt_model = os.path.join(base_path, "models", "pretraining_model", "checkpoint-67246")
train_path = os.path.join(base_path, "dataset", "finetuning_dataset", "similarity", "functions", "train_functions_similarity_triplets.csv")
val_path = os.path.join(base_path, "dataset", "finetuning_dataset", "similarity", "functions", "val_functions_similarity_triplets.csv")
tokenizer_path = os.path.join(base_path, "tokenizer")

model_name =  f"BinBert_function_similarity"
output_model_path = os.path.join(base_path, "models", "finetuned_models", "similarity", "functions", model_name)

In [7]:
from_scratch = False

LEARNING_RATE = 0.00001
BATCH_SIZE = 16

NUM_TRAIN_EPOCHS = 20
PER_DEVICE_TRAIN_BATCH_SIZE = 32
PER_DEVICE_EVAL_BATCH_SIZE = 32
DATA_LOADER_NUM_WORKERS = 4
PATIENCE = 3

#models
BXSMAL="bert_xsmall"
BSMAL="bert_small"
BNORM="bert_normal"
BLARG="bert_larg"

MODEL=BNORM

if MODEL == BXSMAL:
    MAX_SEQ_LEN = 512
    MAX_POSITION_EMBEDDINGS = 514
    HIDDEN_SIZE = 128
    INTERMEDIATE_SIZE = 1024
    NUM_ATTENTION_HEADS = 8
    NUM_HIDDEN_LAYERS = 12
    TYPE_VOCAB_SIZE = 2

if MODEL == BSMAL:
    MAX_SEQ_LEN = 512
    MAX_POSITION_EMBEDDINGS = 514
    HIDDEN_SIZE = 512
    INTERMEDIATE_SIZE = 2048
    NUM_ATTENTION_HEADS = 8
    NUM_HIDDEN_LAYERS = 12
    TYPE_VOCAB_SIZE = 2
    
if MODEL == BNORM:
    MAX_SEQ_LEN = 512
    MAX_POSITION_EMBEDDINGS = 514
    HIDDEN_SIZE = 768
    INTERMEDIATE_SIZE = 3072
    NUM_ATTENTION_HEADS = 12
    NUM_HIDDEN_LAYERS = 12
    TYPE_VOCAB_SIZE = 2    
    
if MODEL == BLARG:
    MAX_SEQ_LEN = 512
    MAX_POSITION_EMBEDDINGS = 514
    HIDDEN_SIZE = 1024
    INTERMEDIATE_SIZE = 4096
    NUM_ATTENTION_HEADS = 16
    NUM_HIDDEN_LAYERS = 24
    TYPE_VOCAB_SIZE = 2

In [8]:
class SiameseFinenuting(pl.LightningModule):

    def __init__(self, model_path, batch_size, vocab=None):
        
        super().__init__()
        
        self.batch_size = batch_size
        
        # Model
        if from_scratch:
            print("From scratch")
            config = BertConfig(
                vocab_size = len(vocab),
                max_position_embeddings = MAX_POSITION_EMBEDDINGS,
                hidden_size = HIDDEN_SIZE,
                intermediate_size = INTERMEDIATE_SIZE,
                num_attention_heads = NUM_ATTENTION_HEADS,
                num_hidden_layers = NUM_HIDDEN_LAYERS,
                type_vocab_size = TYPE_VOCAB_SIZE,
                output_hidden_states=True
            )
            self.model = BertModel(config=config)
        else:
            self.model = BertModel.from_pretrained(model_path, output_hidden_states=True)
        
        self.cosine = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

        # Criterion
        # self.loss = torch.nn.MSELoss()
        self.loss = torch.nn.CosineEmbeddingLoss()
        
        # metrics
        self.train_auc = torchmetrics.AUROC()
        self.val_auc = torchmetrics.AUROC()
        # metrics = MetricCollection([Accuracy(), Precision(), Recall()])
        # self.train_metrics = metrics.clone(prefix='train_')
        # self.valid_metrics = metrics.clone(prefix='val_')
        
    def forward(self, pairs_asm_input):
        
        first = pairs_asm_input["first"]
        second = pairs_asm_input["second"]
        labels = pairs_asm_input["label"]
        
        # take hidden state of the CLS token in the last layer as embedding

        #first_embeddings = self.model(**first).hidden_states[-1][:,0]
        #second_embeddings = self.model(**second).hidden_states[-1][:,0]
        
        first_output = self.model(**first)
        second_output = self.model(**second)
        
        first_hidden_states = first_output.hidden_states[-1]
        second_hidden_states = second_output.hidden_states[-1]

        first_masks = first['attention_mask']
        second_masks = second['attention_mask']

        first_partial_mul = first_hidden_states * first_masks.unsqueeze(-1)
        second_partial_mul = second_hidden_states * second_masks.unsqueeze(-1)
        
        first_partial_sum = torch.sum(first_partial_mul, dim=1)
        second_partial_sum = torch.sum(second_partial_mul, dim=1)
        
        first_n = torch.sum(first_masks, dim=1)
        second_n = torch.sum(second_masks, dim=1)
        
        first_embeddings = first_partial_sum / first_n.unsqueeze(-1)
        second_embeddings = second_partial_sum / second_n.unsqueeze(-1)

        cosines = self.cosine(first_embeddings, second_embeddings)
        
        result = dict()
        result['prediction'] = cosines
        result['labels'] = labels
        
        result['first_embedding'] = first_embeddings
        result['second_embedding'] = second_embeddings
        
        return result

    def training_step(self, batch, batch_idx):
        forward_output = self.forward(batch)
        
        prediction = forward_output["prediction"]
        labels = forward_output["labels"]
        
        # loss = self.loss(prediction, labels.float())
        loss = self.loss(forward_output['first_embedding'], forward_output['second_embedding'], labels)
        
        l2 = labels.clone()
        l2[l2==-1]=0
        # self.train_auc.update(prediction, l2)
        m = self.train_auc(prediction, l2)
         
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=self.batch_size)
        self.log('train_auc', m, on_step=False, on_epoch=True, prog_bar=True, logger=True, batch_size=self.batch_size)
        
        return {"loss":loss,
                "train_auc":m}

    def validation_step(self, batch, batch_idx):
        forward_output = self.forward(batch)
        
        prediction = forward_output["prediction"]
        labels = forward_output["labels"]
        
        # loss = self.loss(prediction, labels.float())
        loss = self.loss(forward_output['first_embedding'], forward_output['second_embedding'], labels)
        
        l2 = labels.clone()
        l2[l2==-1]=0
        # self.val_auc.update(prediction, l2)
        m = self.val_auc(prediction, l2)
    
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=self.batch_size)
        self.log('val_auc', m, on_step=False, on_epoch=True, prog_bar=True, logger=True, batch_size=self.batch_size)
        
        return {"loss":loss,
                "val_auc":m}

    def test_step(self, batch, batch_idx):
        forward_output = self.forward(batch)
        
        prediction = forward_output["prediction"]
        labels = forward_output["labels"]
        
        # loss = self.loss(prediction, labels)
        
        loss = self.loss(forward_output['first_embedding'], forward_output['second_embedding'], labels)
        # metrics = self.train_metrics(prediction, labels)
        metrics = dict()
        
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=self. batch_size)
        # self.log("train_Accuracy", metrics["train_Accuracy"], on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=batch_size)
        metrics["loss"] = loss
        
        return metrics

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE)
        return optimizer

In [9]:
def get_trainer(ckpt_dir):

    early_stopping = pl.callbacks.EarlyStopping(
        # nitor='val_loss',
        monitor='val_auc',
        patience=3,
        verbose=True,
        mode='max', # wheter we want to maximize (max) or minimize the "monitor" value.
    )

    check_point_callback = pl.callbacks.ModelCheckpoint(
        # nitor='val_loss',
        monitor='val_auc',
        verbose=True,
        save_top_k=1,
        mode='max', # wheter we want to maximize (max) or minimize the "monitor" value.
        dirpath=ckpt_dir,
        filename='{epoch}-{val_auc:.4f}',
        # save_weights_only = True
    )


    # the PyTorch Lightning Trainer
    trainer = pl.Trainer(
        max_epochs=NUM_TRAIN_EPOCHS,
        gpus=1,
        progress_bar_refresh_rate=5,
        # callbacks=[early_stopping, check_point_callback]
        callbacks=[check_point_callback]
    )

    return trainer

In [10]:
tokenizer = load_tokenizer(tokenizer_path)
data_module = AsmDataModule(train_path, val_path, None, BATCH_SIZE, tokenizer)

In [11]:
print(prt_model)
simaese_model = SiameseFinenuting(prt_model, BATCH_SIZE, vocab=tokenizer.vocab if from_scratch else None)

/home/jovyan/work/olivetree/final_for_paper/models/next_sentence_prediction_bert_normal_mask30/checkpoint-67246


Some weights of the model checkpoint at /home/jovyan/work/olivetree/final_for_paper/models/next_sentence_prediction_bert_normal_mask30/checkpoint-67246 were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
trainer = get_trainer(ckpt_dir=output_model_path)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [13]:
trainer.fit(model=simaese_model, datamodule=data_module)

100%|██████████| 18804/18804 [00:27<00:00, 674.99it/s]
100%|██████████| 1674/1674 [00:02<00:00, 709.14it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name      | Type                | Params
--------------------------------------------------
0 | model     | BertModel           | 92.0 M
1 | cosine    | CosineSimilarity    | 0     
2 | loss      | CosineEmbeddingLoss | 0     
3 | train_auc | AUROC               | 0     
4 | val_auc   | AUROC               | 0     
--------------------------------------------------
92.0 M    Trainable params
0         Non-trainable params
92.0 M    Total params
368.176   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 1175: val_auc reached 0.97734 (best 0.97734), saving model to "/home/jovyan/work/olivetree/final_for_paper/tests/similarity/functions/fine_tuned_models/olivetree/nsp_normal_mask30_avg_cos_emb_loss/epoch=0-val_auc=0.9773.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 2351: val_auc reached 0.98136 (best 0.98136), saving model to "/home/jovyan/work/olivetree/final_for_paper/tests/similarity/functions/fine_tuned_models/olivetree/nsp_normal_mask30_avg_cos_emb_loss/epoch=1-val_auc=0.9814.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 3527: val_auc reached 0.98239 (best 0.98239), saving model to "/home/jovyan/work/olivetree/final_for_paper/tests/similarity/functions/fine_tuned_models/olivetree/nsp_normal_mask30_avg_cos_emb_loss/epoch=2-val_auc=0.9824.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 4703: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 5879: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 7055: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 8231: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 9407: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 10583: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 11759: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 12935: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 14111: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 12, global step 15287: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 13, global step 16463: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 14, global step 17639: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 15, global step 18815: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 16, global step 19991: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 17, global step 21167: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 18, global step 22343: val_auc was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 19, global step 23519: val_auc was not in top 1


In [14]:
##To convert Model Output
config = BertConfig(
            vocab_size = len(tokenizer.vocab),
            max_position_embeddings = MAX_POSITION_EMBEDDINGS,
            hidden_size = HIDDEN_SIZE,
            intermediate_size = INTERMEDIATE_SIZE,
            num_attention_heads = NUM_ATTENTION_HEADS,
            num_hidden_layers = NUM_HIDDEN_LAYERS,
            type_vocab_size = TYPE_VOCAB_SIZE)
model = BertModel(config=config)

In [15]:
model_torch = torch.load(os.path.join(output_model_path, "epoch=2-val_auc=0.9824.ckpt"))

In [16]:
new_dict = dict()
for k in model_torch["state_dict"]:
    new_dict[k.replace("model.","")] = model_torch["state_dict"][k]
del model_torch

In [17]:
model.load_state_dict(new_dict)

<All keys matched successfully>

In [18]:
model.save_pretrained(os.path.join(output_model_path, "epoch-2"))