In [1]:
import json
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import random

This takes a function with metadata as it is stored in the big vul dataset (the columns code and flaw_line_no are used) and outputs it as a dataframe with the columns 'originalIndex', 'line', 'vulnerable'. Empty lines and brackets only lines are removed. 

In [2]:
def functionToDF(data):
    return pd.DataFrame.from_records([(ind,line.strip(),line in data['flaw_line_no']) for ind, line in enumerate(data['code'].split('\n')) if line.strip() not in ['','{','}'] ], columns=['originalIndex', 'line', 'vulnerable'])

This function calculates the features. 

In [3]:
feature_length=15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained("./model")
model = RobertaModel.from_pretrained("./model")
model.to(device)
def calculateFeatures(line, previousLines):
    code_tokens = tokenizer.tokenize(line)[:feature_length]
    source_tokens = [tokenizer.cls_token]
    for pLine in previousLines:
        source_tokens+=tokenizer.tokenize(pLine)[:feature_length]+[tokenizer.sep_token]
    source_tokens+=code_tokens + [tokenizer.sep_token]
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    context_embeddings=model(torch.tensor(source_ids)[None,:])[0]
    return context_embeddings.sum(dim=1)[0].detach().numpy()

Split the dataset into training and testing based on functions, so that all lines from a function are either entirely test or training

In [4]:
originalDataset=json.load(open("Big-Vul-dataset/data.json"))
originalDataset=random.sample(originalDataset, 10)#reduce dataset size, for testing only!
train, test = train_test_split(originalDataset, test_size=0.2, random_state=42)

In [5]:
previousLines=2
def dataSetToDataFrame(originalDataset):
    result=pd.DataFrame()
    for data in originalDataset:
        newLine=functionToDF(data)
        for i in range(1,previousLines+1):
            newLine['prevousLine'+str(i)] = newLine['line'].shift(periods=i)
        newLine['prevous']=newLine[['prevousLine'+str(i) for i in range(1,previousLines+1)]].apply(lambda row: list(row.values.astype(str)), axis=1)
        newLine['features']=newLine.apply(lambda row:calculateFeatures(row['line'],row['prevous']),axis=1)
        result = pd.concat([result,newLine.drop(columns=['prevousLine'+str(i) for i in range(1,previousLines+1)]) ])
    return result


In [6]:
train_pd=dataSetToDataFrame(train)
test_pd=dataSetToDataFrame(test)

In [7]:
rf = RandomForestClassifier(max_depth=50, n_estimators=15, max_features=5, random_state=42)
rf.fit(list(train_pd['features']),list(train_pd['vulnerable']))
rf.score(list(test_pd['features']),list(test_pd['vulnerable'])) 

1.0