In [1]:
import json
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import random
import os

# preprocessing functions for both datasets

This function calculates the features. 

In [2]:
feature_length=15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained("./model")
model = RobertaModel.from_pretrained("./model")
model.to(device)
def calculateFeatures(line, previousLines):
    code_tokens = tokenizer.tokenize(line)[:feature_length]
    source_tokens = [tokenizer.cls_token]
    for pLine in previousLines:
        source_tokens+=tokenizer.tokenize(pLine)[:feature_length]+[tokenizer.sep_token]
    source_tokens+=code_tokens + [tokenizer.sep_token]
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    context_embeddings=model(torch.tensor(source_ids)[None,:])[0]
    return context_embeddings.sum(dim=1)[0].detach().numpy()

This takes a function with metadata as it is extracted from either the big vul dataset or the ILm vul dataset (the columns code and flaw_line_no are used) and outputs it as a dataframe with the columns 'originalIndex', 'line', 'vulnerable'. Empty lines and brackets only lines are removed. 

In [3]:
def functionToDF(data):
    return pd.DataFrame.from_records([(ind,line.strip(),line in data['flaw_line_no']) for ind, line in enumerate(data['code'].split('\n')) if line.strip() not in ['','{','}','};'] ], columns=['originalIndex', 'line', 'vulnerable'])

This function prepares the original dataset with entire methods stored in one line into a dataframe with individual lines in of code per line. Additionally, it is possible to define the size of the context used as the number of previous lines. These are stored in a list in reverse order. 

In [4]:
previousLines=2
def dataSetToDataFrame(originalDataset):
    result=pd.DataFrame()
    for index,data in originalDataset.iterrows():
        newLine=functionToDF(data)
        for i in range(1,previousLines+1):
            newLine['prevousLine'+str(i)] = newLine['line'].shift(periods=i)
        newLine['prevous']=newLine[['prevousLine'+str(i) for i in range(1,previousLines+1)]].apply(lambda row: list(row.values.astype(str)), axis=1)
        newLine['features']=newLine.apply(lambda row:calculateFeatures(row['line'],row['prevous']),axis=1)
        result = pd.concat([result,newLine.drop(columns=['prevousLine'+str(i) for i in range(1,previousLines+1)]) ])
    return result


# Big-Vul dataset

In [5]:
df_bigVul=json.load(open("Big-Vul-dataset/data.json"))
df_bigVul=random.sample(df_bigVul, 10)#reduce dataset size, for testing only!
df_bigVul=pd.DataFrame(df_bigVul)
df_bigVul.drop(columns=['vul','bigvul_id'])#remove columns that are not needed

Unnamed: 0,code,flaw_line_no
0,exsltCryptoCryptoApiRc4Encrypt (xmlXPathParser...,[]
1,static int __init lz4_mod_init(void)\n{\n ret...,[]
2,void PrintRenderFrameHelper::PrintPageInternal...,[]
3,static int do_recv_XMotionEvent(rpc_message_t ...,[]
4,static void free_vpid(struct vcpu_vmx *vmx)\n{...,[]
5,scoped_refptr<VertexAttribManager> CreateVer...,[]
6,"static void codeApplyAffinity(Parse *pParse, i...",[]
7,static void index_entry_adjust_namemask(\n ...,[]
8,MagickExport Image *ColorMatrixImage(const Ima...,[]
9,int SSLClientSocketOpenSSL::Connect(const Comp...,[]


# Ilm-vul dataset

This code adds the code of the original method. The dataset has additional files with partial transformations. Maybe, they are more useful for us, feel free to modify this.

In [6]:
df_Ilm=pd.DataFrame(os.listdir("llm-vul-main\llm-vul-main\VJBench-trans"),columns=['project'])
df_Ilm['code']=df_Ilm.apply(lambda x: open(os.path.join("llm-vul-main\llm-vul-main\VJBench-trans",x['project']+"\\"+x['project']+"_original_method.java"),'r').read(), axis=1)

This adds the location of the bug in the original method. It is a list with 2 elements (start and end line); usually the same.

In [7]:
df_Ilm['location_original_method']=df_Ilm.apply(lambda x:json.load(open(os.path.join("llm-vul-main\llm-vul-main\VJBench-trans",x['project']+"\\"+"buggyline_location.json")))['original'],axis=1)

In [8]:
df_Ilm['flaw_line_no']=df_Ilm.apply(lambda row:list(range(row['location_original_method'][0][0],row['location_original_method'][0][1]+1)),axis=1)  #convert the beginning and end location to a list containing all vulnerable lines. Assumption: there is only one vulnerable location.
df_Ilm=df_Ilm.drop(columns=['location_original_method','project' ])#delete the column with start and end of the vulnerable location as this is no longer needed as well as the project column
df_Ilm=df_Ilm.sample( 10)#reduce dataset size, for testing only!
df_Ilm

Unnamed: 0,code,flaw_line_no
36,private void writeSession(SessionInformations ...,[31]
34,"protected XmlFactory(ObjectCodec oc, int xpFea...",[12]
25,@Converter\npublic SAXSource toSAXSourceFromSt...,[24]
46,"private int readStored(final byte[] buffer, fi...",[16]
38,public Calendar ceil(Calendar cal) {\n Cale...,[21]
15,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...",[49]
11,protected void internalGetMessageById(AsyncRes...,[11]
4,"@Exported(inline=true)\npublic Map<String,Obje...","[4, 5]"
31,"public static void writeEntry(ZipFile zipFile,...",[5]
40,"void read(Tokeniser t, CharacterReader r) {\n ...",[27]


# data preprocessing for both datasets

combine both datasets into one as they should have the same structure now. This dataset now contains rows with the entire functions and the line location of the vulnerable line(s). Please note that the data format for flaw_line_no is slightly different.

In [9]:
df_complete=pd.concat([df_bigVul,df_Ilm])

Split the dataset into training and testing based on functions, so that all lines from a function are either entirely test or training

In [10]:
train, test = train_test_split(df_complete, test_size=0.2, random_state=42)

In [11]:
train=dataSetToDataFrame(train)
test=dataSetToDataFrame(test)

# classification

In [12]:
rf = RandomForestClassifier(max_depth=50, n_estimators=15, max_features=5, random_state=42)
rf.fit(list(train['features']),list(train['vulnerable']))
rf.score(list(test['features']),list(test['vulnerable'])) 

1.0