## Function Vul Prediction
Evaluate how well our model can predict whether or not a function contains a vulnerable line.

In [1]:
import pandas as pd
import numpy as np
import math
import torch
from tqdm import tqdm
import itertools
from line_encoders import EncoderCountVectorizer, EncoderTFIDFVectorizer, EncoderBERTVectorConcat, EncoderBERTStringConcat

In [2]:
# DO NOT forget 'keep_default_na=False' --> otherwise some NaN values in read data
df = pd.read_csv('./big-vul_dataset/validation_split/train_line_20p_balanced.csv', skipinitialspace=True, low_memory=True, keep_default_na=False)

# DO NOT forget 'keep_default_na=False' --> otherwise some NaN values in read data
df_functions = pd.read_csv('./big-vul_dataset/validation_split/train_functions.csv', usecols=['func_id', 'target'], skipinitialspace=True, low_memory=True, keep_default_na=False)
df = pd.merge(df, df_functions, on='func_id', how='inner')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,func_id,line,vul,prev_line,next_line,target
0,0,185664,"static_cast<int>(url.spec().length()),",1,"return String::format(""\n<!-- saved from url=(...",url.spec().data());,1
1,1,180191,"send_sig(SIGKILL, current, 0);",1,WARN_ON(1);,return -1;,1
2,1216,180191,return -1;,1,"send_sig(SIGKILL, current, 0);",},1
3,1220,180191,int ret;,1,"unsigned int i, has_cntl = 0, too_long = 0;",/* strnlen_user includes the null we don't wan...,1
4,1818,180191,* any.,1,"* for strings that are too long, we should not...",*,1


In [4]:
### Use a standard vectorization approach by creating a feature matrix.
### Loses semantic and structural information, but it is FAST (>> anything BERT embedding related)
### Pre-configured to use 750 features 
encoder = EncoderCountVectorizer(df_data=df, vocabulary_path="vocab/vec_vocabulary.pkl")
#encoder = EncoderTFIDFVectorizer(df_data=df)
#encoder = EncoderCountVectorizer(df_data=df)

### Most precise since the embeddings of all 3 features are preserved (i.e. generates a 2304-element vector)
### On average 2x slower than EncoderBERTStringConcat
### Set avg_embeddings=False to take the first embedding vector, or True to average all embedding vectors

#encoder = EncoderBERTVectorConcat(avg_embeddings=False)

### Less precise since all 3 features are concatenated before embedding creation (i.e. generates a 768-element vector)
### On average 2x faster than EncoderBERTVectorConcat
### Set avg_embeddings=False to take the first embedding vector, or True to average all embedding vectors

#encoder = EncoderBERTStringConcat()



In [5]:
from sklearn.model_selection import train_test_split

# split into training and validation sets
line_tr, line_test, prev_line_tr, prev_line_test, next_line_tr, next_line_test, func_vul_tr, func_vul_test, y_tr, y_test = \
    train_test_split(df['line'], df['prev_line'], df['next_line'], df['target'], df['vul'], test_size=0.2, random_state=42)

In [6]:
# define how big a batch of entries should be (depending on RAM)
batch_size = 100

# number of epochs is calulated based on the batch_size
epochs = math.ceil(len(line_tr)/batch_size)

# split the dataframes (X_tr, y_tr) into an array of dataframes (number of epochs)
batchesLine = np.array_split(line_tr, epochs)
batchesPrevLine = np.array_split(prev_line_tr, epochs)
batchesNextLine = np.array_split(next_line_tr, epochs)
batchesFuncVul = np.array_split(func_vul_tr, epochs)
batchesY = np.array_split(y_tr, epochs)

## ML Model

- Using train_test_split for creating subsets of data
- Training MLPClassifier with training data
- Evaluating model with test data
- (Include RandomForestClassifier as one alternative option for model)


In [7]:
# import and initialisation of generic MLPClassifier
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(activation='relu', alpha=0.05, hidden_layer_sizes=(1500,750), learning_rate='adaptive',solver='adam', shuffle=True)

# iterate over the number of epochs
for i in tqdm(range(epochs)):
    # take a batch and process it and partial_fit the model to the batch
    line_batch, prev_line_batch, next_line_batch, func_vul, Y_batch = \
        batchesLine[i], batchesPrevLine[i], batchesNextLine[i], batchesFuncVul[i], batchesY[i]
    
    # encode to vector 
    encodedBatch = encoder.encode(line_batch.tolist(), prev_line_batch.tolist(), \
                                  next_line_batch.tolist(), func_vul.tolist())
    
    clf.partial_fit(encodedBatch, Y_batch, classes=np.unique(y_tr))

100%|████████████████████| 109/109 [01:27<00:00,  1.24it/s]


In [8]:
X_test_encoded = encoder.encode(line_test.tolist(), prev_line_test.tolist(), \
                                next_line_test.tolist(), func_vul_test.tolist())

print("Accuracy of prediction: " , clf.score(X_test_encoded, y_test))
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

Accuracy of prediction:  0.9204712812960235
              precision    recall  f1-score   support

           0       0.95      0.89      0.92      1357
           1       0.90      0.95      0.92      1359

    accuracy                           0.92      2716
   macro avg       0.92      0.92      0.92      2716
weighted avg       0.92      0.92      0.92      2716



## Model persistance
Selectively execute when needed!

In [9]:
from joblib import dump
# Store model
dump(clf, 'models/validation_split/mlp_Count_withFuncVul.model')

['models/validation_split/mlp_Count_withFuncVul.model']

In [None]:
from joblib import load
# Load model
clf = load('models/mlp_Count_withFuncVul.model') 

## Validation section

Evaluate on validation set.
Group results by function and see if function is predicted correctly.

In [19]:
# Load data for validation of the whole process

# DO NOT forget 'keep_default_na=False' --> otherwise some NaN values in read data
val_df = pd.read_csv('./big-vul_dataset/validation_split/validation_line.csv', skipinitialspace=True, low_memory=True, keep_default_na=False)
df_functions = pd.read_csv('./big-vul_dataset/validation_split/validation_functions.csv', skipinitialspace=True, low_memory=True, keep_default_na=False)
val_df = pd.merge(val_df, df_functions, on='func_id', how='inner')
val_df = val_df.reset_index()
val_df = val_df.sample(frac=0.2)

In [20]:
from joblib import load
# Load function model
clf_func = load('models/validation_split/full_function.model')

In [23]:
from sklearn.metrics import classification_report

# DO NOT forget 'keep_default_na=False' --> otherwise some NaN values in read data
val_df = pd.read_csv('./big-vul_dataset/validation_split/validation_line.csv', 
                     skipinitialspace=True, low_memory=True, keep_default_na=False)

df_functions = pd.read_csv('./big-vul_dataset/validation_split/validation_functions.csv', skipinitialspace=True, low_memory=True, keep_default_na=False)
val_df = pd.merge(val_df, df_functions, on='func_id', how='inner')
val_df = val_df.reset_index()

# sample n random, but complete functions (i.e. all lines of that function)
sampled_func_ids = val_df['func_id'].sample(1000)
val_df = val_df.drop(val_df[~val_df.func_id.isin(sampled_func_ids)].index)

# predict all function vulnerabilities and join to df
from function_encoders import FuncEncoderCountVectorizer
encoder_func = FuncEncoderCountVectorizer(vocabulary_path="vocab/func_vocab.pkl")

df_functions_sampled = df_functions.drop(df_functions[~df_functions.func_id.isin(sampled_func_ids)].index)
func_vul_pred = clf_func.predict(encoder_func.encode(df_functions_sampled['processed_func'].tolist()))
df_functions_sampled['func_vul_pred'] = func_vul_pred

val_df = pd.merge(val_df, df_functions_sampled[['func_id', 'func_vul_pred']], on='func_id', how='inner')
val_df = val_df.reset_index()


line_val, prev_line_val, next_line_val, func_vul_val, y_val = val_df['line'], val_df['prev_line'], val_df['next_line'], val_df['func_vul_pred'], val_df['vul']

# define how big a batch of entries should be (depending on RAM)
batch_size = 100
# number of epochs is calulated based on the batch_size
epochs = math.ceil(len(line_val)/batch_size)

# split the dataframes (X_tr, y_tr) into an array of dataframes (number of epochs)
batchesLine = np.array_split(line_val, epochs)
batchesPrevLine = np.array_split(prev_line_val, epochs)
batchesNextLine = np.array_split(next_line_val, epochs)
batchesFuncVul = np.array_split(func_vul_val, epochs)
batchesY = np.array_split(y_val, epochs)

y_val_pred = []
# iterate over the number of epochs
for i in tqdm(range(epochs)):
    # take a batch and process it and partial_fit the model to the batch
    line_batch, prev_line_batch, next_line_batch, func_vul_batch, Y_batch = batchesLine[i], batchesPrevLine[i], batchesNextLine[i], batchesFuncVul[i], batchesY[i]
    
    # encode to vector 
    X_val_encoded = encoder.encode(line_batch.tolist(), prev_line_batch.tolist(), next_line_batch.tolist(), func_vul_batch.tolist())
    y_val_pred = y_val_pred + clf.predict(X_val_encoded).tolist()

print(classification_report(y_val, y_val_pred))

100%|██████████████████| 1004/1004 [02:04<00:00,  8.07it/s]


              precision    recall  f1-score   support

           0       1.00      0.80      0.89     99993
           1       0.01      0.50      0.02       403

    accuracy                           0.80    100396
   macro avg       0.50      0.65      0.45    100396
weighted avg       0.99      0.80      0.88    100396



In [24]:
val_df['line_pred'] = y_val_pred

func_vul_pred = []
func_vul_act = []

for name, group in val_df.groupby('func_id'):
    func_vul_pred.append(group['line_pred'].max())
    func_vul_act.append(group['target'].max())
    
print(classification_report(func_vul_act, func_vul_pred))

              precision    recall  f1-score   support

           0       0.97      0.76      0.85       771
           1       0.32      0.84      0.46       106

    accuracy                           0.77       877
   macro avg       0.65      0.80      0.66       877
weighted avg       0.89      0.77      0.80       877

