## ML experimentation
#### Current approach:
- Selection of 3 different encoders (TokenVectorizer, 2x BERT embeddings)
- All encoders use CodeBERT as a tokenizer
- line, prev_line & next_line are considered
- MLPClassifier, not optimized yet


#### Open issues:
- Try different ML models
- Include joern-features (maybe, but unlikely due to resource constraints)
- Hyperparameter tuning
- ...

_References_:
- https://www.analyticsvidhya.com/blog/2019/09/demystifying-bert-groundbreaking-nlp-framework/
- https://github.com/microsoft/CodeBERT

In [1]:
import pandas as pd
import numpy as np
import math
import torch
from tqdm import tqdm
import itertools
from line_encoders import EncoderCountVectorizer, EncoderTFIDFVectorizer, EncoderBERTVectorConcat, EncoderBERTStringConcat

In [2]:
# Read the training sample (balanced 20% line sample, going by the file name).
# keep_default_na=False is mandatory here --> otherwise some values end up as NaN in the read data
df = pd.read_csv(
    './big-vul_dataset/line_sample_20p_balanced_ratio.csv',
    skipinitialspace=True,
    low_memory=True,
    keep_default_na=False,
)

In [3]:
# Preview the first rows to sanity-check the parsed columns
df.head()

Unnamed: 0.1,Unnamed: 0,func_id,line,vul,prev_line,next_line
0,0,178766,if (chg < 0),1,},return chg;
1,1,185074,current_surface_in_use_by_compositor_));,1,(pending_thumbnail_tasks_ > 0 ||,if (current_surface_is_protected_ == surface_i...
2,2,188514,"unsigned int duration,",1,"vpx_codec_pts_t pts,","vpx_enc_frame_flags_t flags,"
3,3,186756,if (!process_),1,Response StorageHandler::UntrackIndexedDBForOr...,return Response::InternalError();
4,4,179932,if (newIndex < 256),1,},newName[newIndex++] = curr;


## Data preparation section

- Selection of 3 different data encoders
- Create train and test dataset
- Split into batches

### Please select _encoder_ below!
- `encoder = EncoderCountVectorizer(df_data=df, vocabulary_path="vec_vocabulary.pkl")`
- `encoder = EncoderBERTVectorConcat()`
- `encoder = EncoderBERTStringConcat()`

In [4]:
### Encoder selection: keep exactly ONE `encoder = ...` line uncommented.

### Option 1: standard vectorization approach that creates a feature matrix.
### Loses semantic and structural information, but it is FAST (>> anything BERT embedding related)
### Pre-configured to use 750 features
### NOTE(review): the encoder receives the full `df`, i.e. the data BEFORE the train/test split.
### If it fits its vocabulary / weights on df_data, test rows leak into the features -- TODO confirm in line_encoders
#encoder = EncoderCountVectorizer(df_data=df, vocabulary_path="vec_vocabulary.pkl")
encoder = EncoderTFIDFVectorizer(df_data=df)
#encoder = EncoderCountVectorizer(df_data=df)

### Option 2: most precise since the embeddings of all 3 features are preserved (i.e. generates a 2304-element vector)
### On average 2x slower than EncoderBERTStringConcat
### Set avg_embeddings=False to take the first embedding vector, or True to average all embedding vectors

# encoder = EncoderBERTVectorConcat(avg_embeddings=False)

### Option 3: less precise since all 3 features are concatenated before embedding creation (i.e. generates a 768-element vector)
### On average 2x faster than EncoderBERTVectorConcat
### Set avg_embeddings=False to take the first embedding vector, or True to average all embedding vectors

# encoder = EncoderBERTStringConcat()

In [5]:
# Smoke test of the selected encoder on a single hand-written sample.
# Argument order is (line, prev_line, next_line), the same order the training cells use.
test = encoder.encode(
    ["int x = 5"],
    ["function(){"],
    ["}"],
)
test

[array([-2.02038868e-01,  1.02507968e-01,  1.29144633e-01,  1.96664541e-01,
         1.03299924e-01, -6.88584115e-02,  5.60134805e-02,  7.16314624e-02,
         2.39443908e-01,  5.35633195e-02,  1.65110024e-01,  4.49429303e-01,
        -1.53622427e-01, -1.15898972e-01,  4.64199726e-01, -8.41948475e-02,
         2.16541046e-01,  1.72740182e-01,  1.41299536e-01,  1.36996885e-01,
        -1.27642305e-01, -2.17121951e-01,  3.37922602e-01, -1.23560483e-01,
         3.88066750e-01,  1.92392749e-01,  3.84311536e-01,  6.19902150e-01,
        -1.07817858e-01,  7.95015557e-01,  5.51156056e-03, -5.89093254e-02,
         1.79620206e+00, -2.38495642e-01,  8.05727284e-02, -1.13388557e-01,
        -1.09933202e-02,  2.10654281e-01,  1.92522894e-02, -8.93912757e-02,
        -3.80665236e-01,  4.33045082e-01, -1.19474924e+00,  9.57832431e-02,
         5.10663135e-01,  3.54345424e-02,  4.34898917e-02,  1.45371649e-01,
        -1.06443661e-02,  1.84740761e-01,  1.36541328e-01,  3.05287991e-01,
        -6.4

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. All four columns are split in one call so
# that the text features and the label of a row stay aligned; the seed is fixed
# so the split is the same on every run.
(
    line_tr, line_test,
    prev_line_tr, prev_line_test,
    next_line_tr, next_line_test,
    y_tr, y_test,
) = train_test_split(
    df['line'],
    df['prev_line'],
    df['next_line'],
    df['vul'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Mini-batch size: how many entries are encoded and fed to the model at once
# (choose it according to the available RAM)
batch_size = 100

# Number of mini-batches needed to cover the training split once.
# NOTE: despite its name, `epochs` counts batches, not passes over the data.
epochs = math.ceil(len(line_tr) / batch_size)

# Cut every training Series into `epochs` chunks; all four share the same length,
# so chunk i of each list refers to the same rows.
batchesLine, batchesPrevLine, batchesNextLine, batchesY = (
    np.array_split(column, epochs)
    for column in (line_tr, prev_line_tr, next_line_tr, y_tr)
)

## ML Model

- Using train_test_split for creating subsets of data
- Training MLPClassifier with training data
- Evaluating model with test data
- (Include RandomForestClassifier as one alternative option for model)


In [9]:
# MLPClassifier trained incrementally, one mini-batch at a time
from sklearn.neural_network import MLPClassifier

# random_state fixes the weight initialisation (and any internal sampling) so that a
# re-run reproduces the same model; 42 matches the seed of the train/test split.
# NOTE: learning_rate='adaptive' is only honoured by solver='sgd'; it is kept for parity
# with the previous configuration but has no effect with 'adam'.
clf = MLPClassifier(
    activation='relu',
    alpha=0.05,
    hidden_layer_sizes=(1500, 750),
    learning_rate='adaptive',
    solver='adam',
    shuffle=True,
    random_state=42,
)

# partial_fit has to be told the complete label set (mandatory on the first call).
# It does not change between batches, so it is computed once.
all_classes = np.unique(y_tr)

# `epochs` is the number of mini-batches, i.e. this loop is ONE pass over the training data
for i in tqdm(range(epochs)):
    # take the i-th slice of every training column
    line_batch, prev_line_batch, next_line_batch, Y_batch = batchesLine[i], batchesPrevLine[i], batchesNextLine[i], batchesY[i]

    # encode to vector
    encodedBatch = encoder.encode(line_batch.tolist(), prev_line_batch.tolist(), next_line_batch.tolist())

    # incremental weight update on this batch only
    clf.partial_fit(encodedBatch, Y_batch, classes=all_classes)

100%|███████████████████████████████████████████████| 120/120 [21:37<00:00, 10.81s/it]


In [None]:
# RandomForestClassifier (Currently WIP)
# Does not provide a partial_fit() method. Workaround: with warm_start=True a further
# fit() keeps the trees that already exist and only grows the newly requested ones,
# so the tree count is increased after every batch.
from sklearn.ensemble import RandomForestClassifier

TREE_INCREASE_EACH_EPOCH = 10

# import and initialisation of the RandomForestClassifier
# random_state makes the grown trees reproducible (same seed as the train/test split).
# NOTE(review): the first batch grows 100 trees, every later batch only
# TREE_INCREASE_EACH_EPOCH, so the first batch is over-represented -- intended?
clf = RandomForestClassifier(n_estimators=100, warm_start=True, random_state=42)

# `epochs` is the number of mini-batches prepared in the batching cell
for i in tqdm(range(epochs)):
    # take the i-th slice of every training column
    line_batch, prev_line_batch, next_line_batch, Y_batch = batchesLine[i], batchesPrevLine[i], batchesNextLine[i], batchesY[i]

    # encode to vector
    encodedBatch = encoder.encode(line_batch.tolist(), prev_line_batch.tolist(), next_line_batch.tolist())

    # grows only the trees missing up to n_estimators, trained on this batch
    clf.fit(encodedBatch, Y_batch)
    # request additional trees for the next batch
    clf.set_params(n_estimators=len(clf.estimators_) + TREE_INCREASE_EACH_EPOCH)

In [None]:
# Naive Bayes approach
# NOTE(review): MultinomialNB rejects negative feature values, so this cell is presumably
# only usable with the count / TF-IDF encoders and not with the BERT embeddings -- TODO confirm

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

# one iteration per prepared mini-batch (`epochs` holds the batch count)
for i in tqdm(range(epochs)):
    # pick the i-th slice of every training column
    line_batch = batchesLine[i]
    prev_line_batch = batchesPrevLine[i]
    next_line_batch = batchesNextLine[i]
    Y_batch = batchesY[i]

    # turn the three text columns of this batch into feature vectors
    encodedBatch = encoder.encode(
        line_batch.tolist(),
        prev_line_batch.tolist(),
        next_line_batch.tolist(),
    )

    # incremental update; the full label set is passed along with every call
    clf.partial_fit(encodedBatch, Y_batch, classes=np.unique(y_tr))



In [None]:
# Linear model trained with stochastic gradient descent, one mini-batch at a time
from sklearn.linear_model import SGDClassifier

# random_state fixes the data shuffling so that a re-run reproduces the same model
# (same seed as the train/test split)
clf = SGDClassifier(random_state=42)

# partial_fit has to be told the complete label set (mandatory on the first call).
# It does not change between batches, so it is computed once.
all_classes = np.unique(y_tr)

# `epochs` is the number of mini-batches, i.e. this loop is ONE pass over the training data
for i in tqdm(range(epochs)):
    # take the i-th slice of every training column
    line_batch, prev_line_batch, next_line_batch, Y_batch = batchesLine[i], batchesPrevLine[i], batchesNextLine[i], batchesY[i]

    # encode to vector
    encodedBatch = encoder.encode(line_batch.tolist(), prev_line_batch.tolist(), next_line_batch.tolist())

    # incremental update on this batch only
    clf.partial_fit(encodedBatch, Y_batch, classes=all_classes)



In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Encode the held-out test split with the same encoder that was used for training
X_test_encoded = encoder.encode(line_test.tolist(), prev_line_test.tolist(), next_line_test.tolist())

# Run inference once and derive every metric from the same predictions
# (clf.score() would predict a second time internally).
y_pred = clf.predict(X_test_encoded)

print("Accuracy of prediction: " , accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy of prediction:  0.6332110703567856
              precision    recall  f1-score   support

           0       0.62      0.71      0.66      1523
           1       0.65      0.55      0.60      1476

    accuracy                           0.63      2999
   macro avg       0.64      0.63      0.63      2999
weighted avg       0.64      0.63      0.63      2999



## Model persistence
Selectively execute when needed!

In [None]:
import os
from joblib import dump

# Target file; the name encodes model type and encoder -- adapt it when either changes
MODEL_PATH = 'models/mlp_TFIDFVectorizer.model'

# dump() does not create missing directories, so make sure the folder exists first
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Store model
dump(clf, MODEL_PATH)

In [None]:
from joblib import load

# Load model
# The path is the one the "Store model" cell writes to, so store -> load round-trips.
# SECURITY: load() unpickles the file and can execute arbitrary code -- only load model
# files you created yourself or otherwise trust.
clf = load('models/mlp_TFIDFVectorizer.model')

## Hyperparameter tuning

<span style="color:red">Currently not _really_ supported due to missing batch support. <br>i.e. only works for small data samples and _EncoderCountVectorizer_</span> 

In [None]:
# Imported in this cell as well, so the grid search also runs when one of the alternative
# (non-MLP) training cells was executed instead of the MLP one.
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# random_state keeps the candidate models comparable and reproducible across runs
mlp = MLPClassifier(max_iter=100, random_state=42)
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

# TODO: add batch support to allow for larger data and even alternative encoders

# The whole training split is encoded in one go (no batching), hence the size limitation
encoded_X_tr = encoder.encode(line_tr.tolist(), prev_line_tr.tolist(), next_line_tr.tolist())

# Exhaustive search over parameter_space with 3-fold cross-validation on all cores
gridclf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
gridclf.fit(encoded_X_tr, y_tr)


# Best parameter set
print('Best parameters found:\n', gridclf.best_params_)

# All results: mean CV score and a +/- 2*std band per parameter combination
means = gridclf.cv_results_['mean_test_score']
stds = gridclf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gridclf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Non-batched RandomForest baseline. Like the grid search, it encodes whole splits at
# once and is therefore only practical for small samples / fast encoders.
X_train_encoded = encoder.encode(line_tr.tolist(), prev_line_tr.tolist(), next_line_tr.tolist())
X_test_encoded = encoder.encode(line_test.tolist(), prev_line_test.tolist(), next_line_test.tolist())

rndclf = RandomForestClassifier(max_depth=2, random_state=0)
rndclf.fit(X_train_encoded, y_tr)

# Predict once and report accuracy plus per-class metrics from the same predictions
y_pred = rndclf.predict(X_test_encoded)
print("Accuracy of prediction: " , accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



## Validation section

- Read unbalanced dataset
- Encode using selected encoder
- Evaluate model

In [None]:
# Read the validation sample (1% sample with the original class ratio, going by the file name).
# keep_default_na=False is mandatory here --> otherwise some values end up as NaN in the read data
val_df = pd.read_csv(
    './big-vul_dataset/line_sample_1p_original_ratio.csv',
    skipinitialspace=True,
    low_memory=True,
    keep_default_na=False,
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Feature columns and label of the validation set
line_val, prev_line_val, next_line_val, y_val = val_df['line'], val_df['prev_line'], val_df['next_line'], val_df['vul']

# Encode with the same encoder that was used for training
X_val_encoded = encoder.encode(line_val.tolist(), prev_line_val.tolist(), next_line_val.tolist())

# Run inference once and derive every metric from the same predictions
# (clf.score() would predict a second time internally).
y_val_pred = clf.predict(X_val_encoded)

print("Accuracy of prediction: " , accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

In [None]:
# List every validation line whose prediction differs from its label.
# The loop variable is deliberately not named `input`: at notebook top level that would
# overwrite the built-in input() for the rest of the kernel session.
for line, prediction, label in zip(line_val, y_val_pred, y_val):
    if prediction != label:
        print(line, 'has been classified as ', prediction, 'and should be ', label)