# Outdated: See MLExperimenting_BERT for current version!
## ML experimentation
#### Current approach:
- Use CodeBERT to tokenize all lines
- Find the most occuring 250 tokens and save them as vocabulary
- Every dataset (training, test, validation) is vectorized based on the vocabulary
- Include matrix of line & prev_line as features
- MLPClassifier, not optimized yet


#### Open issues:
- Create embeddings with CodeBERT (?)
- Include next_line
- Include joern-features
- Hyperparameter tuning
- ...

_References_:
- https://www.analyticsvidhya.com/blog/2019/09/demystifying-bert-groundbreaking-nlp-framework/
- https://github.com/microsoft/CodeBERT

In [1]:
import pandas as pd
import numpy as np
import math
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
# Init CodeBERT
tokenizer_bert = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model_bert = AutoModel.from_pretrained("microsoft/codebert-base")

In [3]:
# DO NOT forget 'keep_default_na=False' --> otherwise some NaN values in read data
df = pd.read_csv('./big-vul_dataset/line_sample_20p_balanced_ratio.csv', skipinitialspace=True, low_memory=True, keep_default_na=False)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,func_id,line,vul,prev_line,next_line
0,0,179390,struct ext4_extent *ex2 = NULL;,1,struct ext4_extent *ex1 = NULL;,struct ext4_extent *ex3 = NULL;
1,1,185223,content::Source<content::WebContents>(contents...,1,"content::NOTIFICATION_WEB_CONTENTS_DISCONNECTED,",}
2,2,182615,case JPC_MS_SOP:,1,break;,case JPC_MS_EPH:
3,3,184460,"int64 bytes_per_sec,",1,void DownloadItemImpl::UpdateProgress(int64 by...,const std::string& hash_state) {
4,4,183701,true)) {,1,"print_preview_context_.node(),",Send(new PrintHostMsg_PrintPreviewInvalidPrint...


## Data preparation section

- Create the vocabulary for the matrix
- All datasets will have the contents of this vocabulary as columns 
- Create the matrix for train and test dataset
- TODO: next_line, other features

In [5]:
# Append all lines to an array for creating a vocabulary
all_functions = []
for i in range( 0, len(df['line'] )):
    all_functions.append(df['line'][i])

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a count vectorizer on the whole dataset and include the most occuring 250 tokens, tokenizer takes CodeBERT as function
newVec = CountVectorizer(tokenizer = tokenizer_bert.tokenize,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features=750, \
                             lowercase=False,\
                             min_df=1
                             ) 
testVector = newVec.fit_transform(all_functions)
testVector

<16378x750 sparse matrix of type '<class 'numpy.int64'>'
	with 121722 stored elements in Compressed Sparse Row format>

In [8]:
# Create another CountVectorizer with the vocabulary from the CV above
# (ensures to have consistent token features for test, training, validation)
vectorizer = CountVectorizer(tokenizer = tokenizer_bert.tokenize,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             vocabulary=newVec.vocabulary_) 



In [9]:
# Learn the vocabulary dictionary and return document-term matrix.
train_data_features = vectorizer.fit_transform(all_functions)

b = pd.DataFrame(train_data_features.toarray(), columns=vectorizer.get_feature_names(), index= df.index)

train_data_features = train_data_features.toarray()

In [10]:
# Append all prev_lines to an array
all_lines_before = []
for i in range( 0, len(df['prev_line'] )):
    all_lines_before.append(df['prev_line'][i])

# same as CountVectorizer for main line
vectorizer = CountVectorizer(tokenizer = tokenizer_bert.tokenize,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             vocabulary=newVec.vocabulary_) 

# Learn the vocabulary dictionary and return document-term matrix.
prev_line_data_features = vectorizer.fit_transform(all_lines_before)

b_prev = pd.DataFrame(prev_line_data_features.toarray(), columns=vectorizer.get_feature_names(), index= df.index)

prev_line_data_features = prev_line_data_features.toarray()


In [11]:
# join vul dataset with main line matrix with prev_line matrix (ensure column uniqueness with rsuffix='_prev')
df_with_word_matrix = df[['vul']].join(b, rsuffix="actual_")
df_with_word_matrix = df_with_word_matrix.join(b_prev, rsuffix="_prev")
df_with_word_matrix.head()

Unnamed: 0,vul,!,"""",""")",""");",""",",""";",#,%,&,...,Ġurl_prev,Ġv_prev,Ġvalue_prev,Ġvoid_prev,Ġw_prev,Ġx_prev,Ġy_prev,Ġ{_prev,Ġ|_prev,Ġ||_prev
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## ML Model

- Using train_test_split for creating subsets of data
- Training MLPClassifier with training data
- Evaluating model with test data
- (Include RandomForestClassifier as one alternative option for model)


In [12]:
from sklearn.model_selection import train_test_split
# Create test and training sets as usual
#X, y = train_data_features, df['target']
X, y = df_with_word_matrix.loc[:, df_with_word_matrix.columns != 'vul'], df_with_word_matrix['vul']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

clf = MLPClassifier(activation='relu', alpha=0.05, hidden_layer_sizes=(100,), learning_rate='constant',solver='adam').fit(X_train, y_train)
clf.score(X_test, y_test)

0.6800976800976801

In [14]:
print("Accuracy of prediction: " , clf.score(X_test, y_test))
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

Accuracy of prediction:  0.6935286935286935
              precision    recall  f1-score   support

           0       0.69      0.72      0.70       825
           1       0.70      0.67      0.68       813

    accuracy                           0.69      1638
   macro avg       0.69      0.69      0.69      1638
weighted avg       0.69      0.69      0.69      1638



## Hyperparameter tuning

In [17]:
mlp = MLPClassifier(max_iter=100)
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}


from sklearn.model_selection import GridSearchCV

gridclf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
gridclf.fit(X_train, y_train)


# Best parameter set
print('Best parameters found:\n', gridclf.best_params_)

# All results
means = gridclf.cv_results_['mean_test_score']
stds = gridclf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gridclf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))



Best parameters found:
 {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}
0.643 (+/-0.013) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.628 (+/-0.007) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
0.639 (+/-0.003) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.622 (+/-0.016) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.641 (+/-0.006) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
0.624 (+/-0.009) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant', 'solver': 'adam'



In [102]:
from sklearn.ensemble import RandomForestClassifier
rndclf = RandomForestClassifier(max_depth=2, random_state=0)
rndclf.fit(X_train, y_train)
rndclf.score(X_test, y_test)

y_pred = rndclf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.61      0.63      0.62       843
           1       0.59      0.58      0.58       795

    accuracy                           0.60      1638
   macro avg       0.60      0.60      0.60      1638
weighted avg       0.60      0.60      0.60      1638



## Validation section

- Read unbalanced dataset
- Create matrix with token as defined in the vocab at the beginning
- Evaluate model
- TODO: include prev_line in this section

In [88]:
# DO NOT forget 'keep_default_na=False' --> otherwise some NaN values in read data
val_df = pd.read_csv('./big-vul_dataset/line_sample_10p_original_ratio.csv', skipinitialspace=True, low_memory=True, keep_default_na=False)

# Append all functions to an array
all_functions_val = []
for i in range( 0, len(val_df['line'] )):
    all_functions_val.append(val_df['line'][i])



vectorizer_val = CountVectorizer(tokenizer = tokenizer_bert.tokenize,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             vocabulary=newVec.vocabulary_) 

val_data_features = vectorizer_val.fit_transform(all_functions_val)

b_val = pd.DataFrame(val_data_features.toarray(), columns=vectorizer_val.get_feature_names(), index= val_df.index)

val_data_features = val_data_features.toarray()

val_df_with_word_matrix = val_df[['vul']].join(b_val, rsuffix="actual")
val_df_with_word_matrix.head()





Unnamed: 0,vul,"""",""");",""",",#,%,&,(,"(""",(&,...,Ġsize,Ġsizeof,Ġthe,Ġto,Ġwe,Ġx,Ġy,Ġ{,Ġ|,Ġ||
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0


In [89]:
X_val, y_val = val_df_with_word_matrix.loc[:, val_df_with_word_matrix.columns != 'vul'], val_df_with_word_matrix['vul']

print("Accuracy of prediction: " , clf.score(X_val, y_val))
y_val_pred = clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

Accuracy of prediction:  0.6584071111744483
              precision    recall  f1-score   support

           0       1.00      0.66      0.79    500599
           1       0.02      0.68      0.04      4634

    accuracy                           0.66    505233
   macro avg       0.51      0.67      0.41    505233
weighted avg       0.99      0.66      0.79    505233

