In [5]:
import pandas as pd
import numpy as np
import math
import torch
from tqdm import tqdm
import itertools
from function_encoders import FuncEncoderCountVectorizer, FuncEncoderTFIDFVectorizer

In [6]:
# DO NOT forget 'keep_default_na=False' --> otherwise some NaN values in read data
df = pd.read_csv('./big-vul_dataset/functions_only_all.csv', skipinitialspace=True, low_memory=True, keep_default_na=False)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,func_id,processed_func,target
0,0,0,static bool check_rodc_critical_attribute(stru...,0
1,1,1,static int samldb_add_entry(struct samldb_ctx ...,0
2,2,2,static int samldb_add_entry_callback(struct ld...,0
3,3,3,static int samldb_add_handle_msDS_IntId(struct...,0
4,4,4,static int samldb_add_step(struct samldb_ctx *...,0


In [9]:
encoder = FuncEncoderCountVectorizer(df_data=df, vocabulary_path="vocab/func_vocab.pkl")



In [10]:
from sklearn.model_selection import train_test_split

# split into training and validation sets
func_tr, func_test, y_tr, y_test = \
    train_test_split(df['processed_func'], df['target'], test_size=0.2, random_state=42)

In [11]:
# define how big a batch of entries should be (depending on RAM)
batch_size = 100

# number of epochs is calulated based on the batch_size
epochs = math.ceil(len(func_tr)/batch_size)

# split the dataframes (X_tr, y_tr) into an array of dataframes (number of epochs)
batchesFunc = np.array_split(func_tr, epochs)
batchesY = np.array_split(y_tr, epochs)

In [12]:
# import and initialisation of generic MLPClassifier
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(activation='relu', alpha=0.05, hidden_layer_sizes=(1500,750), learning_rate='adaptive',solver='adam', shuffle=True)

# iterate over the number of epochs
for i in tqdm(range(epochs)):
    # take a batch and process it and partial_fit the model to the batch
    func_batch, Y_batch = batchesFunc[i], batchesY[i]
    
    # encode to vector 
    encodedBatch = encoder.encode(func_batch.tolist())
    
    clf.partial_fit(encodedBatch, Y_batch, classes=np.unique(y_tr))

100%|██████████████████| 1510/1510 [10:45<00:00,  2.34it/s]


In [13]:
X_test_encoded = encoder.encode(func_test.tolist())

print("Accuracy of prediction: " , clf.score(X_test_encoded, y_test))
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

Accuracy of prediction:  0.9728318490245971
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     35508
           1       0.98      0.55      0.71      2220

    accuracy                           0.97     37728
   macro avg       0.97      0.78      0.85     37728
weighted avg       0.97      0.97      0.97     37728



In [15]:
from joblib import dump
# Store model
dump(clf, 'models/full_function.model')

['models/full_function.model']