In [1]:
import pandas as pd
import numpy as np
import math
import torch
from tqdm import tqdm
import itertools
from function_encoders import FuncEncoderCountVectorizer, FuncEncoderTFIDFVectorizer

In [2]:
# DO NOT forget 'keep_default_na=False' --> otherwise some NaN values in read data
df = pd.read_csv('./big-vul_dataset/validation_split/train_functions.csv', skipinitialspace=True, low_memory=True, keep_default_na=False)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,func_id,processed_func,target
0,0,184401,void PrintPreviewDataSource::StartDataRequest(...,1
1,1,182535,"ipt_do_table(struct sk_buff *skb,\nconst struc...",1
2,2,180996,"rpki_rtr_pdu_print (netdissect_options *ndo, c...",1
3,3,182120,void edge_sparse_csr_reader_double( const char...,1
4,4,181852,static MagickBooleanType WriteTXTImage(const I...,1


In [4]:
encoder = FuncEncoderCountVectorizer(df_data=df, vocabulary_path="vocab/func_vocab.pkl")



In [5]:
from sklearn.model_selection import train_test_split

# split into training and validation sets
func_tr, func_test, y_tr, y_test = \
    train_test_split(df['processed_func'], df['target'], test_size=0.2, random_state=42)

In [6]:
# define how big a batch of entries should be (depending on RAM)
batch_size = 100

# number of epochs is calulated based on the batch_size
epochs = math.ceil(len(func_tr)/batch_size)

# split the dataframes (X_tr, y_tr) into an array of dataframes (number of epochs)
batchesFunc = np.array_split(func_tr, epochs)
batchesY = np.array_split(y_tr, epochs)

In [7]:
# import and initialisation of generic MLPClassifier
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(activation='relu', alpha=0.05, hidden_layer_sizes=(1500,750), learning_rate='adaptive',solver='adam', shuffle=True)

# iterate over the number of epochs
for i in tqdm(range(epochs)):
    # take a batch and process it and partial_fit the model to the batch
    func_batch, Y_batch = batchesFunc[i], batchesY[i]
    
    # encode to vector 
    encodedBatch = encoder.encode(func_batch.tolist())
    
    clf.partial_fit(encodedBatch, Y_batch, classes=np.unique(y_tr))

  0%|                             | 0/1359 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2816 > 512). Running this sequence through the model will result in indexing errors
100%|██████████████████| 1359/1359 [09:56<00:00,  2.28it/s]


In [8]:
X_test_encoded = encoder.encode(func_test.tolist())

print("Accuracy of prediction: " , clf.score(X_test_encoded, y_test))
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

Accuracy of prediction:  0.9734648799882197
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     32007
           1       0.94      0.58      0.71      1948

    accuracy                           0.97     33955
   macro avg       0.96      0.79      0.85     33955
weighted avg       0.97      0.97      0.97     33955



In [9]:
from joblib import dump
# Store model
dump(clf, 'models/validation_split/full_function.model')

['models/validation_split/full_function.model']