In [26]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import json
import os.path
from os import path
from joblib import dump, load

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.ensemble import RandomForestClassifier

import nltk

# Set the randomizer seed so results are the same each time.
np.random.seed(0)

Import JSON from dataset file that is output by the C# application. The C# preprocessing application creates several different datasets with different types of extracted features. Each is stored in a different JSON file. Swapping out the file name here allows the notebook to be executed against different datasets.
Then randomize the order of the data. The original SARD dataset had certain vulnerability types grouped together. Which had adverse affects on the training and testing.
Print out 5 examples from the dataset.

In [27]:
with open('..\Output\ExternalArgumentsDatasetWithSource.json') as json_file:
    data = np.array(json.load(json_file))
    shuffle = np.random.permutation(np.arange(len(data)))
    data = data[shuffle]
    for test_case_index in range (0, 5):
        print('Class Name:', data[test_case_index]['ClassName'])
        print('Is Flawed:', data[test_case_index]['IsFlawed'])
        print(type(data[test_case_index]['IsFlawed']))
        print('Features:', data[test_case_index]['Features'])
        print('-----------------------------------------------')

Class Name: CWE089
Is Flawed: True
<class 'bool'>
Features: 


using System;
using System.Text;
using System.Data;
using System.Data.SQLite;

namespace default_namespace{
    class MainClass41141{
        public static void Main(string[] args){
            string tainted_2 = null;
string tainted_3 = null;

            
                tainted_2 = args[1];
            
tainted_3 = tainted_2;
            
                if((Math.Sqrt(42)<=42)){
                    
                StringBuilder escape = new StringBuilder();
                for (int i = 0; i < tainted_2.Length; ++i){
                    char current = tainted_2[i];
                    switch (current){
                        case '\\':
                            escape.Append(@"\5c");
                            break;
                        case '*':
                            escape.Append(@"\2a");
                            break;
                        case '(':
                            escape.Append(@"\28")

Convert the feature names to numeric features indices. Omit CWEs that only have a small number of examples.

In [28]:
print(len(data))
categories = ['No Flaw', 'CWE022', 'CWE078', 'CWE089', 'CWE090', 'CWE091']
labels = []
examples = []
examples_skipped = 0

for test_case_index in range(0, len(data)):
    if data[test_case_index]['IsFlawed']:
        try:
            category_index = categories.index(data[test_case_index]['ClassName'])
            labels.append(category_index)
            examples.append(data[test_case_index]['Features']) 
        except ValueError:
            examples_skipped = examples_skipped + 1
    else:
        labels.append(0)
        examples.append(data[test_case_index]['Features'])
        
print('Label Count:', len(labels))
print('Example Count:', len(examples))
print('Examples Skipped:', examples_skipped)

30980
Label Count: 30970
Example Count: 30970
Examples Skipped: 10


Print 5 examples to verify.

In [29]:
for test_case_index in range (0, 5):
    print('Label:', labels[test_case_index])
    print('Category:', categories[labels[test_case_index]])
    print('Example:', examples[test_case_index])

Label: 3
Category: CWE089
Example: 


using System;
using System.Text;
using System.Data;
using System.Data.SQLite;

namespace default_namespace{
    class MainClass41141{
        public static void Main(string[] args){
            string tainted_2 = null;
string tainted_3 = null;

            
                tainted_2 = args[1];
            
tainted_3 = tainted_2;
            
                if((Math.Sqrt(42)<=42)){
                    
                StringBuilder escape = new StringBuilder();
                for (int i = 0; i < tainted_2.Length; ++i){
                    char current = tainted_2[i];
                    switch (current){
                        case '\\':
                            escape.Append(@"\5c");
                            break;
                        case '*':
                            escape.Append(@"\2a");
                            break;
                        case '(':
                            escape.Append(@"\28");
                       

Split the data into train and test sets using a 80/20 split.

In [30]:
num_test = int(len(examples) * 0.2)
print("num_test:", num_test)
train_data, train_labels = examples[num_test:], labels[num_test:]
dev_data, dev_labels = examples[:num_test], labels[:num_test]
print(len(dev_data))
print(len(train_data))

num_test: 6194
6194
24776


Run the code samples through a CountVectorizer with a custom token_pattern. The pattern allows for periods, parenthesis and commas to be included in a "word". The purpose of this is to allow method signatures to be considered as a single word instead of split into parts on the punctuation. For example, "Console.WriteLine(string,string)" would be treated as a single word.
Train K Nearest Neighbors, Multinomial Naive Bayes, Random Forest, Logistic Regression (with TfidfVectorizer) and Logistic Regression models using GridSearchCV to optimize the hyper parameters for each.
Print out a confusion matrix and ten examples from the most confused pair of classes. This is what helped us the most in tuning the preprocessor application. Each time we ran this the printed examples would have similarities that highlighted the mistakes the model was making and suggested ways to improve the preprocessing to help mitigate those mistakes.

In [31]:
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w[\w\.(),]+\b",stop_words=None)
feature_vectors = vectorizer.fit_transform(train_data)
vocabulary = vectorizer.get_feature_names()
dev_vectorizer = CountVectorizer(vocabulary=vocabulary,token_pattern=r"(?u)\b\w[\w\.(),]+\b",stop_words=None)
dev_feature_vectors = dev_vectorizer.fit_transform(dev_data)

'''
# K Nearest Neighbors
n_neighbors = {'n_neighbors': [1, 3, 5, 7, 9, 11]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid=n_neighbors, cv=5)
grid_search.fit(feature_vectors, train_labels)
print("K Nearest Neighbors")
print("Optimal value for k:", grid_search.best_params_)
predicted_labels = grid_search.best_estimator_.predict(dev_feature_vectors)
print("F1 score:", metrics.f1_score(dev_labels, predicted_labels, average='weighted'))
print("Accuracy: ", metrics.accuracy_score(dev_labels, predicted_labels))
print()

# Multinomial Naive Bayes
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0] }
grid_search = GridSearchCV(MultinomialNB(), param_grid=alphas, cv=5)
grid_search.fit(feature_vectors, train_labels)
print("Multinomial Naive Bayes")
print("Optimal value for alpha:", grid_search.best_params_)
predicted_labels = grid_search.best_estimator_.predict(dev_feature_vectors)
print("F1 score:", metrics.f1_score(dev_labels, predicted_labels, average='weighted'))
print("Accuracy: ", metrics.accuracy_score(dev_labels, predicted_labels))
print()

# Random Forest
print("Random Forest")
random_forest_model = RandomForestClassifier(n_estimators = 100)
random_forest_model.fit(feature_vectors, train_labels)
predicted_labels = random_forest_model.predict(dev_feature_vectors)
print("F1 score:", metrics.f1_score(dev_labels, predicted_labels, average='weighted'))
print("Accuracy: ", metrics.accuracy_score(dev_labels, predicted_labels))
print()'''

c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 1000.0]
c = {'C': c_values }

if path.exists('model.pkl'):
    model = load('model.pkl')
else:
    # Logistic Regression (with TfidfVectorizer)
    tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w[\w\.(),]+\b",stop_words=None)
    tfidf_feature_vectors = tfidf_vectorizer.fit_transform(train_data)
    tfidf_vocabulary = tfidf_vectorizer.get_feature_names()
    tfidf_dev_vectorizer = TfidfVectorizer(vocabulary=tfidf_vocabulary,token_pattern=r"(?u)\b\w[\w\.(),]+\b",stop_words=None)
    tfidf_dev_feature_vectors = tfidf_dev_vectorizer.fit_transform(dev_data)
    grid_search = GridSearchCV(LogisticRegression(), param_grid=c, cv=5)
    grid_search.fit(tfidf_feature_vectors, train_labels)
    #print("Logistic Regression (with TfidfVectorizer)")
    print("Optimal value for C:", grid_search.best_params_)
    model = grid_search.best_estimator_
    dump(grid_search.best_estimator_, 'model.pkl')
    #predicted_labels = grid_search.best_estimator_.predict(tfidf_dev_feature_vectors)
    #print("F1 score:", metrics.f1_score(dev_labels, predicted_labels, average='weighted'))
    #print("Accuracy: ", metrics.accuracy_score(dev_labels, predicted_labels))
    #print()

print("Logistic Regression (saved model)")
#print("Optimal value for C:", grid_search.best_params_)
predicted_labels = model.predict(tfidf_dev_feature_vectors)
print("F1 score:", metrics.f1_score(dev_labels, predicted_labels, average='weighted'))
print("Accuracy: ", metrics.accuracy_score(dev_labels, predicted_labels))
print()

'''logistic_regression = LogisticRegression(c=100)
logistic_regression.fit(tfidf_feature_vectors, train_labels)
dump(logistic_regression, open('model.pkl', 'wb'))
model = load(open('model.pkl', 'rb'))
print("Logistic Regression (saved model)")
#print("Optimal value for C:", grid_search.best_params_)
predicted_labels = grid_search.best_estimator_.predict(tfidf_dev_feature_vectors)
print("F1 score:", metrics.f1_score(dev_labels, predicted_labels, average='weighted'))
print("Accuracy: ", metrics.accuracy_score(dev_labels, predicted_labels))
print()

# Logistic Regression
grid_search = GridSearchCV(LogisticRegression(), param_grid=c, cv=5)
grid_search.fit(feature_vectors, train_labels)
print("Logistic Regression")
print("Optimal value for C:", grid_search.best_params_)
predicted_labels = grid_search.best_estimator_.predict(dev_feature_vectors)
print("F1 score:", metrics.f1_score(dev_labels, predicted_labels, average='weighted'))
print("Accuracy: ", metrics.accuracy_score(dev_labels, predicted_labels))
print()
'''

matrix = confusion_matrix(dev_labels, predicted_labels)
print(matrix)
max_errors = 0
most_confused_actual = -1
most_confused_predicted = -1
# Find the most confused digit pair
for actual in range(0, len(categories)):
    for predicted in range(0, len(categories)):
        if (actual != predicted and matrix[actual, predicted] > max_errors):
            max_errors = matrix[actual, predicted]
            most_confused_actual = actual
            most_confused_predicted = predicted
print ("Most confused pair is actual =", categories[most_confused_actual], ", predicted =", categories[most_confused_predicted])
print ("This error occurred", max_errors, "times")
# Print out examples of the confused digits
error_pair_count = 0
for index in range(0, len(dev_labels)):
    if (dev_labels[index] == most_confused_actual and predicted_labels[index] == most_confused_predicted and error_pair_count < 10):
        print(dev_data[index])
        print("-----------------------------")
        error_pair_count = error_pair_count + 1

Logistic Regression (saved model)
F1 score: 0.9980630743010547
Accuracy:  0.9980626412657411

[[2577    0    1    2    0    3]
 [   1  258    0    0    0    0]
 [   0    0  253    0    0    0]
 [   5    0    0 2381    0    0]
 [   0    0    0    0  252    0]
 [   0    0    0    0    0  461]]
Most confused pair is actual = CWE089 , predicted = No Flaw
This error occurred 5 times



using System;
using MySql.Data.MySqlClient;
using System.Text.RegularExpressions;

namespace default_namespace{
    class MainClass31406{
        public static void Main(string[] args){
            string tainted_2 = null;
string tainted_3 = null;

            
                tainted_2 = Console.ReadLine();
            
tainted_3 = tainted_2;
            
                switch(6){
                    case(6):
                        break;
                    default:
                        
                string pattern = @"/^[0-9]*$/";
                Regex r = new Regex(pattern);
                Match 

Print the most highly weighted words for each class. This helped us in determining which words were impacting the model without actually affecting the functionality of the code. For example, the naming of a variables in a lot of the examples can give hints as to what type of vulnerability is contained in it. In real life samples you wouldn't have this. So in practice we wouldn't want the model to rely on variable names.

In [32]:
def build_table(vectorizer):
    feature_vectors = vectorizer.fit_transform(train_data)
    c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    c = {'C': c_values }
    grid_search = GridSearchCV(LogisticRegression(), param_grid=c, cv=5)
    grid_search.fit(feature_vectors, train_labels)
    max_values = [None]*len(grid_search.best_estimator_.coef_)
    for class_index in range(0, len(grid_search.best_estimator_.coef_)):
        top_5 = np.argsort(grid_search.best_estimator_.coef_[class_index])[-5:]
        max_values[class_index] = top_5
    feature_names = vectorizer.get_feature_names()
    #print('{:<18}'.format(newsgroups_train.target_names[class_index]), '{:<20}'.format(weight), '{:<11}'.format(feature_names[feature_index])) 
    for class_index in range(0, len(grid_search.best_estimator_.coef_)):
        for feature_index in max_values[class_index]:
            weight = grid_search.best_estimator_.coef_[class_index][feature_index]
            print('{:<19}'.format(categories[class_index]), end = '')
            print('{:<20}'.format(feature_names[feature_index]), end = '')
            print('{:< .4f}'.format(grid_search.best_estimator_.coef_[0][feature_index]), end = ' ') 
            print('{:< .4f}'.format(grid_search.best_estimator_.coef_[1][feature_index]), end = ' ') 
            print('{:< .4f}'.format(grid_search.best_estimator_.coef_[2][feature_index]), end = ' ') 
            print('{:< .4f}'.format(grid_search.best_estimator_.coef_[3][feature_index])) 
            
build_table(CountVectorizer(token_pattern=r"(?u)\b\w[\w\.(),]+\b",stop_words=None))



No Flaw            mainclass55639       4.8016 -0.0000 -0.0007 -0.0067
No Flaw            mainclass25106       4.8338 -0.0002 -0.6973 -0.0171
No Flaw            mainclass51199       5.1944 -0.0001 -0.0005 -0.0078
No Flaw            haspathtraversalregex 9.7049 -9.8482 -0.0926 -0.0071
No Flaw            hardcoded            11.4980 -3.7021 -8.1048 -9.6068
CWE022             file.exists(tainted_3 0.3486  1.7835 -1.9477 -0.7621
CWE022             args                -7.1529  2.2066  4.4897  4.7987
CWE022             string.format(string,object)isexternal(false,false-1.0633  2.2186 -0.8524  0.2226
CWE022             system.text.regularexpressions.regex.escape(string)isexternal(false-2.6426  2.2240 -0.8495  0.2141
CWE022             system.io.file.exists(string)isexternal(true 0.9482  5.6598 -2.1057 -0.7714
CWE078             mainclass24738      -0.0822 -0.0002  2.3344 -0.0219
CWE078             system.console.readline()isexternal-3.5695  1.0811  2.5318  2.5288
CWE078             console.re

Attempt to remove some of the bias from variable names via the use of a custom preprocessor method. This resulted in a reduced accuracy in our baseline dataset as expected. A more optimal solution (not implemented here) would be the replace all the variable names with more generic names when parsing the code and preprocessing.

In [33]:
def better_preprocessor(s):
#Looks for the pattern 'Vul' followed up 0-4 digits and replaces it with the word "blahVar"
    new_s = re.sub('vul\d{0,4}','blahVar', s)
#Looks for the pattern 'tainted' and replaces it with "blahVar"
    new_s = re.sub('tainted', 'blahVar', new_s)
#Looks for the word 'class' and takes the following word and changes it to "blahVar"
    new_s = re.sub('class \s*(\S+)', 'blahVar', new_s)
#Looks for the word 'placeholder' and changes it to 'blahVar'
    new_s = re.sub('placeholder', 'blahVar', new_s)
#Looks for the word 'checked_data' and changes it to 'blahvar'
    new_s = re.sub('checked_data', 'blahVar', new_s)
#Looks for the word 'hardcoded' and changes it to 'blahvar'
    new_s = re.sub('hardcoded', 'blahVar', new_s)    
    return new_s

vectorizer = CountVectorizer(preprocessor = better_preprocessor)
feature_vectors = vectorizer.fit_transform(train_data)
vocabulary = vectorizer.get_feature_names()
dev_vectorizer = CountVectorizer(preprocessor = better_preprocessor, vocabulary=vocabulary)
dev_feature_vectors = dev_vectorizer.fit_transform(dev_data)

# K Nearest Neighbors
n_neighbors = {'n_neighbors': [1, 3, 5, 7, 9, 11]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid=n_neighbors, cv=5)
grid_search.fit(feature_vectors, train_labels)
print("K Nearest Neighbors")
print("Optimal value for k:", grid_search.best_params_)
predicted_labels = grid_search.best_estimator_.predict(dev_feature_vectors)
print("F1 score:", metrics.f1_score(dev_labels, predicted_labels, average='weighted'))
print("Accuracy: ", metrics.accuracy_score(dev_labels, predicted_labels))
print()

K Nearest Neighbors
Optimal value for k: {'n_neighbors': 5}
F1 score: 0.9857480356256412
Accuracy:  0.9857927026154343

