In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import json

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk

# Seed NumPy's global random number generator so that NumPy-based randomness
# (such as the np.random.permutation shuffle of the dataset) is the same on every run.
np.random.seed(0)

Load the JSON dataset file that is produced by the C# application.

In [2]:
# Read the dataset written by the C# application.
# Forward slashes keep this relative path portable across operating systems; the
# previous '..\Output\...' literal only worked because '\O' and '\E' happen to be
# unrecognized escape sequences, which newer Python versions warn about.
with open('../Output/ExternalArgumentsDatasetWithoutSource.json') as json_file:
    # The file is only needed for parsing, so it is closed before any further work.
    data = np.array(json.load(json_file))

# Shuffle the test cases (reproducibly, given the seed set at the top of the notebook)
# so that a later positional split does not depend on the order in the file.
shuffle = np.random.permutation(np.arange(len(data)))
data = data[shuffle]

# Preview up to five test cases; slicing avoids an IndexError on a tiny dataset.
for test_case in data[:5]:
    print('Class Name:', test_case['ClassName'])
    print('Is Flawed:', test_case['IsFlawed'])
    print(type(test_case['IsFlawed']))
    print('Features:', test_case['Features'])
    print('-----------------------------------------------')

Class Name: CWE089
Is Flawed: True
<class 'bool'>
Features: string.String(char[])IsExternal(False)
System.IO.Path.GetInvalidFileNameChars()IsExternal()
string.String(char[])IsExternal(False)
System.IO.Path.GetInvalidPathChars()IsExternal()
System.Text.RegularExpressions.Regex.Regex(string)IsExternal(False)
string.Format(string,object)IsExternal(False,False)
System.Text.RegularExpressions.Regex.Escape(string)IsExternal(False)
System.Text.RegularExpressions.Regex.Replace(string,string)IsExternal(True,False)
System.Math.Pow(double,double)IsExternal(False,False)
System.Data.OracleClient.OracleConnection.OracleConnection(string)IsExternal(False)
System.Data.OracleClient.OracleConnection.Open()IsExternal()
System.Data.OracleClient.OracleConnection.CreateCommand()IsExternal()
System.Data.OracleClient.OracleCommand.ExecuteReader()IsExternal()
System.Data.OracleClient.OracleDataReader.Read()IsExternal()
System.Console.WriteLine(string)IsExternal(False)
object.ToString()IsExternal

In [3]:
print(len(data))

# Position in this list is the integer label; index 0 is reserved for unflawed cases.
categories = ['No Flaw', 'CWE022', 'CWE078', 'CWE089', 'CWE090', 'CWE091']
labels = []
examples = []
examples_skipped = 0

for test_case in data:
    # Unflawed test cases always map to label 0, whatever their class name is.
    if not test_case['IsFlawed']:
        labels.append(0)
        examples.append(test_case['Features'])
        continue

    # Flawed test cases whose class is not one of the known categories are dropped.
    class_name = test_case['ClassName']
    if class_name not in categories:
        examples_skipped += 1
        continue

    labels.append(categories.index(class_name))
    examples.append(test_case['Features'])

print('Label Count:', len(labels))
print('Example Count:', len(examples))
print('Examples Skipped:', examples_skipped)

30980
Label Count: 30970
Example Count: 30970
Examples Skipped: 10


In [4]:
# Show the first five encoded examples together with their decoded category names.
for preview_index in range(5):
    label = labels[preview_index]
    print('Label:', label)
    print('Category:', categories[label])
    print('Example:', examples[preview_index])

Label: 3
Category: CWE089
Example: string.String(char[])IsExternal(False)
System.IO.Path.GetInvalidFileNameChars()IsExternal()
string.String(char[])IsExternal(False)
System.IO.Path.GetInvalidPathChars()IsExternal()
System.Text.RegularExpressions.Regex.Regex(string)IsExternal(False)
string.Format(string,object)IsExternal(False,False)
System.Text.RegularExpressions.Regex.Escape(string)IsExternal(False)
System.Text.RegularExpressions.Regex.Replace(string,string)IsExternal(True,False)
System.Math.Pow(double,double)IsExternal(False,False)
System.Data.OracleClient.OracleConnection.OracleConnection(string)IsExternal(False)
System.Data.OracleClient.OracleConnection.Open()IsExternal()
System.Data.OracleClient.OracleConnection.CreateCommand()IsExternal()
System.Data.OracleClient.OracleCommand.ExecuteReader()IsExternal()
System.Data.OracleClient.OracleDataReader.Read()IsExternal()
System.Console.WriteLine(string)IsExternal(False)
object.ToString()IsExternal()
System.Data.OracleCli

In [5]:
# Hold out the first 20% of the (already shuffled) examples as the dev set and
# train on the remainder.
num_test = int(len(examples) * 0.2)
print("num_test:", num_test)
dev_data, train_data = examples[:num_test], examples[num_test:]
dev_labels, train_labels = labels[:num_test], labels[num_test:]
print(len(dev_data))
print(len(train_data))

num_test: 6194
6194
24776


In [None]:
# Vectorize the training text, then reuse the SAME fitted vectorizer for the dev set so
# both matrices share one vocabulary and column order. (Rebuilding a second
# CountVectorizer from get_feature_names() is redundant, and that method has been
# removed from newer scikit-learn releases.)
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w[\w\.(),]+\b",stop_words=None)
feature_vectors = vectorizer.fit_transform(train_data)
dev_feature_vectors = vectorizer.transform(dev_data)


def run_grid_search(title, estimator, param_grid, param_label):
    """Tune a classifier with 5-fold cross-validation and report its dev-set scores.

    Fits on this cell's `feature_vectors` / `train_labels` and evaluates on
    `dev_feature_vectors` / `dev_labels`.

    Args:
        title: Heading printed above the results.
        estimator: Unfitted scikit-learn classifier to tune.
        param_grid: Mapping of hyperparameter name to the candidate values to try.
        param_label: Name shown in the "Optimal value for ..." line.

    Returns:
        The dev-set labels predicted by the best estimator found by the search.
    """
    search = GridSearchCV(estimator, param_grid=param_grid, cv=5)
    search.fit(feature_vectors, train_labels)
    print(title)
    print("Optimal value for " + param_label + ":", search.best_params_)
    predictions = search.best_estimator_.predict(dev_feature_vectors)
    print("F1 score:", metrics.f1_score(dev_labels, predictions, average='weighted'))
    print("Accuracy: ", metrics.accuracy_score(dev_labels, predictions))
    print()
    return predictions


# K Nearest Neighbors
n_neighbors = {'n_neighbors': [1, 3, 5, 7, 9, 11]}
run_grid_search("K Nearest Neighbors", KNeighborsClassifier(), n_neighbors, "k")

# Multinomial Naive Bayes
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0] }
run_grid_search("Multinomial Naive Bayes", MultinomialNB(), alphas, "alpha")

# Logistic Regression (its predictions feed the confusion analysis below)
c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 1000.0]
c = {'C': c_values }
predicted_labels = run_grid_search("Logistic Regression", LogisticRegression(), c, "C")

# Confusion matrix for the logistic regression predictions. Passing `labels` pins the
# matrix to one row/column per entry in `categories`, even if a category never occurs
# in the dev set, so the indices used below always line up with `categories`.
category_ids = list(range(len(categories)))
matrix = confusion_matrix(dev_labels, predicted_labels, labels=category_ids)
print(matrix)

# Find the most confused (actual, predicted) category pair, ignoring the diagonal.
max_errors = 0
most_confused_actual = -1
most_confused_predicted = -1
for actual in category_ids:
    for predicted in category_ids:
        if actual != predicted and matrix[actual, predicted] > max_errors:
            max_errors = matrix[actual, predicted]
            most_confused_actual = actual
            most_confused_predicted = predicted

if max_errors == 0:
    # Without this guard the -1 sentinels would silently index the LAST category.
    print("No misclassified dev examples; there is no confused pair to report.")
else:
    print ("Most confused pair is actual =", categories[most_confused_actual], ", predicted =", categories[most_confused_predicted])
    print ("This error occurred", max_errors, "times")
    # Print up to 10 dev examples that fall into the most confused pair.
    error_pair_count = 0
    for example, actual, predicted in zip(dev_data, dev_labels, predicted_labels):
        if actual == most_confused_actual and predicted == most_confused_predicted:
            print(example)
            print("-----------------------------")
            error_pair_count = error_pair_count + 1
            if error_pair_count == 10:
                break

In [None]:
def build_table(vectorizer):
    """Print the five highest-weighted features for each class of a tuned logistic regression.

    Fits `vectorizer` on the notebook's `train_data`, grid-searches the regularization
    strength C with 5-fold cross-validation against `train_labels`, and then prints one
    row per (class, top feature): the category name, the feature name, and that
    feature's weight in every coefficient row of the best model.

    Args:
        vectorizer: An unfitted text vectorizer (e.g. CountVectorizer); it is fitted here.
    """
    feature_vectors = vectorizer.fit_transform(train_data)
    c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    c = {'C': c_values }
    grid_search = GridSearchCV(LogisticRegression(), param_grid=c, cv=5)
    grid_search.fit(feature_vectors, train_labels)
    best_model = grid_search.best_estimator_
    coefficients = best_model.coef_

    # get_feature_names() was removed from newer scikit-learn releases; use its
    # replacement when available and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Map each coefficient row to the label it describes instead of assuming row i is
    # category i (a category missing from the training labels would shift the rows).
    # In the binary case scikit-learn keeps a single row, which describes the second
    # class, hence the slice from the end of classes_.
    row_labels = best_model.classes_[-len(coefficients):]

    for class_index, class_weights in enumerate(coefficients):
        category = categories[row_labels[class_index]]
        # argsort is ascending, so the last five indices are the largest weights.
        top_5 = np.argsort(class_weights)[-5:]
        for feature_index in top_5:
            # One weight column per coefficient row, rather than a hard-coded four.
            weights = ' '.join('{:< .4f}'.format(weight) for weight in coefficients[:, feature_index])
            print('{:<19}'.format(category) + '{:<20}'.format(feature_names[feature_index]) + weights)

build_table(CountVectorizer(token_pattern=r"(?u)\b\w[\w\.(),]+\b",stop_words=None))