In [1]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score

# Read malware data from csmining dataset in csv format
malware_data = pd.read_csv("CSDMC_API_Train.csv")


In [2]:
# TODO: Calculate number of malware traces
n_traces = len(malware_data)

# TODO: Calculate number of features
n_features = len(malware_data.columns) - 1 


print malware_data.columns
target = malware_data['Type']

# Print the results
print "Total number of traces: {}".format(n_traces)
print "Total number of types: {}".format(len(target))


Index([u'Type', u' APICalls'], dtype='object')
Total number of traces: 388
Total number of types: 388


In [3]:
#for each record create a list of APICalls only 
Apicalls = malware_data[' APICalls']
print Apicalls.head()
# Print the results
print "Total number of APICalls list: {}".format(len(Apicalls))

0    LoadLibraryW HeapAlloc HeapAlloc HeapFree Heap...
1    RegOpenKeyExW LoadLibraryA GetProcAddress GetP...
2    HeapAlloc HeapFree HeapAlloc HeapAlloc HeapFre...
3    HeapAlloc HeapFree HeapAlloc HeapAlloc HeapFre...
4    HeapAlloc HeapFree HeapAlloc HeapAlloc HeapFre...
Name:  APICalls, dtype: object
Total number of APICalls list: 388


In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline


#create a pipeline, which uses a countvectorizer, a tfidftransformer and a Multinomial Naive Bayes classifier to classify 
#the software 

#a countvectorizer is used to convert the APICalls to a numeric form, with the count of an APICall tied to its index.
#a tfidftransformer is used to convert the counts from the countvectorizer into term frequencies with words adjusted by inverse
#document frequency.

# A pipeline in scikit learn facilitates the data transfer between various functions without having to do the same 
# explicitly

text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),])

In [14]:
from sklearn.cross_validation import train_test_split

#split the training data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(Apicalls, target, test_size=.2, random_state =1)

In [15]:
#fit the Multinomial Naive Bayes classifier with training data
text_clf = text_clf.fit(X_train, y_train)

#use the trained Multinomial Naive Bayes classifier to predict the type of test data
predicted = text_clf.predict(X_test)

#determine the accuracy of the predicted values
np.mean(predicted == y_test)

0.84615384615384615

In [16]:
from sklearn.linear_model import SGDClassifier

#create a pipeline, which uses a countvectorizer, a tfidftransformer and a SGDclassifier to classify the software
#SGDClassifier implements linear models with stochastic gradient descent learning.  The loss function of Hinge 
#gives a Linear SVM
#penalty = I2 is a standard regularizer for linear SVM
#alpha is 
#n_iter is the number of passes over the training set

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),])

In [17]:
#fit SGD based linear SVM classifier with training data
text_clf = text_clf.fit(X_train, y_train)

#use the trained SVM classifier to predict the type of test data
predicted = text_clf.predict(X_test)

#determine the accuracy of the predicted values
np.mean(predicted == y_test)

0.89743589743589747

In [18]:
from sklearn import metrics
print(metrics.classification_report(y_test, predicted,target_names=['Benign','Malware']))

             precision    recall  f1-score   support

     Benign       0.83      0.42      0.56        12
    Malware       0.90      0.98      0.94        66

avg / total       0.89      0.90      0.88        78

