In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from GaussianGenerativeModel import GaussianGenerativeModel as GGM
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [3]:
# Load the feature table from disk and discard its 'id' column in one step
data = pd.read_csv("data/featuresFromStrings.csv").drop(['id'], axis = 1)

In [4]:
# Preview the first five rows of the feature table
data.head(n=5)

Unnamed: 0,dump_line,download_file,open_file,connect_socket,impersonate_user,sex,open_process,load_dll,kill_process,destroy_window,query_value,vm_protect,FILE_ANY_ACCESS,SECURITY_ANONYMOUS,create_open_file,Windows Desktop Search,find_file,classnum
0,0,0,15,0,0,0,2,135,2,42,242,36,39,39,0,275,6,8
1,3434,0,275,12,12,2,28,467,2,65,2004,365,809,809,34,549,85,6
2,0,0,0,0,0,0,0,26,1,3,7,0,8,8,0,0,5,12
3,0,0,21,0,0,0,4,86,0,1,17,54,161,156,0,0,41,8
4,636,0,27,2,0,0,0,130,1,0,35,25,29,29,15,0,0,10


In [6]:
def calcAccuracy(pred, obs):
    """
    Percentage of predictions that agree with the observed labels.

    pred: predicted labels (NumPy array, list or pandas Series)
    obs: observed labels, same number of entries as pred

    Returns a float between 0.0 and 100.0.
    Raises AssertionError when the two inputs differ in length and
    ZeroDivisionError when both are empty.
    """
    # Compare strictly by position. np.asarray discards any pandas index, so
    # a Series sliced from the middle of a frame is not looked up by label,
    # and plain lists are accepted as well.
    pred = np.asarray(pred).ravel()
    obs = np.asarray(obs).ravel()
    assert (pred.shape[0] == obs.shape[0])

    # Vectorised element-wise comparison; int() keeps the division below in
    # plain Python so the empty case still raises ZeroDivisionError.
    correct = int(np.count_nonzero(pred == obs))
    return 100.0 * correct / pred.shape[0]

In [14]:
def crossValidation(n, model, dataset=None, labelCol='classnum'):
    """
    Runs n-fold cross validation and prints the mean, variance and standard
    deviation of the per-fold accuracy (in percent).

    n: number of cross validation folds (2 <= n <= number of rows)
    model: model that we will use. Must have model.fit and model.predict.
           The same instance is fitted again on every fold.
    dataset: DataFrame holding the feature columns plus the label column.
             Defaults to the notebook-level `data` frame.
    labelCol: name of the column that stores the class labels

    Folds are contiguous blocks of rows taken in their existing order (no
    shuffling). Every row is used for validation exactly once; fold sizes
    differ by at most one row when the row count is not a multiple of n.

    Raises ValueError when n is outside [2, number of rows].
    Returns nothing; the results are printed.
    """
    import sys  # local import: only needed for the progress counter

    if dataset is None:
        dataset = data  # fall back to the frame loaded by the notebook

    dataSize = dataset.shape[0]
    if n < 2 or n > dataSize:
        raise ValueError("n must be between 2 and the number of rows (%d), got %r"
                         % (dataSize, n))

    # Split labels from features once; this does not depend on the fold
    labels = dataset[labelCol].values
    feats = dataset.drop([labelCol], axis = 1).values

    # Percent correct, one entry per fold
    accuracy = []

    for i in range(n):
        # Progress counter on a single line (same output as `print i,`)
        sys.stdout.write(str(i) + " ")

        # Integer fold boundaries; spreading the remainder over the folds
        # means the trailing rows are validated too.
        start = (i * dataSize) // n
        end = ((i + 1) * dataSize) // n

        # Everything outside [start, end) is training data
        valsTrain = np.concatenate((feats[:start], feats[end:]), axis=0)
        classTrain = np.concatenate((labels[:start], labels[end:]), axis=0)
        valsTest = feats[start:end]
        classTest = labels[start:end]

        # Fit on the training block and predict the held-out block
        model.fit(valsTrain, classTrain)
        classPred = model.predict(valsTest)

        # Calculate accuracy
        accuracy.append(calcAccuracy(classPred, classTest))

    print("")  # this is to make a new line
    print("Mean: " + str(np.mean(accuracy)))
    print("Variance: " + str(np.var(accuracy)))
    print("Std Dev: " + str(np.std(accuracy)))

In [15]:
# Gaussian generative model (project-local implementation), 20 folds
crossValidation(n=20, model=GGM())

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 
Mean: 37.6623376623
Variance: 103.516613257
Std Dev: 10.174311439


In [36]:
# Gaussian naive Bayes classifier, 20 folds
GNB = GaussianNB()
crossValidation(n=20, model=GNB)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 
Mean: 26.461038961
Variance: 19.4752487772
Std Dev: 4.41307701918


In [41]:
# Random forest classifier, 20 folds.
# The forest is randomised, so random_state is pinned to make the reported
# accuracy reproducible on a fresh kernel.
RF = RandomForestClassifier(random_state=0)
crossValidation(20, RF)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 
Mean: 88.961038961
Variance: 5.05987518975
Std Dev: 2.24941663321


In [42]:
# AdaBoost classifier, 20 folds.
# random_state is pinned so that the reported accuracy is reproducible on a
# fresh kernel.
AB = AdaBoostClassifier(random_state=0)
crossValidation(20, AB)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 
Mean: 68.9285714286
Variance: 9.54102715466
Std Dev: 3.08885531462
