In [1]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

import util

TRAIN_DIR = "train"
TEST_DIR = "test"

call_set = set([])

In [2]:
#create train and test set
def split_mask(dftouse, split_size = 0.7):
    itrain, itest = train_test_split(xrange(dftouse.shape[0]), train_size=split_size)
    mask=np.ones(dftouse.shape[0], dtype='int')
    mask[itrain]=1
    mask[itest]=0
    mask = (mask==1)
    return mask

In [10]:
## Feature extraction
def main():
    X_train, t_train, train_ids = create_data_matrix(0, 6172, TRAIN_DIR)
    print "\n"
    X_test, t_test, test_ids = create_data_matrix(0, 3724, TEST_DIR)
    return X_train, t_train, X_test, t_test, train_ids, test_ids

    # From here, you can train models (eg by importing sklearn and inputting X_train, t_train).
    
def write_to_file(filename, pred_ids, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for pred_id, pred in zip(pred_ids, predictions):
            f.write(str(pred_id) + "," + str(pred) + "\n")
    
if __name__ == "__main__":
    X_train, t_train, X_test, t_test, train_ids, test_ids = main()
    
    #logreg = LogisticRegression(C=1e5)
    #logreg.fit(X_train, t_train)
    #preds = logreg.predict(X_test)
    
    clfForest, _, _, _, _, best_params = do_classify(clfForest, parameters, X_train, t_train, mask=True, n_folds = 2, 
                                                     n_jobs = 4, verbose=True)
    preds = clfForest.predict(X_test)
    write_to_file("submissions/RandForest.csv", test_ids, preds)

Reading from: train
Number of datafiles loaded: 6172 

Reading from: test
Number of datafiles loaded: 3724Fitting 2 folds for each of 1 candidates, totalling 2 fits
 BEST {'n_estimators': 100, 'max_depth': 25} 0.784259259259 [mean: 0.78426, std: 0.00261, params: {'n_estimators': 100, 'max_depth': 25}]
Training classes
[ 8  6 12 ...,  8  8  3]
Predicted classes
[8 6 8 ..., 8 8 3]
Training classes, cv
[ 8  8  8 ..., 12 10 10]
Predicted classes, cv
[ 8  8  8 ...,  8 10 10]
############# based on standard predict ################
Accuracy on training data: 0.8009
Accuracy on crossv data:   0.7797
########################################################


[Parallel(n_jobs=4)]: Done   2 out of   2 | elapsed:    0.9s finished


In [3]:
def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc):
    X = None
    classes = []
    ids = [] 
    i = -1
    print "Reading from:", direc
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue
        
        if datafile[0] == '.':
            datafile = datafile[2:]
        
        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break 
        
        print "\rNumber of datafiles loaded:", i+1,
        
        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = ['sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

In [4]:
def do_classify(clf, parameters, X, y, mask=None, n_folds=5, n_jobs=4, verbose=False, score_func='f1'):
    if mask is not None:
        mask = split_mask(X)
    X_train, X_cv, y_train, y_cv = X[mask], X[~mask], y[mask], y[~mask]
    clf, best_params_ = cv_optimize(clf, parameters, X_train, y_train, n_jobs=n_jobs, n_folds=n_folds, verbose=verbose)
    fit=clf.fit(X_train, y_train)
    train_preds = clf.predict(X_train)
    cv_preds = clf.predict(X_cv)
    training_accuracy = predictAccuracy(y_train, train_preds)
    cv_accuracy = predictAccuracy(y_cv, cv_preds)
    print "Training classes"
    print y_train
    print "Predicted classes"
    print train_preds
    print "Training classes, cv"
    print y_cv
    print "Predicted classes, cv"
    print cv_preds
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.4f" % training_accuracy
    print "Accuracy on crossv data:   %0.4f" % cv_accuracy
    print "########################################################"
    return fit, X_train, y_train, X_cv, y_cv, best_params_

In [5]:
def cv_optimize(clf, parameters, X, y, n_jobs, n_folds, verbose):
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, verbose=verbose)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
    best = gs.best_estimator_
    return best, gs.best_params_

In [6]:
clfForest = RandomForestClassifier()
parameters = {
    "n_estimators" : [100],
#    "n_estimators" : [100,200,400,800],
    "max_depth": [25]
#    "max_depth": [100,200,400,None]
}

In [8]:
def predictAccuracy(y1, y2):
    return sum(a == b for a,b in zip(y1,y2)) / float(len(y1))