In [64]:
import os
import csv 
import sys 
import numpy as np
from scipy import sparse
from collections import Counter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

import util

In [65]:
def reorder_submission(file_to_reorder, newfile_name = "experiment_results.csv"):
    # READ IN KEYS IN CORRECT ORDER AS LIST
    with open('keys.csv','r') as f:
        keyreader = csv.reader(f)
        keys = [key[0] for key in keyreader]

    # READ IN ALL PREDICTIONS, REGARDLESS OF ORDER
    with open(file_to_reorder) as f:
        oldfile_reader = csv.reader(f)
        D = {}
        for i,row in enumerate(oldfile_reader):
            if i == 0:
                continue
            _id, pred = row 
            D[_id] = pred

    # WRITE PREDICTIONS IN NEW ORDER
    with open(newfile_name,'w') as f:
        writer = csv.writer(f)
        writer.writerow(('Id','Prediction'))
        for key in keys:
            writer.writerow((key,D[key]))

    print("".join(["Reordered ", file_to_reorder," and wrote to ", newfile_name]))

In [66]:
from keras.layers import Dense # For dense layers
from keras.models import Sequential # For sequential layering
from keras.callbacks import EarlyStopping # For stopping execution
from sklearn.metrics import accuracy_score

def train_model_DL(X_train, Y_train, n_nodes, hid_activation = 'sigmoid', out_activation = 'softmax', optimizer = 'adadelta', loss = 'categorical_crossentropy'):
    """ n_nodes is 1-D numpy array with number of nodes on each layer
        e.g. [10,20,30] is a model with 3 (hidden) layers,
        with 10/20/30 nodes on the first/second/third layers
        Returns trained DL model """
    input_shape = (X_train.shape[1],) # Shape of input data
    # Initialize model
    model_DL = Sequential()
    for i in range(len(n_nodes)):
        if i == 0:
            # First layer
            model_DL.add(Dense(n_nodes[i], activation = hid_activation, input_shape = input_shape))
        else:
            # Subsequent layers
            model_DL.add(Dense(n_nodes[i],activation = hid_activation))
    # Output layer
    model_DL.add(Dense(15, activation = out_activation))
    # Compile model
    model_DL.compile(optimizer = optimizer,loss = loss)
    # Print model summary
    model_DL.summary()
    # Early stopping monitor w/ patience=3 (stop after 3 runs without improvements)
    early_stopping_monitor = EarlyStopping(patience=5)
    # Fit model using 20% of data for validation
    model_DL.fit(X_train, Y_train, validation_split=0.2, epochs=200, callbacks=[early_stopping_monitor])
    Y_train_DLpred = model_DL.predict(X_train)
    acc_DL = accuracy_score(Y_train, Y_train_DLpred)
    print('DONE. Accuracy: ', acc_DL)
    return model_DL

In [67]:
import ntpath
def path_leaf(path):
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)

In [68]:
from urlparse import urlparse
def url_domain(url):
    parsed_uri = urlparse(url)
    domain = '{uri.netloc}'.format(uri=parsed_uri)
    return domain

In [69]:
def get_tokens(direc, useFirstThreadOnly):
    tags_set = []
    values_set = []
    classes = []
    ids = [] 
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        in_all_section = False
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # accumulate features
        tags = ""
        values = ""
        for el in tree.iter():
            # ignore everything outside the "all_section" element
            if el.tag == "all_section" and not in_all_section:
                in_all_section = True
            elif el.tag == "all_section" and in_all_section:
                in_all_section = False
                if useFirstThreadOnly is True:
                    break
            elif in_all_section:
                tags += " " + el.tag.replace('_', '')
                for k, v in el.attrib.iteritems():
                    if ("hash" and "id" and "index" and "size" and "time") not in k:
                        if "file" in k:
                            values += " " + path_leaf(v).replace('.', '').replace('$', '').replace('_', '').replace('-', '')
                        elif "url" in k:
                            values += " " + url_domain(v).replace('.', '').replace('$', '').replace('_', '').replace('-', '')
                        elif ("key" or "name" or "target" or "command" or "socket" or "value") in k:
                            values += " " + v.replace('.', '').replace('$', '').replace('_', '').replace('-', '')
                
        tags_set.append(tags)
        values_set.append(values)
    
    assert len(tags_set) == len(values_set)
    return tags_set, values_set, np.array(classes), ids

In [70]:
def get_tags_bigram(tags_set):
    tags_bigram_set = []
    for tags_entry in tags_set:
        tags = tags_entry.split(' ')
        pre_tag = ""
        tags_bigram = ""
        for tag in tags:
            if pre_tag is not "":
                tags_bigram += " " + pre_tag + tag
            pre_tag = tag
        tags_bigram_set.append(tags_bigram)
    return tags_bigram_set

In [71]:
def get_tags_3gram(tags_set):
    tags_3gram_set = []
    for tags_entry in tags_set:
        tags = tags_entry.split(' ')
        first_tag = ""
        second_tag = ""
        tags_3gram = ""
        for tag in tags:
            if first_tag is not "":
                tags_3gram += " " + first_tag + second_tag+tag
            first_tag = second_tag
            second_tag = tag
        tags_3gram_set.append(tags_3gram)
    return tags_3gram_set

In [72]:
def get_all_tokens(direc, useFirstThreadOnly = False):
    tags_set, values_set, classes, ids = get_tokens(direc, useFirstThreadOnly)
    tags_bigram = get_tags_bigram(tags_set)
    tags_3gram = get_tags_3gram(tags_set)
    assert len(tags_set) == len(tags_bigram) == len(tags_3gram)
    tokens_set = []
    for i in range(len(tags_set)):
        tokens_set.append(tags_set[i] + ' ' + values_set[i] + ' ' + tags_bigram[i] + ' ' + tags_3gram[i])
    return tokens_set, classes, ids

In [73]:
#############################
#
#  Below is main function
#
#############################

In [74]:
train_dir = "../train_origin"
test_dir = "../test_origin"
outputfile = "experiment_predictions.csv"

In [75]:
train_tokens, train_classes, _ = get_all_tokens('../train_origin')

In [76]:
test_tokens, _, ids = get_all_tokens('../test_origin')

In [77]:
y_train = np.zeros((len(train_classes),len(util.malware_classes)))
y_train[np.arange(len(train_classes)), train_classes] = 1
y_train.shape

(3086, 15)

In [78]:
TF = TfidfVectorizer(analyzer = 'word')

In [79]:
X_train = TF.fit_transform(train_tokens)
print X_train.shape

(3086, 68571)


In [80]:
X_test = TF.transform(test_tokens)
print X_test.shape

(3724, 68571)


In [None]:
########################
#
# Finish collecting data
#
########################

In [116]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
RF.fit(X_train, y_train)
scores = cross_val_score(RF, X_train, y_train, cv=4)
print "Features: " + str(RF.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

Features: 68571	Accuracy: 0.85029 (+/- 0.00752)


In [119]:
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(RF, prefit=True)
X_train_new = model.transform(X_train)
X_test_new = model.transform(X_test)

In [120]:
RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
RF.fit(X_train_new, y_train)
scores = cross_val_score(RF, X_train_new, y_train, cv=4)
print "Features: " + str(RF.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

Features: 3994	Accuracy: 0.85483 (+/- 0.00820)


In [114]:
while X_train_new.shape[1] > 1000:
    model = SelectFromModel(RF, prefit=True)
    X_train_new = model.transform(X_train_new)
    X_test_new = model.transform(X_test_new)
    
    RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
    RF.fit(X_train_new, y_train)
    scores = cross_val_score(RF, X_train_new, y_train, cv=4)
    print "Features: " + str(RF.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

Features: 662	Accuracy: 0.85418 (+/- 0.01714)


In [118]:
preds_RF = RF.predict(X_test_new)
results_RF = np.argmax(preds_RF, axis=1)
util.write_predictions(results_RF, ids, "tdidf_tokens_rf.csv")
reorder_submission("tdidf_tokens_rf.csv", "tdidf_tokens_rf_results.csv")

Reordered tdidf_tokens_rf.csv and wrote to tdidf_tokens_rf_results.csv


In [122]:
# Use 3994 features
model_DL = train_model_DL(X_train_new,y_train,[2000, 2000, 2000])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 2000)              7990000   
_________________________________________________________________
dense_7 (Dense)              (None, 2000)              4002000   
_________________________________________________________________
dense_8 (Dense)              (None, 2000)              4002000   
_________________________________________________________________
dense_9 (Dense)              (None, 15)                30015     
Total params: 16,024,015
Trainable params: 16,024,015
Non-trainable params: 0
_________________________________________________________________
Train on 2468 samples, validate on 618 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epo

Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 1

Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

In [None]:
preds_NN = model_DL.predict(X_test_new)
results_NN = np.argmax(preds_NN, axis=1)
util.write_predictions(results_NN, ids, "tdidf_tokens_nn.csv")
reorder_submission("tdidf_tokens_nn.csv", "tdidf_tokens_nn_results.csv")

In [None]:
sparse.save_npz("tfidf_train_tokens.npz", X_train_new)
sparse.save_npz("tfidf_test_tokens.npz", X_test_new)