In [3]:
import os
import numpy as np
from scipy import sparse
from collections import Counter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

import util



In [30]:
from keras.layers import Dense # For dense layers
from keras.models import Sequential # For sequential layering
from keras.callbacks import EarlyStopping # For stopping execution
from sklearn.metrics import mean_squared_error

def train_model_DL(X_train,Y_train,n_nodes):
    """ n_nodes is 1-D numpy array with number of nodes on each layer
        e.g. [10,20,30] is a model with 3 (hidden) layers,
        with 10/20/30 nodes on the first/second/third layers
        Returns trained DL model """
    input_shape = (X_train.shape[1],) # Shape of input data
    # Initialize model
    model_DL = Sequential()
    for i in range(len(n_nodes)):
        if i == 0:
            # First layer
            model_DL.add(Dense(n_nodes[i],activation='relu',input_shape=input_shape))
        else:
            # Subsequent layers
            model_DL.add(Dense(n_nodes[i],activation='relu'))
    # Output layer
    model_DL.add(Dense(15, activation='softmax'))
    # Compile model
    model_DL.compile(optimizer='adam',loss='mean_squared_error')
    # Print model summary
    model_DL.summary()
    # Early stopping monitor w/ patience=3 (stop after 3 runs without improvements)
    early_stopping_monitor = EarlyStopping(patience=30)
    # Fit model using 20% of data for validation
    model_DL.fit(X_train, Y_train, validation_split=0.2, epochs=200, callbacks=[early_stopping_monitor])
    Y_train_DLpred = model_DL.predict(X_train)
    mse_DL = mean_squared_error(Y_train, Y_train_DLpred)
    print('DONE. Mean Squared Error: ', mse_DL)
    return model_DL

In [3]:
def get_tags(direc, useFirstThreadOnly = True):
    tags_set = [] # list of feature dicts
    classes = []
    ids = [] 
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        in_all_section = False
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # accumulate features
        tags = ""
        for el in tree.iter():
            # ignore everything outside the "all_section" element
            if el.tag == "all_section" and not in_all_section:
                in_all_section = True
            elif el.tag == "all_section" and in_all_section:
                in_all_section = False
                if useFirstThreadOnly is True:
                    break
            elif in_all_section:
                tags += " "
                tags += el.tag
                
        tags_set.append(tags.replace('_', ''))
    
    return tags_set, np.array(classes), ids

In [10]:
tree = ET.parse("exp/eb93b1ae3c2e3acdc2bfb8f57696454123dbeafac.Virut.xml")
for el in tree.iter():
    print el.tag
    for k, v in el.attrib.iteritems():
        print "\t"+ k + ": " + v

processes
process
	username: Administrator
	index: 1
	sha1: eb93b1ae32e3acdc2bfb8f57696454123dbeafac
	terminationtime: 00:04.187
	filename_hash: hash_error
	pid: 2432
	startreason: AnalysisTarget
	filename: c:\9f21291e2734c13be0859d1948968909.EX
	filesize: 77312
	starttime: 00:01.328
	terminationreason: NormalTermination
	executionstatus: OK
	applicationtype: Win32Application
	parentindex: 0
	md5: 9f21291e2734c13be0859d1948968909
thread
	tid: 2436
all_section
load_image
	successful: 1
	end_address: $1015000
	filename_hash: hash_error
	filename: c:\9f21291e2734c13be0859d1948968909.EX
	address: $1000000
	size: 86016
load_dll
	successful: 1
	end_address: $7C9C9000
	filename_hash: e753d19a2e3b98b2b3b8f02f276092096d10f22d
	filename: C:\WINDOWS\system32\ntdll.dll
	address: $7C910000
	size: 757760
load_dll
	successful: 1
	end_address: $7C908000
	filename_hash: c88d57cc99f75cd928b47b6e444231f26670138f
	filename: C:\WINDOWS\system32\kernel32.dll
	address: $7C800000
	size: 1081344
load_dll
	succ

In [4]:
train_tags, train_classes, _ = get_tags('../train_origin')

In [5]:
test_tags, _, ids = get_tags('../test_origin')

In [6]:
y_train = np.zeros((len(train_classes),len(util.malware_classes)))
y_train[np.arange(len(train_classes)), train_classes] = 1
y_train.shape

(3086, 15)

In [7]:
TF = TfidfVectorizer(analyzer = 'word', max_df = 0.9)

In [8]:
X_train = TF.fit_transform(train_tags)

In [9]:
X_train.shape

(3086, 88)

In [10]:
print len(TF.stop_words_)
TF.stop_words_

5


{u'getsystemdirectory', u'loaddll', u'loadimage', u'openkey', u'queryvalue'}

In [11]:
sparse.save_npz("tfidf_train_tag.npz", X_train)

In [12]:
# X = sparse.load_npz("tfidf_train.npz")
# X.shape

In [13]:
len(TF.stop_words_)

5

In [14]:
X_test = TF.transform(test_tags)
X_test.shape

(3724, 88)

In [15]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
RF.fit(X_train, y_train)
print RF.n_features_
print RF.feature_importances_
print RF.score(X_train, y_train)

88
[1.30700209e-03 1.16753866e-03 1.94854565e-02 1.68050440e-02
 5.09195092e-06 1.99139665e-02 2.14366417e-03 1.22265887e-03
 6.58201527e-04 6.80766948e-03 1.72711622e-02 1.84561863e-02
 8.47243403e-05 6.30777594e-03 5.90563055e-05 1.93416713e-02
 1.23057405e-02 2.42862286e-02 1.39229339e-05 1.94541226e-03
 2.21544723e-03 1.70128308e-02 4.87466695e-02 2.85639280e-02
 9.67714433e-03 3.99651343e-03 1.05462630e-04 3.23464126e-03
 3.15756050e-02 4.30424445e-04 1.32635920e-03 1.38638144e-05
 7.89779131e-05 4.99544656e-03 3.67282157e-03 4.74345974e-03
 3.22886383e-05 5.52612812e-04 1.00919534e-04 4.93466790e-05
 9.12519895e-03 4.93960048e-02 2.80510900e-05 3.20180447e-02
 4.30915299e-02 4.24321273e-03 2.94854100e-02 0.00000000e+00
 2.42046741e-03 5.27373789e-03 0.00000000e+00 2.40685344e-02
 2.43194003e-02 2.82306311e-03 2.12776501e-02 2.91238112e-05
 0.00000000e+00 1.56297319e-03 3.46906926e-02 4.59191384e-03
 8.63818576e-03 3.38247331e-03 2.06597104e-03 1.56488755e-03
 3.54840942e-02 5.954

In [16]:
from sklearn.linear_model import SGDClassifier
SGD = SGDClassifier(n_jobs = -1)
SGD.fit(X_train, train_classes)
print SGD.score(X_train, train_classes)



0.8425145819831497


In [31]:
model_DL = train_model_DL(X_train,y_train,[200, 200])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 200)               17800     
_________________________________________________________________
dense_14 (Dense)             (None, 200)               40200     
_________________________________________________________________
dense_15 (Dense)             (None, 15)                3015      
Total params: 61,015
Trainable params: 61,015
Non-trainable params: 0
_________________________________________________________________
Train on 2468 samples, validate on 618 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200


In [34]:
print "making predictions..."
preds1 = RF.predict(X_test)
results1 = np.argmax(preds1, axis=1)

results2 = SGD.predict(X_test)

preds3 = model_DL.predict(X_test)
results3 = np.argmax(preds3, axis=1)

making predictions...


In [35]:
print "writing predictions..."
outputfile1 = "tfidf_tags_predictions1.csv"
outputfile2 = "tfidf_tags_predictions2.csv"
outputfile3 = "tfidf_tags_predictions3.csv"
util.write_predictions(results1, ids, outputfile1)
util.write_predictions(results2, ids, outputfile2)
util.write_predictions(results3, ids, outputfile3)

writing predictions...


In [36]:
import csv 
import sys 

def reorder_submission(file_to_reorder, newfile_name = "experiment_results.csv"):
    # READ IN KEYS IN CORRECT ORDER AS LIST
    with open('keys.csv','r') as f:
        keyreader = csv.reader(f)
        keys = [key[0] for key in keyreader]

    # READ IN ALL PREDICTIONS, REGARDLESS OF ORDER
    with open(file_to_reorder) as f:
        oldfile_reader = csv.reader(f)
        D = {}
        for i,row in enumerate(oldfile_reader):
            if i == 0:
                continue
            _id, pred = row 
            D[_id] = pred

    # WRITE PREDICTIONS IN NEW ORDER
    with open(newfile_name,'w') as f:
        writer = csv.writer(f)
        writer.writerow(('Id','Prediction'))
        for key in keys:
            writer.writerow((key,D[key]))

    print("".join(["Reordered ", file_to_reorder," and wrote to ", newfile_name]))

In [37]:
reorder_submission(outputfile1, "tfidf_tags_rf_results.csv")
reorder_submission(outputfile2, "tfidf_tags_sgd_results.csv")
reorder_submission(outputfile3, "tfidf_tags_nn_results.csv")

Reordered tfidf_tags_predictions1.csv and wrote to tfidf_tags_rf_results.csv
Reordered tfidf_tags_predictions2.csv and wrote to tfidf_tags_sgd_results.csv
Reordered tfidf_tags_predictions3.csv and wrote to tfidf_tags_nn_results.csv
