In [1]:
import os
import numpy as np
from scipy import sparse
from collections import Counter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

import util



In [2]:
from keras.layers import Dense # For dense layers
from keras.models import Sequential # For sequential layering
from keras.callbacks import EarlyStopping # For stopping execution
from sklearn.metrics import mean_squared_error

def train_model_DL(X_train,Y_train,n_nodes):
    """Build, train and return a fully connected Keras classifier.

    Parameters
    ----------
    X_train : 2-D matrix exposing ``shape``
        Training features; ``X_train.shape[1]`` fixes the input width.
    Y_train : 2-D array exposing ``shape``
        One target row per sample (e.g. one-hot labels);
        ``Y_train.shape[1]`` fixes the width of the softmax output layer.
    n_nodes : sequence of int
        Number of nodes on each hidden layer, e.g. [10,20,30] is a model
        with 3 hidden layers of 10/20/30 nodes. May be empty, in which case
        the softmax layer is connected directly to the input.

    Returns
    -------
    The fitted ``Sequential`` model. As a side effect the model summary,
    the Keras training log and the training-set mean squared error are
    printed.
    """
    input_shape = (X_train.shape[1],) # Shape of input data
    # Size the output layer from the targets rather than hard-coding 15.
    n_outputs = Y_train.shape[1]
    # Hidden ReLU layers followed by the softmax output layer
    layer_specs = [(width, 'relu') for width in n_nodes]
    layer_specs.append((n_outputs, 'softmax'))
    # Initialize model
    model_DL = Sequential()
    for position, (width, activation) in enumerate(layer_specs):
        if position == 0:
            # Only the first layer declares the input shape
            model_DL.add(Dense(width,activation=activation,input_shape=input_shape))
        else:
            model_DL.add(Dense(width,activation=activation))
    # Compile model
    # NOTE(review): mean squared error on a softmax output is unusual for
    # classification; categorical cross-entropy is the conventional choice.
    # Left unchanged so training behaviour stays the same.
    model_DL.compile(optimizer='adam',loss='mean_squared_error')
    # Print model summary
    model_DL.summary()
    # Early stopping monitor w/ patience=30 (stop after 30 epochs without improvement)
    early_stopping_monitor = EarlyStopping(patience=30)
    # Fit model using 20% of data for validation
    model_DL.fit(X_train, Y_train, validation_split=0.2, epochs=200, callbacks=[early_stopping_monitor])
    # Report the error on the full training set (including the validation split)
    Y_train_DLpred = model_DL.predict(X_train)
    mse_DL = mean_squared_error(Y_train, Y_train_DLpred)
    print('DONE. Mean Squared Error: ', mse_DL)
    return model_DL

Using TensorFlow backend.


In [3]:
def get_tags(direc, useFirstThreadOnly = False):
    """Turn every XML file in ``direc`` into one string of tag bigrams.

    File names are expected to look like ``<id>.<class>.<...>``; the class
    part is looked up in ``util.malware_classes``.

    Parameters
    ----------
    direc : str
        Directory containing the XML files.
    useFirstThreadOnly : bool
        When exactly ``True``, stop reading a file at the second
        ``all_section`` element.

    Returns
    -------
    tags_set : list of str
        One string per file. Each consecutive pair of element tags seen
        while "inside" an ``all_section`` is concatenated into one token,
        every token is preceded by a single space, and underscores are
        removed.
    classes : numpy array of int
        Index of the file's class in ``util.malware_classes``, or -1 for
        files labelled "X".
    ids : list of str
        The id part of each file name, in the same order.
    """
    tags_set = [] # one bigram string per file
    classes = []
    ids = []
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        in_all_section = False
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # accumulate bigrams of consecutive tags
        bigrams = []
        preTag = ""
        # NOTE(review): iter() yields each element once, so the flag below is
        # toggled by successive all_section *elements* (on at the first, off
        # at the second, on again at a third, ...) rather than by the end of
        # a section -- confirm this is the intended selection.
        for el in tree.iter():
            if el.tag == "all_section":
                if in_all_section:
                    in_all_section = False
                    if useFirstThreadOnly is True:
                        break
                else:
                    in_all_section = True
            elif in_all_section:
                # compare by value; `is not ""` only worked through interning
                if preTag != "":
                    bigrams.append(preTag + el.tag)
                preTag = el.tag

        # every token keeps its leading space, matching the previous output
        tags = "".join(" " + bigram for bigram in bigrams)
        tags_set.append(tags.replace('_', ''))

    return tags_set, np.array(classes), ids

In [4]:
# Bigram documents and integer class labels for the training files
# (the returned ids are not needed for training).
train_tags, train_classes, _ = get_tags(direc='../train_origin')

In [5]:
# Bigram documents and file ids for the test files; their class labels are
# discarded.
test_tags, _, ids = get_tags(direc='../test_origin')

In [6]:
# One-hot encode the integer labels: row i of the identity matrix is the
# indicator vector for class i, so indexing it with the label array yields
# one indicator row per training sample.
y_train = np.eye(len(util.malware_classes))[train_classes]
y_train.shape

(3086, 15)

In [7]:
# Word-level TF-IDF over the bigram tokens; tokens that appear in more than
# 90% of the documents are dropped.
TF = TfidfVectorizer(
    analyzer='word',
    max_df=0.9,
)

In [8]:
# Learn the vocabulary and IDF weights from the training documents and
# vectorise them in one pass.
X_train = TF.fit_transform(train_tags)

In [9]:
# Sanity check: (number of training documents, number of TF-IDF features).
X_train.shape

(3086, 2171)

In [10]:
# Tokens the fitted vectoriser discarded (here presumably because of the
# max_df cut-off): show how many there are, then list them.
print(len(TF.stop_words_))
TF.stop_words_

12


{u'checkfordebuggerloaddll',
 u'createmutexcreatemutex',
 u'createmutexopenkey',
 u'loaddllgetsystemdirectory',
 u'loaddllloaddll',
 u'loadimageloaddll',
 u'openkeyopenkey',
 u'openkeyqueryvalue',
 u'queryvaluecreatemutex',
 u'queryvaluegetsystemdirectory',
 u'queryvalueopenkey',
 u'queryvaluequeryvalue'}

In [11]:
# Vectorise the test documents with the vectoriser already fitted on the
# training documents (transform only, no refit), then show the shape.
X_test = TF.transform(test_tags)
X_test.shape

(3724, 2171)

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest on the full TF-IDF matrix with one-hot targets.
RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
# This fitted instance is reused below by SelectFromModel(..., prefit=True).
RF.fit(X_train, y_train)
print(RF.n_features_)
# 5-fold cross-validated score; reported as mean +/- two standard deviations.
scores = cross_val_score(RF, X_train, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

2171
Accuracy: 0.84964 (+/- 0.02292)


In [13]:
from sklearn.feature_selection import SelectFromModel
# Keep only the features the already-fitted forest considers important.
model = SelectFromModel(RF, prefit=True)
X_train_new = model.transform(X_train)
# Apply the same column selection to the *test* matrix (this previously
# transformed X_train a second time by mistake).
X_test_new = model.transform(X_test)
X_train_new.shape

(3086, 363)

In [14]:
# Retrain the forest on the reduced feature set and cross-validate it the
# same way as the baseline so the two scores are comparable.
RF = RandomForestClassifier(n_estimators=100, n_jobs=-1).fit(X_train_new, y_train)
scores = cross_val_score(RF, X_train_new, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.85613 (+/- 0.02504)


In [15]:
# Repeatedly prune features with the current forest, refit on what is left
# and report the cross-validated score, until a single feature remains.
while X_train_new.shape[1] > 1:
    n_features_before = X_train_new.shape[1]
    model = SelectFromModel(RF, prefit=True)
    X_train_new = model.transform(X_train_new)
    X_test_new = model.transform(X_test_new)
    if X_train_new.shape[1] == n_features_before:
        # Selection kept every feature; another round would never terminate.
        break

    RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1, oob_score=True)
    RF.fit(X_train_new, y_train)
    scores = cross_val_score(RF, X_train_new, y_train, cv=5)
    print("n_features: " + str(X_train_new.shape[1]) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2)))

n_features: 92	Accuracy: 0.84640 (+/- 0.02420)
n_features: 25	Accuracy: 0.84478 (+/- 0.03832)
n_features: 14	Accuracy: 0.82015 (+/- 0.04977)
n_features: 5	Accuracy: 0.77738 (+/- 0.03376)
n_features: 3	Accuracy: 0.69151 (+/- 0.04030)
n_features: 2	Accuracy: 0.57939 (+/- 0.05197)
n_features: 1	Accuracy: 0.49222 (+/- 0.04618)


In [15]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with SGD on the full TF-IDF matrix, using the
# integer class labels directly.
SGD = SGDClassifier(n_jobs = -1)
SGD.fit(X_train, train_classes)
# Accuracy on the data it was trained on, not a held-out estimate.
print(SGD.score(X_train, train_classes))



0.861309138043


In [16]:
# Neural network with two hidden layers of 200 nodes each, trained on the
# full TF-IDF matrix and the one-hot labels.
model_DL = train_model_DL(X_train=X_train, Y_train=y_train, n_nodes=[200, 200])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               17800     
_________________________________________________________________
dense_2 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_3 (Dense)              (None, 15)                3015      
Total params: 61,015
Trainable params: 61,015
Non-trainable params: 0
_________________________________________________________________
Train on 2468 samples, validate on 618 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200


In [34]:
print("making predictions...")
# On a top-to-bottom run `RF` is the last forest from the feature-selection
# cells, fitted on a reduced matrix, so it cannot score the full-width
# X_test. Fit a dedicated forest on the full TF-IDF features instead.
RF_full = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
RF_full.fit(X_train, y_train)
preds1 = RF_full.predict(X_test)
# NOTE(review): with one-hot targets a row with no predicted label is all
# zeros and argmax maps it to class 0 -- confirm that is acceptable.
results1 = np.argmax(preds1, axis=1)

# SGD was trained on integer labels, so its predictions are class indices.
results2 = SGD.predict(X_test)

# Network outputs one score per class; take the highest-scoring class.
preds3 = model_DL.predict(X_test)
results3 = np.argmax(preds3, axis=1)

making predictions...


In [35]:
print("writing predictions...")
# One raw prediction file per model: 1 = random forest, 2 = SGD, 3 = network.
outputfile1 = "tfidf_tags_predictions1.csv"
outputfile2 = "tfidf_tags_predictions2.csv"
outputfile3 = "tfidf_tags_predictions3.csv"
util.write_predictions(results1, ids, outputfile1)
util.write_predictions(results2, ids, outputfile2)
util.write_predictions(results3, ids, outputfile3)

writing predictions...


In [36]:
import csv 
import sys 

def reorder_submission(file_to_reorder, newfile_name = "experiment_results.csv", keys_file = "keys.csv"):
    """Rewrite a predictions CSV so its rows follow the order of a key file.

    Parameters
    ----------
    file_to_reorder : str
        CSV with a header row followed by ``id,prediction`` rows in any order.
    newfile_name : str
        Path of the reordered CSV to write; it gets an ``Id,Prediction``
        header followed by one row per key.
    keys_file : str
        CSV whose first column lists the ids in the required order.

    Raises
    ------
    KeyError
        If an id from ``keys_file`` has no row in ``file_to_reorder``.
    """
    # READ IN KEYS IN CORRECT ORDER AS LIST (blank lines are ignored)
    with open(keys_file,'r') as f:
        keys = [row[0] for row in csv.reader(f) if row]

    # READ IN ALL PREDICTIONS, REGARDLESS OF ORDER
    with open(file_to_reorder) as f:
        oldfile_reader = csv.reader(f)
        next(oldfile_reader, None)  # skip the header row
        D = {}
        for row in oldfile_reader:
            if not row:
                # blank line, nothing to record
                continue
            _id, pred = row
            D[_id] = pred

    # WRITE PREDICTIONS IN NEW ORDER
    with open(newfile_name,'w') as f:
        writer = csv.writer(f)
        writer.writerow(('Id','Prediction'))
        for key in keys:
            writer.writerow((key,D[key]))

    print("".join(["Reordered ", file_to_reorder," and wrote to ", newfile_name]))

In [37]:
# Put each model's raw predictions into the id order required by keys.csv.
reorder_submission(outputfile1, newfile_name="tfidf_tags_rf_results.csv")
reorder_submission(outputfile2, newfile_name="tfidf_tags_sgd_results.csv")
reorder_submission(outputfile3, newfile_name="tfidf_tags_nn_results.csv")

Reordered tfidf_tags_predictions1.csv and wrote to tfidf_tags_rf_results.csv
Reordered tfidf_tags_predictions2.csv and wrote to tfidf_tags_sgd_results.csv
Reordered tfidf_tags_predictions3.csv and wrote to tfidf_tags_nn_results.csv


In [11]:
# Persist the sparse training TF-IDF matrix so it can be reloaded later.
sparse.save_npz(file="tfidf_train_tag.npz", matrix=X_train)

In [12]:
# X = sparse.load_npz("tfidf_train.npz")
# X.shape