In [1]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
from sklearn.cross_validation import cross_val_score

import util



In [2]:
import ntpath
def path_leaf(path):
    """Return the last component of ``path``.

    ``ntpath`` is used so that both backslash- and slash-separated paths are
    split. A path that ends in a separator yields the component just before
    that separator instead of an empty string.
    """
    parent, leaf = ntpath.split(path)
    if leaf:
        return leaf
    # Trailing separator: the leaf is the last component of the parent part.
    return ntpath.basename(parent)

In [3]:
from urlparse import urlparse
def url_domain(url):
    """Return the network-location part (``host[:port]``) of ``url``.

    The netloc is returned as parsed instead of being pushed through
    ``'{...}'.format``: under Python 2, formatting a unicode netloc into a
    byte-string template forces an ASCII encode and raises
    UnicodeEncodeError for non-ASCII host names.

    Returns an empty string when ``url`` has no ``//netloc`` part
    (e.g. ``"example.com/path"``).
    """
    return urlparse(url).netloc

In [4]:
def extract_feats(ffs, direc="train", global_feat_dict=None):
    """
    Parse every file in ``direc`` and build a design matrix from it.

    arguments:
      ffs are a list of feature-functions; each is called with the parsed
      xml tree and must return a dict of feature-name -> value.
      direc is a directory containing xml files (expected to be train or test).
      File names must look like "<id>.<class>.<...>"; a name with fewer than
      two dot-separated parts raises ValueError.
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that
      the columns of the test matrix align correctly.

    returns:
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target classes, and a list of system-call-history ids in order
      of their rows in the design matrix.

      Note: the vector of target classes returned will contain the true indices of the
      target classes on the training data, but will contain only -1's on the test
      data

    Files are processed in sorted name order so that the row order (and the
    matching ids list) is reproducible; os.listdir alone returns entries in
    arbitrary order.
    """
    fds = []  # list of feature dicts, one per file
    classes = []
    ids = []
    for datafile in sorted(os.listdir(direc)):
        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X", \
                "unexpected class label %r in file name %r" % (clazz, datafile)
            classes.append(-1)
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc, datafile))
        # merge the output of every feature function into one row dict;
        # later functions overwrite earlier ones on a name clash
        rowfd = {}
        for ff in ffs:
            rowfd.update(ff(tree))
        fds.append(rowfd)

    X, feat_dict = make_design_mat(fds, global_feat_dict)
    return X, feat_dict, np.array(classes), ids

In [5]:
def make_design_mat(fds, global_feat_dict=None):
    """
    Turn a list of feature dicts into a sparse design matrix.

    arguments:
      fds is a list of feature dicts (one for each row).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that
      the columns of the test matrix align correctly.

    returns:
        a sparse NxD design matrix, where N == len(fds) and D is the number of
        the union of features defined in any of the fds (or len(global_feat_dict)
        when one is supplied), together with the feature-name -> column dict
        that was used.

    Features that are absent from a supplied global_feat_dict are dropped
    silently (a feature seen only in test data has no column to go to).
    """
    if global_feat_dict is None:
        # Columns are assigned in sorted feature-name order so the layout is
        # deterministic for a given set of features.
        all_feats = set()
        for fd in fds:
            all_feats.update(fd)
        feat_dict = dict((feat, i) for i, feat in enumerate(sorted(all_feats)))
    else:
        feat_dict = global_feat_dict

    rows = []
    cols = []
    data = []
    for i, fd in enumerate(fds):
        for feat, val in fd.items():
            # Without a global dict every feature is in feat_dict by
            # construction, so this test only ever skips unseen test features.
            if feat in feat_dict:
                rows.append(i)
                cols.append(feat_dict[feat])
                data.append(val)

    assert len(cols) == len(rows) and len(rows) == len(data)

    # Explicit integer dtype keeps the index arrays valid even when empty
    # (np.array([]) would otherwise default to float64).
    X = sparse.csr_matrix((np.array(data),
                           (np.array(rows, dtype=int), np.array(cols, dtype=int))),
                          shape=(len(fds), len(feat_dict)))
    return X, feat_dict

In [6]:
## Here are two example feature-functions. Each takes an xml.etree.ElementTree object
# (i.e., the result of parsing an xml file) and returns a dictionary mapping
# feature-names to numeric values.
## TODO: modify these functions, and/or add new ones.
def first_last_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made.
      (in other words, it returns a dictionary indicating what the first and
      last system calls made by an executable were.)

      If no element is seen inside an "all_section", the dictionary is empty
      (previously this case raised TypeError while building the last_call key).
    """
    c = Counter()
    in_all_section = False
    first_call = None  # tag of the first call seen, None until one is found
    last_call = None   # tag of the most recent call seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        # NOTE(review): iter() visits each element once, so every "all_section"
        # element flips the flag; a second section therefore switches it off
        # again. Confirm this is intended for traces with several sections.
        if el.tag == "all_section":
            in_all_section = not in_all_section
        elif in_all_section:
            if first_call is None:
                first_call = el.tag
            last_call = el.tag

    # only emit features when at least one call was actually observed
    if first_call is not None:
        c["first_call-" + first_call] = 1
        c["last_call-" + last_call] = 1
    return c

In [7]:
def system_call_count_feats(tree):
    """
    Count the elements visited while the "all_section" flag is set.

    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a Counter mapping 'num_system_calls' to that count; the key is absent
      when nothing was counted.
    """
    n_calls = 0
    inside = False
    for node in tree.iter():
        if node.tag == "all_section":
            # every "all_section" element flips the flag
            inside = not inside
            continue
        if inside:
            n_calls += 1

    counts = Counter()
    if n_calls:
        counts['num_system_calls'] = n_calls
    return counts

In [8]:
def system_call_4_grams_feats(tree):
    """
    Relative frequencies of 4-grams of consecutive system-call tags.

    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a Counter mapping 'w-x-y-z' to (occurrences of that 4-gram) divided by
      the total number of calls counted, plus 'num_system_calls' mapped to
      that total. The 4-gram window restarts at every "all_section" element.

    Fix: the previous flag-based version never reached its "third call"
    state, so the first grams of each section were built with empty
    components (e.g. 'a-b--c'). A sliding window of the last four tags
    replaces the flags; a gram is emitted only once four real tags are held.
    """
    c = Counter()
    in_all_section = False
    window = []  # up to four most recent call tags in the current section
    call_counter = 0
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        # NOTE(review): every "all_section" element flips the flag, so a
        # second section switches it off again -- confirm this is intended.
        if el.tag == "all_section":
            in_all_section = not in_all_section
            window = []
        elif in_all_section:
            call_counter += 1
            window.append(el.tag)
            if len(window) == 4:
                c['-'.join(window)] += 1
                window.pop(0)  # slide by one call

    # normalise counts to frequencies; skipped entirely when no gram exists,
    # so a zero call_counter never divides
    total = float(call_counter)
    for k, v in list(c.items()):
        c[k] = v / total
    c['num_system_calls'] = call_counter
    return c

In [9]:
def system_call_2_grams_feats(tree):
    """
    Relative frequencies of pairs of consecutive system calls.

    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a Counter mapping 'x-y' to (occurrences of that pair) divided by the
      total number of calls counted, plus 'num_system_calls' mapped to that
      total. A "load_dll" element carrying a "filename_hash" attribute is
      represented by that hash instead of its tag. Pairs never span an
      "all_section" element.
    """
    grams = Counter()
    inside = False
    previous = None  # token of the preceding call in the current section
    n_calls = 0
    for node in tree.iter():
        if node.tag == "all_section":
            # every "all_section" element flips the flag and restarts pairing
            inside = not inside
            previous = None
            continue
        if not inside:
            continue

        n_calls += 1
        if node.tag == "load_dll" and "filename_hash" in node.attrib:
            token = node.attrib["filename_hash"]
        else:
            token = node.tag
        if previous is not None:
            grams[previous + '-' + token] += 1
        previous = token

    # turn raw pair counts into frequencies; no pairs means no division
    total = float(n_calls)
    for key in list(grams):
        grams[key] = grams[key] / total
    grams['num_system_calls'] = n_calls
    return grams

In [10]:
def suspicicous_key_words(tree):
    """
    Count occurrences of selected attribute values per system-call tag.

    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a Counter mapping '<tag>-<value>' to the number of times that value was
      seen, for the tag/attribute pairs listed in the rule table below.
      File-path attributes are reduced to their last path component and URLs
      to their network location; all other values are used verbatim.
      Elements are only considered while the "all_section" flag is set.
    """
    LEAF, RAW, DOMAIN = "leaf", "raw", "domain"
    file_rule = (("srcfile", LEAF),)
    # tag -> ordered (attribute name, how to normalise the value) pairs
    rules = {
        "create_file": file_rule,
        "create_open_file": file_rule,
        "create_directory": file_rule,
        "delete_file": file_rule,
        "create_window": (("classname", RAW),),
        "create_process": (("filename", LEAF), ("commandline", RAW)),
        "create_service": (("servicename", RAW), ("filename", LEAF)),
        "create_socket": (("socket", RAW),),
        "create_key": (("key", RAW),),
        "open_url": (("url", DOMAIN),),
    }

    counts = Counter()
    inside = False
    for node in tree.iter():
        if node.tag == "all_section":
            # every "all_section" element flips the flag
            inside = not inside
            continue
        if not inside:
            continue
        for attr_name, kind in rules.get(node.tag, ()):
            if attr_name not in node.attrib:
                continue
            value = node.attrib[attr_name]
            if kind == LEAF:
                value = path_leaf(value)
            elif kind == DOMAIN:
                value = url_domain(value)
            counts[node.tag + '-' + value] += 1

    return counts

In [11]:
#############################
#
#  Main script: the cells below run the full pipeline
#
#############################

In [12]:
# Directories whose files extract_feats parses as xml (one sample per file),
# and the csv file that util.write_predictions is given further down.
train_dir = "../train_origin"
test_dir = "../test_origin"
outputfile = "experiment_predictions.csv"  # feel free to change this or take it as an argument

In [13]:
# Feature functions applied to every parsed sample; extract_feats merges the
# dicts they return into one row. Only suspicicous_key_words is enabled here.
# TODO put the names of any other feature functions defined above in this list
ffs = [suspicicous_key_words]

In [14]:
# Build the training design matrix. global_feat_dict records the column layout
# and is passed back in when the test features are extracted.
print("extracting training features...")
X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, direc=train_dir)

extracting training features...


In [27]:
# Build the test design matrix on the training column layout; the returned
# class vector holds only -1 placeholders for test data and is not used.
print("extracting test features...")
X_test, _, t_ignore, test_ids = extract_feats(
    ffs, direc=test_dir, global_feat_dict=global_feat_dict)

extracting test features...


In [15]:
# One-hot encode the integer class labels: row i of the identity matrix
# selected by t_train[i] has its single 1.0 in column t_train[i].
y_train = np.eye(len(util.malware_classes))[t_train]
y_train.shape

(3086, 15)

In [30]:
from sklearn.ensemble import RandomForestClassifier
# Fit a forest on all extracted features against the one-hot label matrix.
# random_state is fixed so that the fit -- and the importance-based feature
# selection that follows from it -- is reproducible from run to run.
RF = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
RF.fit(X_train, y_train)
print(RF.n_features_)
# Scored on the data the model was fitted on: an optimistic figure, not a
# validation estimate.
print(RF.score(X_train, y_train))

16636
0.968567725211


In [31]:
from sklearn.feature_selection import SelectFromModel
# First pruning round: prefit=True makes the selector reuse the forest fitted
# above (no refit) and keep the columns it judges important enough
# (default threshold -- per the scikit-learn docs, the mean importance).
model = SelectFromModel(RF, prefit=True)
# Apply the identical column selection to train and test so they stay aligned.
X_train_new = model.transform(X_train)
X_test_new = model.transform(X_test)
X_train_new.shape

(3086, 1519)

In [32]:
# Refit the same forest object on the reduced matrix, then report the number
# of input features and the score on the training data itself.
RF.fit(X_train_new, y_train)
print(RF.n_features_)
print(RF.score(X_train_new, y_train))

1519
0.968243681141


In [33]:
# Second pruning round, driven by the forest that was just refit on the
# already-reduced matrix.
# NOTE(review): X_train_new / X_test_new are overwritten with their own
# transform, so this cell is not idempotent -- re-running it on its own feeds
# already-pruned matrices back into the selector.
model = SelectFromModel(RF, prefit=True)
X_train_new = model.transform(X_train_new)
X_test_new = model.transform(X_test_new)
X_train_new.shape

(3086, 185)

In [34]:
# Refit the same forest object on the further reduced matrix, then report the
# number of input features and the score on the training data itself.
RF.fit(X_train_new, y_train)
print(RF.n_features_)
print(RF.score(X_train_new, y_train))

185
0.890797148412


In [35]:
# Third pruning round, again using the most recently refit forest.
# NOTE(review): as in the previous round the matrices are overwritten with
# their own transform, so this cell must be run exactly once, in order.
model = SelectFromModel(RF, prefit=True)
X_train_new = model.transform(X_train_new)
X_test_new = model.transform(X_test_new)
X_train_new.shape

(3086, 31)

In [36]:
# Final refit on the matrix left after the last pruning round; this is the
# model used for the test predictions. The score is on the training data.
RF.fit(X_train_new, y_train)
print(RF.n_features_)
print(RF.score(X_train_new, y_train))

31
0.857420609203


In [37]:
# Predict with the last refit forest; X_test_new has been through the same
# sequence of column selections as the training matrix.
preds = RF.predict(X_test_new)

In [38]:
# Inspect the first prediction row: an indicator vector laid out like the
# columns of the one-hot y_train.
preds[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.])

In [39]:
# Collapse every indicator row to a single class index and write the result.
# NOTE(review): np.argmax returns 0 for a row with no positive entry, so such
# samples are silently assigned class index 0 -- confirm that is acceptable.
print("making predictions...")
results = np.argmax(preds, axis=1)

print("writing predictions...")
util.write_predictions(results, test_ids, outputfile)

making predictions...
writing predictions...


In [40]:
import csv 
import sys 

def reorder_submission(file_to_reorder, newfile_name = "experiment_results.csv",
                       keys_file = "keys.csv"):
    """Rewrite a predictions csv so its rows follow the id order of a keys file.

    arguments:
      file_to_reorder is a csv with a header row followed by (id, prediction)
      rows in any order.
      newfile_name is the csv to write: an ('Id', 'Prediction') header, then
      one row per key in key order.
      keys_file is a csv whose first column lists the ids in the required
      order (defaults to 'keys.csv', the path that used to be hard-coded).

    raises:
      KeyError if an id from keys_file has no prediction. This is checked
      before the output file is opened, so a failed call no longer leaves a
      truncated file behind.

    Blank rows in either input are skipped instead of crashing the parse.
    """
    # READ IN KEYS IN CORRECT ORDER AS LIST
    with open(keys_file, 'r') as f:
        keys = [row[0] for row in csv.reader(f) if row]

    # READ IN ALL PREDICTIONS, REGARDLESS OF ORDER
    predictions = {}
    with open(file_to_reorder) as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (tolerates an empty file)
        for row in reader:
            if not row:
                continue
            _id, pred = row
            predictions[_id] = pred

    missing = [key for key in keys if key not in predictions]
    if missing:
        raise KeyError("no prediction for %d id(s) listed in %s, e.g. %r"
                       % (len(missing), keys_file, missing[0]))

    # WRITE PREDICTIONS IN NEW ORDER
    with open(newfile_name, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('Id', 'Prediction'))
        for key in keys:
            writer.writerow((key, predictions[key]))

    print("".join(["Reordered ", file_to_reorder, " and wrote to ", newfile_name]))

In [41]:
reorder_submission(outputfile, "suspicious_rf_results4.csv")

Reordered experiment_predictions.csv and wrote to suspicious_rf_results4.csv
