In [113]:
import os
import csv 
import sys 
import numpy as np
from scipy import sparse
from collections import Counter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

import util

In [114]:
def extract_feats(ffs, direc="train", global_feat_dict=None):
    """
    Parse every XML file in a directory and build a sparse design matrix.

    arguments:
      ffs is a list of feature-functions; each is called with the parsed
      ElementTree of one file and must return a dict of feature -> value.
      direc is a directory containing xml files (expected to be train or test).
      File names must start with "<id>.<class>."; the class is looked up in
      util.malware_classes, and "X" marks an unlabeled (test) file.
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that
      the columns of the test matrix align correctly.

    returns:
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target classes, and a list of system-call-history ids in order
      of their rows in the design matrix.

      Note: the vector of target classes contains the index of each file's class
      in util.malware_classes, or -1 for files labeled "X".

    raises:
      AssertionError if a file's class is neither in util.malware_classes nor "X".
    """
    fds = []  # one merged feature dict per file
    classes = []
    ids = []
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # (and therefore any saved matrix / id list) reproducible across runs.
    for datafile in sorted(os.listdir(direc)):
        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X", "unexpected class label %r in %r" % (clazz, datafile)
            classes.append(-1)
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc, datafile))
        # accumulate features; later feature-functions overwrite earlier ones
        # when they emit the same feature name
        rowfd = {}
        for ff in ffs:
            rowfd.update(ff(tree))
        fds.append(rowfd)

    X, feat_dict = make_design_mat(fds, global_feat_dict)
    return X, feat_dict, np.array(classes), ids

In [115]:
def make_design_mat(fds, global_feat_dict=None):
    """
    Turn a list of feature dicts into a sparse design matrix.

    arguments:
      fds is a list of feature dicts (one for each row).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that
      the columns of the test matrix align correctly. Features that are missing
      from global_feat_dict are silently dropped.

    returns:
        a sparse NxD CSR design matrix, where N == len(fds) and D is the number
        of entries in the feature -> column mapping, together with that mapping.
        When global_feat_dict is None the mapping is built from the union of all
        features in fds, with columns assigned in sorted feature order; otherwise
        global_feat_dict itself is returned.
    """
    if global_feat_dict is None:
        all_feats = set()
        for fd in fds:
            all_feats.update(fd)
        feat_dict = {feat: col for col, feat in enumerate(sorted(all_feats))}
    else:
        feat_dict = global_feat_dict

    # Parallel coordinate lists: data[j] belongs at (rows[j], cols[j]).
    rows = []
    cols = []
    data = []
    for row_idx, fd in enumerate(fds):
        # .items() (not iteritems) keeps this working on both Python 2 and 3
        for feat, val in fd.items():
            if feat not in feat_dict:
                # Only possible with a caller-supplied mapping: a feature seen
                # in the test data but not in training has no column.
                continue
            rows.append(row_idx)
            cols.append(feat_dict[feat])
            data.append(val)

    # Explicit integer dtype keeps the index arrays integral even when empty.
    X = sparse.csr_matrix((np.array(data),
                           (np.array(rows, dtype=int), np.array(cols, dtype=int))),
                          shape=(len(fds), len(feat_dict)))
    return X, feat_dict

In [116]:
def system_call_count_1_gram_feats(tree):
    """
    Count how often each element tag occurs in the document.

    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a Counter mapping every element tag found by tree.iter() to its number
      of occurrences, plus the key 'num_system_calls' holding the total number
      of elements visited
    """
    total_key = 'num_system_calls'
    tag_counts = Counter()
    for element in tree.iter():
        # one increment for the tag itself, one for the running total
        tag_counts.update((element.tag, total_key))
    return tag_counts

In [117]:
def system_call_4_gram_feats(tree, n=4):
    """
    Count n-grams of consecutive element tags in document order.

    arguments:
      tree is an xml.etree.ElementTree object
      n is the n-gram length (defaults to 4, the original behaviour)
    returns:
      a Counter mapping 'tag1-tag2-...-tagn' to the number of times that run
      of n consecutive tags occurs in tree.iter(); empty when the document
      has fewer than n elements
    raises:
      ValueError if n is smaller than 1
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    tags = [el.tag for el in tree.iter()]
    # Slide a window of width n over the tag sequence; the range is empty
    # when there are fewer than n tags.
    return Counter('-'.join(tags[start:start + n])
                   for start in range(len(tags) - n + 1))

In [118]:
def _strip_token_chars(value, chars):
    """Return value with every character listed in chars removed."""
    for ch in chars:
        value = value.replace(ch, '')
    return value


def tokens_feats(tree):
    """
    Count normalised attribute values of the elements inside "all_section".

    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a Counter mapping cleaned attribute values to their number of occurrences

    Only descendants of "all_section" elements are considered (assumes
    all_section elements are not nested -- nested sections would be counted
    once per enclosing section). Attributes whose name contains "hash", "id",
    "index", "size" or "time" are skipped. Of the rest:
      * names containing "file" contribute path_leaf(value),
      * names containing "url" contribute url_domain(value),
      * names containing "key", "name", "target", "command", "socket" or
        "value" contribute the raw value,
    each with '.', '$', '_' and '-' removed (spaces are also removed except in
    the url case).

    NOTE(review): path_leaf and url_domain are not defined in the visible part
    of this notebook; they must be in scope before attributes with "file" or
    "url" in their name are encountered.
    """
    skipped_words = ("hash", "id", "index", "size", "time")
    plain_words = ("key", "name", "target", "command", "socket", "value")

    c = Counter()
    # ignore everything outside the "all_section" elements
    for section in tree.iter("all_section"):
        for el in section.iter():
            if el is section:
                continue  # the section element itself is only a container
            for k, v in el.attrib.items():
                if any(word in k for word in skipped_words):
                    continue
                if "file" in k:
                    c[_strip_token_chars(path_leaf(v), '.$_- ')] += 1
                elif "url" in k:
                    c[_strip_token_chars(url_domain(v), '.$_-')] += 1
                elif any(word in k for word in plain_words):
                    c[_strip_token_chars(v, '.$_- ')] += 1

    return c

In [119]:
# Feature functions applied to each parsed XML file; extract_feats merges their
# returned dicts into a single feature dict per file.
ffs = [system_call_count_1_gram_feats, system_call_4_gram_feats]

In [120]:
# Relative paths of the directories holding the training and test XML files.
train_dir = "../train"
test_dir = "../test"

In [129]:
# Build the training design matrix and the feature -> column mapping; the
# mapping is reused below so the test matrix has identical columns.
# Uses train_dir rather than repeating the path literal.
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)

In [130]:
# Sanity check: (number of training files, number of distinct features).
X_train.shape

(3086, 29813)

In [132]:
# Persist the training matrix, class indices and ids to the working directory.
sparse.save_npz("X_train.npz", X_train)
np.save("t_train.npy", t_train)
np.save("train_ids.npy", train_ids)

In [133]:
# One-hot encode the class indices: row i is the t_train[i]-th row of the
# identity matrix, i.e. all zeros except a 1 in column t_train[i].
y_train = np.eye(len(util.malware_classes))[t_train]
y_train.shape

(3086, 15)

In [134]:
# Extract test features with the training feature -> column mapping so the test
# matrix columns line up with X_train; features unseen in training are dropped.
X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)

In [135]:
# Sanity check: the column count must match X_train's.
X_test.shape

(3724, 29813)

In [136]:
# Persist the test matrix and the matching ids to the working directory.
sparse.save_npz("X_test.npz", X_test)
np.save("test_ids.npy", test_ids)