In [2]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
def extract_feats(ffs, direc="train", global_feat_dict=None):
    """
    arguments:
      ffs are a list of feature-functions.
      direc is a directory containing xml files (expected to be train or test).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that
      the columns of the test matrix align correctly.

    returns:
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target classes, and a list of system-call-history ids in order
      of their rows in the design matrix.

      Note: the vector of target classes returned will contain the true indices of the
      target classes on the training data, but will contain only -1's on the test
      data
    """
    fds = [] # list of feature dicts
    classes = []
    ids = []
    #x = os.listdir(direc)[1:]
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)
        rowfd = {}
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # accumulate features
        [rowfd.update(ff(tree)) for ff in ffs]
        fds.append(rowfd)

    X,feat_dict = make_design_mat(fds,global_feat_dict)
    return X, feat_dict, np.array(classes), ids


def make_design_mat(fds, global_feat_dict=None):
    """
    arguments:
      fds is a list of feature dicts (one for each row).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that
      the columns of the test matrix align correctly.

    returns:
        a sparse NxD design matrix, where N == len(fds) and D is the number of
        the union of features defined in any of the fds
    """
    if global_feat_dict is None:
        all_feats = set()
        [all_feats.update(list(fd.keys())) for fd in fds]
        feat_dict = dict([(feat, i) for i, feat in enumerate(sorted(all_feats))])
    else:
        feat_dict = global_feat_dict

    cols = []
    rows = []
    data = []
    for i in range(len(fds)):
        temp_cols = []
        temp_data = []
        for feat,val in list(fds[i].items()):
            try:
                # update temp_cols iff update temp_data
                temp_cols.append(feat_dict[feat])
                temp_data.append(val)
            except KeyError as ex:
                if global_feat_dict is not None:
                    pass  # new feature in test data; nbd
                else:
                    raise ex

        # all fd's features in the same row
        k = len(temp_cols)
        cols.extend(temp_cols)
        data.extend(temp_data)
        rows.extend([i]*k)

    assert len(cols) == len(rows) and len(rows) == len(data)


    X = sparse.csr_matrix((np.array(data),
                   (np.array(rows), np.array(cols))),
                   shape=(len(fds), len(feat_dict)))
    return X, feat_dict

In [8]:
def count_all_reasons(tree):
    c = Counter()
    for el in tree.iter():
        if el.tag == "process":
            c["term" + el.attrib["terminationreason"]] += 1
            c["start" + el.attrib["startreason"]] += 1
            c[el.attrib["executionstatus"]] += 1
    return c

def count_all_flags(tree):
    c = Counter()
    for el in tree.iter():
        if el.attrib.get("flags") == None:
            continue
        else:
            c[el.attrib["flags"]] += 1
    return c

def first_last_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made.
      (in other words, it returns a dictionary indicating what the first and
      last system calls made by an executable were.)
    """
    c = Counter()
    in_all_section = False
    first = True # is this the first system call
    last_call = None # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            if first:
                c["first_call-"+el.tag] = 1
                first = False
            last_call = el.tag  # update last call seen

    # finally, mark last call seen
    c["last_call-"+last_call] = 1
    return c

def system_call_count_feats(tree):
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c[el.tag] += 1
    return c

In [9]:
ffs = [first_last_system_call_feats, system_call_count_feats, count_all_reasons, count_all_flags]
X_train, global_feat_dict, t_train, train_ids = extract_feats(ffs, "train")

In [10]:
params = {'max_depth': [5, 10, 25, None],
          'min_samples_split': [2, 5, 10, 15, 20],
          'min_samples_leaf': [1, 5, 10, 15, 20]}
          #'max_features': ['auto', None]}
rfc = RandomForestClassifier(n_estimators=500, random_state=50)
gs = GridSearchCV(rfc, params, n_jobs=-1, scoring='accuracy', cv=4, verbose=3)
gs.fit(X_train, t_train)

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  7.1min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=50, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [5, 10, 25, None], 'min_samples_split': [2, 5, 10, 15, 20], 'min_samples_leaf': [1, 5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [11]:
gs.best_params_, gs.best_score_

({'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2},
 0.8956578094620868)

In [12]:
params2 = {'max_depth': [15, 25, 45, 65, None],
           'max_features': ['auto', None]}
gs2 = GridSearchCV(rfc, params2, n_jobs=-1, scoring='accuracy', cv=4, verbose=3)
gs2.fit(X_train, t_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  4.6min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=50, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [15, 25, 45, 65, None], 'max_features': ['auto', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [14]:
gs2.best_params_, gs2.best_score_

({'max_depth': 45, 'max_features': 'auto'}, 0.8924173687621516)

In [15]:
params3 = {'max_depth': [35, 45, 55]}
gs3 = GridSearchCV(rfc, params3, n_jobs=-1, scoring='accuracy', cv=4, verbose=3)
gs3.fit(X_train, t_train)
gs3.best_params_, gs3.best_score_

Fitting 4 folds for each of 3 candidates, totalling 12 fits
[CV] max_depth=35 ....................................................
[CV] max_depth=35 ....................................................
[CV] max_depth=35 ....................................................
[CV] max_depth=35 ....................................................
[CV] ........................... max_depth=35, score=0.887306 -   7.8s
[CV] max_depth=45 ....................................................
[CV] ........................... max_depth=35, score=0.880463 -   7.9s
[CV] max_depth=45 ....................................................
[CV] ........................... max_depth=35, score=0.898570 -   8.2s
[CV] max_depth=45 ....................................................
[CV] ........................... max_depth=35, score=0.894394 -   8.3s
[CV] max_depth=45 ....................................................
[CV] ........................... max_depth=45, score=0.887306 -   7.2s
[CV] max_depth=55

[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:   22.7s remaining:    4.5s


[CV] ........................... max_depth=55, score=0.899870 -   7.2s
[CV] ........................... max_depth=55, score=0.894394 -   7.1s


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   23.3s finished


({'max_depth': 35}, 0.890149060272197)