In [13]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [10]:
def count_feats(ffs, direc="train"):
    for ff in ffs:
        values = set([])
        for datafile in os.listdir(direc):
            tree = ET.parse(os.path.join(direc,datafile))
            print ff(tree).keys()
            values.update(ff(tree).keys())
        print ff, len(values), values.pop()

In [11]:
def count_file_hash(tree):
    c = Counter()
    for el in tree.iter():
        if el.attrib.get("filename_hash") == None:
            continue
        elif el.attrib["filename_hash"] == "hash_error":
            c["file_hash_error"] += 1
    return c
        

def count_src_hash(tree):
    c = Counter()
    for el in tree.iter():
        if el.attrib.get("srcfile_hash") == None:
            continue
        elif el.attrib["srcfile_hash"] == "hash_error":
            c["src_hash_error"] += 1
    return c
        
    
def count_all_reasons(tree):
    c = Counter()
    for el in tree.iter():
        if el.tag == "process":
            c["term" + el.attrib["terminationreason"]] += 1
            c["start" + el.attrib["startreason"]] += 1
            c[el.attrib["executionstatus"]] += 1
    return c

def count_all_attrib(tree):
    c = Counter()
    for el in tree.iter():
        for attrib in el.attrib.keys():
            if attrib in ["shareaccess", "desiredaccess", "filetype", "flags", "protect"]:
                c[el.attrib[attrib]] += 1
            c[attrib] += 1
    return c


# def count_all_flags(tree):
#     c = Counter()
#     for el in tree.iter():
#         if el.attrib.get("flags") == None:
#             continue
#         else:
#             c[el.attrib["flags"]] += 1
#     return c


def count_all_sleep(tree):
    c = Counter()
    for el in tree.iter():
        if el.tag == "sleep":
            c[el.attrib["milliseconds"]] += 1
    return c

def first_last_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made.
      (in other words, it returns a dictionary indicating what the first and
      last system calls made by an executable were.)
    """
    c = Counter()
    in_all_section = False
    first = True # is this the first system call
    last_call = None # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            if first:
                c["first_call-"+el.tag] = 1
                first = False
            last_call = el.tag  # update last call seen

    # finally, mark last call seen
    c["last_call-"+last_call] = 1
    return c

def system_call_count_feats(tree):
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c[el.tag] += 1
    return c

In [12]:
ffs = [count_file_hash, count_src_hash, count_all_reasons, count_all_attrib, count_all_sleep, first_last_system_call_feats, system_call_count_feats]
count_feats(ffs)

<function count_file_hash at 0x10769ee60> 1 file_hash_error


KeyboardInterrupt: 