In [1]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import util
import re

In [2]:
# Directory that main() hands to create_data_matrix() as the data folder.
TRAIN_DIR = "train"
# Accumulates every element tag encountered by add_to_set().
call_set = set()

In [10]:
# Strings whose occurrences are counted in each document (see call_feats);
# the same names, in the same order, become the CSV column headers.
# Edit this list to extract a different set of features.
desired_features = [
    'dump_line',
    'download_file',
    'open_file',
    'connect_socket',
    'impersonate_user',
    'sex',
    'open_process',
    'load_dll',
    'kill_process',
    'destroy_window',
    'query_value',
    'vm_protect',
    'FILE_ANY_ACCESS',
    'SECURITY_ANONYMOUS',
    'create_open_file',
    'Windows Desktop Search',
    'find_file',
    'open_key',
]

In [4]:
def add_to_set(tree):
    """Add the tag of every element yielded by ``tree.iter()`` to ``call_set``.

    ``call_set`` is the module-level set; it is mutated in place and nothing
    is returned.
    """
    call_set.update(node.tag for node in tree.iter())

In [5]:
def create_data_matrix(start_index, end_index, direc="train"):
    """Build a feature matrix from the XML files found in ``direc``.

    Files are visited in ``os.listdir`` order (``.DS_Store`` is skipped) and
    numbered from 0. Only files whose number falls in
    ``[start_index, end_index)`` are parsed; iteration stops as soon as
    ``end_index`` is reached. A progress counter is printed every 100 files.

    Each file name is expected to look like ``<id>.<class>.<...>``; the class
    is looked up in ``util.malware_classes``. A class of ``"X"`` (unlabelled
    data) is recorded as ``-1``; any other unknown class trips an assertion.

    As a side effect every element tag of every parsed document is added to
    the module-level ``call_set`` via ``add_to_set``.

    Returns
    -------
    X : 2-D numpy array with one row per parsed file (the row is whatever
        ``call_feats`` returns), or ``None`` when no file was parsed.
    classes : numpy array of class indices (``-1`` for "X").
    ids : list of id strings, in the same order as the rows of ``X``.
    """
    # Rows are collected in a list and converted once at the end: stacking
    # inside the loop copies the whole matrix per file, and with a single
    # parsed file the result used to stay a flat list instead of a 2-D array.
    rows = []
    classes = []
    ids = []
    i = -1
    # NOTE(review): os.listdir order is platform dependent, so which files
    # land in [start_index, end_index) may differ between machines.
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i % 100 == 0:
            print(i)
        if i < start_index:
            continue
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware
            # classes if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc, datafile))
        add_to_set(tree)
        rows.append(call_feats(tree))

    # Keep the historical "None when nothing was parsed" contract.
    X = np.array(rows) if rows else None
    return X, np.array(classes), ids

In [6]:
def call_feats(tree, features=None):
    """Count case-insensitive occurrences of each feature string in a document.

    The document is serialised back to XML text with ``ET.tostring`` and each
    feature is searched for as a *literal* substring (it is escaped before
    being handed to ``re``), so names containing regex metacharacters such
    as ``.`` or ``(`` cannot over-match.

    Parameters
    ----------
    tree : parsed ElementTree whose root is serialised and searched.
    features : optional list of strings to count; defaults to the
        module-level ``desired_features``.

    Returns
    -------
    list of int, one count per feature, in the order of ``features``.
    """
    if features is None:
        features = desired_features

    s = ET.tostring(tree.getroot())
    # On Python 3 tostring() yields bytes (ASCII by default, with non-ASCII
    # characters written as character references); decode so the text
    # patterns below can be applied. On Python 2 the result is already str.
    if not isinstance(s, str):
        s = s.decode("ascii")

    return [len(re.findall(re.escape(name), s, re.I)) for name in features]

In [7]:
def write_feats_to_file(filename, ids, feat_counts, classes):
    """Write ids, feature counts and class numbers to ``filename`` as CSV.

    The header line is ``id,<desired_features...>,classnum``. One line follows
    per entry of ``ids``: the id, the comma-joined values of the matching
    ``feat_counts`` row, and the matching entry of ``classes``. An existing
    file is overwritten.
    """
    with open(filename, "w") as out:
        header_fields = ["id", ",".join(desired_features), "classnum"]
        out.write(",".join(header_fields) + "\n")
        for row_idx, row_id in enumerate(ids):
            counts_text = ",".join(str(value) for value in feat_counts[row_idx])
            line_fields = [str(row_id), counts_text, str(classes[row_idx])]
            out.write(",".join(line_fields) + "\n")

In [8]:
## Feature extraction
def main(start_index=0, end_index=3086, direc=None,
         out_file="data/featuresFromStrings.csv"):
    """Extract features from a directory of files and write them to a CSV.

    Parameters
    ----------
    start_index, end_index : half-open range of file numbers handed to
        ``create_data_matrix``. The defaults (0, 3086) reproduce the original
        hard-coded run; pass a small ``end_index`` (e.g. 10) for a quick trial.
    direc : directory to read; defaults to the module-level ``TRAIN_DIR``.
    out_file : path of the CSV written by ``write_feats_to_file``.
    """
    if direc is None:
        direc = TRAIN_DIR

    X_train, t_train, train_ids = create_data_matrix(start_index, end_index, direc)

    # One id is collected per parsed file, so this equals the number of matrix
    # rows and, unlike X_train.shape[0], still works when nothing was parsed
    # and X_train is None.
    n_rows = len(train_ids)

    print('Data matrix (training set):')
    print(n_rows)
    print('Classes (training set):')
    print(t_train)

    write_feats_to_file(out_file, train_ids, X_train, t_train)

In [11]:
# Run the extraction defined in main(): builds the training matrix, prints a
# short summary and writes the feature CSV.
main()

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
Data matrix (training set):
3086
Classes (training set):
[ 8  6 12 ...,  8  8  3]
