In [1]:
import os
import numpy as np
from scipy import sparse
from collections import Counter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

import util



In [11]:
import csv 
import sys 

def reorder_submission(file_to_reorder, newfile_name = "experiment_results.csv",
                       keys_file = 'keys.csv'):
    """Rewrite a predictions CSV so its rows follow the id order of a key file.

    Parameters
    ----------
    file_to_reorder : str
        Path of a CSV whose first row is a header and whose remaining rows
        are ``id, prediction`` pairs, in any order.
    newfile_name : str
        Path of the CSV to write. It receives an ``Id,Prediction`` header
        followed by one row per id, in key-file order.
    keys_file : str
        Path of a header-less CSV whose first column lists the ids in the
        required order. Defaults to ``keys.csv`` in the working directory.

    Raises
    ------
    KeyError
        If an id from ``keys_file`` has no prediction. This is checked before
        the output file is opened, so no partial file is left behind.
    """
    # READ IN KEYS IN CORRECT ORDER AS LIST (blank lines are ignored)
    with open(keys_file, 'r') as f:
        keys = [row[0] for row in csv.reader(f) if row]

    # READ IN ALL PREDICTIONS, REGARDLESS OF ORDER
    predictions = {}
    with open(file_to_reorder) as f:
        reader = csv.reader(f)
        next(reader, None)  # the first row is the header
        for row in reader:
            if not row:
                # csv.reader yields [] for an empty line; nothing to unpack
                continue
            _id, pred = row
            predictions[_id] = pred

    missing = [key for key in keys if key not in predictions]
    if missing:
        raise KeyError("no prediction for %d id(s) listed in %s, e.g. %r"
                       % (len(missing), keys_file, missing[0]))

    # WRITE PREDICTIONS IN NEW ORDER
    with open(newfile_name, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('Id', 'Prediction'))
        writer.writerows((key, predictions[key]) for key in keys)

    print("".join(["Reordered ", file_to_reorder," and wrote to ", newfile_name]))

In [68]:
import sys
import codecs

def encode(direc):
    """Strip a fixed set of punctuation tokens from every file in ``direc``.

    Each regular file directly inside ``direc`` is read in full, the tokens
    below are removed, and the result is written back over the same path.
    The rewrite is in place and not reversible, so run it on a copy if the
    originals are needed. Sub-directories and other non-file entries are
    skipped.

    Parameters
    ----------
    direc : str
        Directory whose files are rewritten.
    """
    # Removal order matters: ':\' has to go before the bare backslash,
    # otherwise the colon of a ':\' pair would be left behind.
    stripped_tokens = ('&#', ':\\', '\\', '.', '_', '-', ';')
    for datafile in os.listdir(direc):
        path = os.path.join(direc, datafile)
        if not os.path.isfile(path):
            continue
        # `with` closes the handle even when read/write raises.
        with open(path, 'r') as f:
            contents = f.read()
        for token in stripped_tokens:
            contents = contents.replace(token, '')
        with open(path, 'w') as f:
            f.write(contents)

In [69]:
# Strip the punctuation tokens from every file under ../train.
# encode() overwrites each file in place, so this cell modifies the data on disk.
encode('../train')

In [70]:
# Same in-place clean-up for the files under ../test, so both sets are
# tokenised from text that went through the identical replacements.
encode('../test')

In [2]:
# Collect the training file paths plus, parsed from each file name, the
# sample id (text before the first '.') and the class label (text between
# the first and second '.'), stored as its index in util.malware_classes.
direc = "../train"
train_names = os.listdir(direc)
train_files = [direc + '/' + name for name in train_names]

ids = []
classes = []
for name in train_names:
    id_str, clazz = name.split('.')[:2]
    ids.append(id_str)
    classes.append(util.malware_classes.index(clazz))

In [3]:
# One-hot encode the integer class indices: row i of the identity matrix
# is the indicator vector for class i, so indexing it with `classes`
# yields one indicator row per training sample.
n_classes = len(util.malware_classes)
y_train = np.eye(n_classes)[classes]
y_train.shape

(3086, 15)

In [4]:
# Stop-word list made of the decimal strings "0" .. "9999", so that bare
# numbers in this range are dropped by the vectoriser.
stop_words = [str(number) for number in range(10000)]

In [5]:
# TF-IDF over word tokens. input='filename' makes fit/transform take file
# paths and read the text from disk. It is passed by keyword because newer
# scikit-learn releases no longer accept positional constructor arguments.
# min_df=0.001 drops terms that occur in fewer than 0.1% of the documents.
TF = TfidfVectorizer(input = 'filename', analyzer = 'word', stop_words = stop_words, min_df = 0.001)

In [6]:
# Learn the vocabulary and IDF weights from the training files and build
# the training document-term matrix in one pass.
X_train = TF.fit_transform(train_files)

In [112]:
# Sanity check: (number of training files, vocabulary size).
X_train.shape

(3086, 71194)

In [9]:
# Collect the test file paths and the sample id parsed from each file name
# (the text before the first '.'). Test files carry no usable label, so
# nothing else is extracted from the name.
test_direc = "../test"
test_files = []
test_ids = []
for datafile in os.listdir(test_direc):
    test_files.append(os.path.join(test_direc, datafile))
    test_ids.append(datafile.split('.')[0])

In [48]:
# Vectorise the test files with the vocabulary already fitted on the
# training set (transform only, no refit), so the columns line up with X_train.
X_test = TF.transform(test_files)
X_test.shape

(3724, 71194)

In [115]:
from sklearn.ensemble import RandomForestClassifier
# Baseline: random forest on the full TF-IDF matrix with one-hot targets.
# The fitted RF is kept because the feature-selection cell reuses it with
# prefit=True. No random_state is set, so the scores vary from run to run.
RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1, oob_score=True)
RF.fit(X_train, y_train)
print(RF.n_features_)
# 5-fold cross-validated score; reported as mean +/- two standard deviations.
scores = cross_val_score(RF, X_train, y_train, cv=5)
print("Accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

71194
Accuracy: 0.85612 (+/- 0.02495)


In [116]:
from sklearn.feature_selection import SelectFromModel
# Keep only the columns the already-fitted forest rates as important
# (SelectFromModel's default threshold), and apply the same column mask
# to both matrices so train and test stay aligned.
model = SelectFromModel(RF, prefit=True)
X_train_new = model.transform(X_train)
# Must be derived from X_test: reducing X_train here would make the later
# predictions run on training rows instead of the test set.
X_test_new = model.transform(X_test)
X_train_new.shape

(3086, 3799)

In [117]:
# Refit a fresh forest on the reduced feature set and report its 5-fold
# cross-validated score as mean +/- two standard deviations.
RF = RandomForestClassifier(oob_score=True, n_jobs=-1, n_estimators=100)
RF.fit(X_train_new, y_train)
scores = cross_val_score(RF, X_train_new, y_train, cv=5)
cv_mean = scores.mean()
cv_spread = scores.std() * 2
print("Accuracy: %0.5f (+/- %0.5f)" % (cv_mean, cv_spread))

Accuracy: 0.86260 (+/- 0.01786)


In [118]:
# Repeatedly prune features with the current forest, refit on what is
# left, and report the cross-validated score at each size.
# NOTE: every pass rebinds X_train_new, X_test_new and RF, so after this
# cell they hold the smallest feature set and the last model fitted,
# not the best-scoring one.
while X_train_new.shape[1] > 1:
    n_features_before = X_train_new.shape[1]
    model = SelectFromModel(RF, prefit=True)
    X_train_pruned = model.transform(X_train_new)
    if X_train_pruned.shape[1] >= n_features_before:
        # The selector kept every column, so this pass made no progress;
        # stop rather than refitting on the same matrix again and again.
        break
    X_train_new = X_train_pruned
    X_test_new = model.transform(X_test_new)

    RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1, oob_score=True)
    RF.fit(X_train_new, y_train)
    scores = cross_val_score(RF, X_train_new, y_train, cv=5)
    print("n_features: " + str(X_train_new.shape[1]) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2)))

n_features: 660	Accuracy: 0.85483 (+/- 0.01775)
n_features: 135	Accuracy: 0.85418 (+/- 0.02741)
n_features: 44	Accuracy: 0.85321 (+/- 0.02259)
n_features: 21	Accuracy: 0.84673 (+/- 0.02339)
n_features: 10	Accuracy: 0.84251 (+/- 0.03379)
n_features: 5	Accuracy: 0.83149 (+/- 0.02882)
n_features: 3	Accuracy: 0.82728 (+/- 0.03005)
n_features: 2	Accuracy: 0.79261 (+/- 0.02222)
n_features: 1	Accuracy: 0.68147 (+/- 0.01256)


In [124]:
#from sklearn.linear_model import SGDClassifier
#SGD = SGDClassifier(loss = 'modified_huber', n_jobs = -1)
#SGD.fit(X_train, classes)
#print SGD.score(X_train, classes)

0.9141283214517174


In [75]:
print("making predictions...")
# RF was fitted on the one-hot y_train, so predict() is expected to return
# one indicator row per sample; argmax along axis 1 turns each row back
# into a class index.
# NOTE(review): a row with no positive entry would argmax to index 0;
# confirm that is acceptable.
preds = RF.predict(X_test_new)
results = np.argmax(preds, axis=1)
#results = SGD.predict(X_test)

making predictions...


In [77]:
# Quick look at the predicted class indices.
print(results)

[10  0  8 ...,  5  6  8]


In [78]:
print("writing predictions...")
outputfile = "tfidf_predictions.csv"
# util.write_predictions is project-local. Its output is read back by
# reorder_submission, which expects a header row followed by
# two-column (id, prediction) rows.
util.write_predictions(results, test_ids, outputfile)

writing predictions...


In [79]:
# Rewrite the predictions in the id order listed in keys.csv and save them
# under the submission file name.
reorder_submission(outputfile, "tfidf_rf_sfm_results1.csv")

Reordered tfidf_predictions.csv and wrote to tfidf_rf_sfm_results1.csv


In [7]:
# Save the training TF-IDF matrix to disk so it can be reloaded with
# sparse.load_npz instead of re-vectorising the files.
sparse.save_npz("tfidf_train.npz", X_train)

In [8]:
# X = sparse.load_npz("tfidf_train.npz")
# X.shape

In [109]:
from sklearn.feature_selection import RFECV
# Recursive feature elimination with 5-fold CV, scored by accuracy.
# step=0.1 is a fraction of the features to drop per iteration (see the
# RFECV docs). This cell fits against the integer labels in `classes`
# rather than the one-hot y_train, and rebinds `model`.
model = RFECV(RF, cv = 5, step = 0.1, scoring='accuracy')
model.fit(X_train_new, classes)
print(model.n_features_)
# Score on the data the selector was fitted on, so this is a training
# score, not a held-out estimate.
print(model.score(X_train_new, classes))

#X_train_new = model.transform(X_train)
#X_test_new = model.transform(X_test)
#print X_train_new.shape
#print X_test_new.shape

2367
0.99967595593
