In [1]:
import os
import csv 
import sys 
import numpy as np
from scipy import sparse
from collections import Counter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET


from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

import util



In [2]:
def write_to_file(rows, outfile):
    """
    assumes len(predictions) == len(ids), and that predictions[i] is the
    index of the predicted class with the malware_classes list above for 
    the executable corresponding to ids[i].
    outfile will be overwritten
    """
    with open(outfile,"w+") as f:
        for r in rows:
            f.write("%s\n" % (r))

In [3]:
def reorder_submission(file_to_reorder, newfile_name = "experiment_results.csv"):
    # READ IN KEYS IN CORRECT ORDER AS LIST
    with open('keys.csv','r') as f:
        keyreader = csv.reader(f)
        keys = [key[0] for key in keyreader]

    # READ IN ALL PREDICTIONS, REGARDLESS OF ORDER
    with open(file_to_reorder) as f:
        oldfile_reader = csv.reader(f)
        D = {}
        for i,row in enumerate(oldfile_reader):
            if i == 0:
                continue
            _id, pred = row 
            D[_id] = pred

    # WRITE PREDICTIONS IN NEW ORDER
    with open(newfile_name,'w') as f:
        writer = csv.writer(f)
        writer.writerow(('Id','Prediction'))
        for key in keys:
            writer.writerow((key,D[key]))

    print("".join(["Reordered ", file_to_reorder," and wrote to ", newfile_name]))

In [4]:
from keras.layers import Dense # For dense layers
from keras.models import Sequential # For sequential layering
from keras.callbacks import EarlyStopping # For stopping execution
from sklearn.metrics import accuracy_score

def train_model_DL(X_train, Y_train, n_nodes, n_output = 1, hid_activation = 'sigmoid', out_activation = 'softmax', optimizer = 'adam', loss = 'categorical_crossentropy'):
    """ n_nodes is 1-D numpy array with number of nodes on each layer
        e.g. [10,20,30] is a model with 3 (hidden) layers,
        with 10/20/30 nodes on the first/second/third layers
        Returns trained DL model """
    input_shape = (X_train.shape[1],) # Shape of input data
    # Initialize model
    model_DL = Sequential()
    for i in range(len(n_nodes)):
        if i == 0:
            # First layer
            model_DL.add(Dense(n_nodes[i], activation = hid_activation, input_shape = input_shape))
        else:
            # Subsequent layers
            model_DL.add(Dense(n_nodes[i],activation = hid_activation))
    # Output layer
    model_DL.add(Dense(n_output, activation = out_activation))
    # Compile model
    model_DL.compile(optimizer = optimizer,loss = loss, metrics=['accuracy'])
    # Print model summary
    model_DL.summary()
    # Early stopping monitor w/ patience=3 (stop after 3 runs without improvements)
    early_stopping_monitor = EarlyStopping(patience=10)
    # Fit model using 20% of data for validation
    model_DL.fit(X_train, Y_train, validation_split=0.2, epochs=200, callbacks=[early_stopping_monitor])
    return model_DL

Using TensorFlow backend.


In [5]:
import ntpath
def path_leaf(path):
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)

In [6]:
from urlparse import urlparse
def url_domain(url):
    parsed_uri = urlparse(url)
    domain = '{uri.netloc}'.format(uri=parsed_uri)
    return domain

In [7]:
def get_classes_ids(direc):
    classes = []
    ids = [] 
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)
    return np.array(classes), ids

In [8]:
def get_tokens(direc, useFirstThreadOnly):
    tags_set = []
    values_set = []
    classes = []
    ids = [] 
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        in_all_section = False
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # accumulate features
        tags = ""
        values = ""
        for el in tree.iter():
            # ignore everything outside the "all_section" element
            if el.tag == "all_section" and not in_all_section:
                in_all_section = True
            elif el.tag == "all_section" and in_all_section:
                in_all_section = False
                if useFirstThreadOnly is True:
                    break
            elif in_all_section:
                tags += " " + el.tag.replace('_', '')
                for k, v in el.attrib.iteritems():
                    if ("hash" and "id" and "index" and "size" and "time") not in k:
                        if "file" in k:
                            values += " " + path_leaf(v).replace('.', '').replace('$', '').replace('_', '').replace('-', '')
                        elif "url" in k:
                            values += " " + url_domain(v).replace('.', '').replace('$', '').replace('_', '').replace('-', '')
                        elif ("key" or "name" or "target" or "command" or "socket" or "value") in k:
                            values += " " + v.replace('.', '').replace('$', '').replace('_', '').replace('-', '')
                
        tags_set.append(tags)
        values_set.append(values)
    
    assert len(tags_set) == len(values_set)
    return tags_set, values_set, np.array(classes), ids

In [9]:
def get_tags_bigram(tags_set):
    tags_bigram_set = []
    for tags_entry in tags_set:
        tags = tags_entry.split(' ')
        pre_tag = ""
        tags_bigram = ""
        for tag in tags:
            if pre_tag is not "":
                tags_bigram += " " + pre_tag + tag
            pre_tag = tag
        tags_bigram_set.append(tags_bigram)
    return tags_bigram_set

In [10]:
def get_tags_3gram(tags_set):
    tags_3gram_set = []
    for tags_entry in tags_set:
        tags = tags_entry.split(' ')
        first_tag = ""
        second_tag = ""
        tags_3gram = ""
        for tag in tags:
            if first_tag is not "":
                tags_3gram += " " + first_tag + second_tag+tag
            first_tag = second_tag
            second_tag = tag
        tags_3gram_set.append(tags_3gram)
    return tags_3gram_set

In [11]:
def get_tags_ngram(tags_set, n):
    tags_ngram_set = []
    for tags_entry in tags_set:
        tags = tags_entry.split(' ')
        tags_ngram = ""
        for i in range(0, len(tags)-9):
            tags_ngram += " "
            for j in range(0,10):
                tags_ngram += tags[j+i]
        tags_ngram_set.append(tags_ngram)
    return tags_ngram_set

In [12]:
def get_all_tokens(direc, useFirstThreadOnly = False):
    tags_set, values_set, classes, ids = get_tokens(direc, useFirstThreadOnly)
    tags_bigram = get_tags_bigram(tags_set)
    tags_3gram = get_tags_3gram(tags_set)
    tags_ngram = get_tags_ngram(tags_set, 10)
    
    assert len(tags_set) == len(tags_ngram)
    tokens_set = []
    for i in range(len(tags_set)):
        tokens_set.append(tags_set[i] + ' ' + tags_bigram[i] + ' ' + tags_3gram[i] + ' ' + tags_ngram[i] + ' ' + values_set[i])
    return tokens_set, classes, ids

In [13]:
def get_file_feature(direc):
    filesizes = []
    process_size = []
    thread_size = []
    for datafile in os.listdir(direc):
        filesizes = np.append(filesizes, os.path.getsize(os.path.join(direc,datafile))/1000)
        tree = ET.parse(os.path.join(direc,datafile))
        n_process = 0
        n_thread = 0
        for el in tree.iter():
            if el.tag == "process":
                n_process += 1
            elif el.tag == "thread":
                n_thread += 1
        process_size.append(n_process)
        thread_size.append(n_thread)
        
    filesizes = np.array(filesizes)
    filesizes = np.true_divide(filesizes, np.std(filesizes))
    process_size = np.array(process_size)
    process_size = np.true_divide(process_size, np.std(process_size))
    thread_size = np.array(thread_size)
    thread_size = np.true_divide(thread_size, np.std(thread_size))

    return np.asmatrix(np.array((filesizes, process_size, thread_size))).T

In [14]:
#############################
#
#  Below is main function
#
#############################

In [15]:
train_dir = "../train_origin"
test_dir = "../test_origin"

In [16]:
train_tokens, train_classes, train_ids = get_all_tokens(train_dir)

In [17]:
write_to_file(train_classes, "train_classes.csv")
write_to_file(train_ids, "train_ids.csv")

In [18]:
test_tokens, _, ids = get_all_tokens(test_dir)

In [19]:
write_to_file(ids, "test_ids.csv")

In [20]:
#train_classes, _ = get_classes_ids(train_dir)
#_, ids = get_classes_ids(test_dir)

In [21]:
y_train = np.zeros((len(train_classes),len(util.malware_classes)))
y_train[np.arange(len(train_classes)), train_classes] = 1
y_train.shape

(3086, 15)

In [22]:
# Update util.py
assert len(util.malware_classes) + 1 == len(util.major_classes) + len(util.minor_classes)
y_train_major = np.zeros((len(train_classes),len(util.major_classes)))
y_train_minor = np.zeros((len(train_classes),len(util.minor_classes)))
for i in range(0, len(train_classes)):
    classname = util.malware_classes[train_classes[i]]
    if classname in util.major_classes:
        y_train_major[i, util.major_classes.index(classname)] = 1
    else:
        y_train_major[i, util.major_classes.index("Others")] = 1
        y_train_minor[i, util.minor_classes.index(classname)] = 1

In [23]:
TF = TfidfVectorizer(analyzer = 'word')

In [24]:
X_train = TF.fit_transform(train_tokens)
print X_train.shape

(3086, 159889)


In [25]:
X_test = TF.transform(test_tokens)
print X_test.shape

(3724, 159889)


In [26]:
sparse.save_npz("tfidf_train_tokens.npz", X_train)
sparse.save_npz("tfidf_test_tokens.npz", X_test)

In [27]:
train_features = get_file_feature(train_dir)
test_features = get_file_feature(test_dir)
print train_features.shape
print test_features.shape

(3086, 3)
(3724, 3)


In [28]:
X_train = sparse.hstack([X_train, train_features])
X_test = sparse.hstack([X_test, test_features])

In [29]:
print X_train.shape
print X_test.shape

(3086, 159892)
(3724, 159892)


In [30]:
########################
#
# Finish collecting data
#
########################

In [31]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators = 10, n_jobs = -1)
RF.fit(X_train, y_train_major)
scores = cross_val_score(RF, X_train, y_train_major, cv=5)
print "Features: " + str(RF.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
RF_best = RF
score_best = scores.mean()
X_train_major_best = X_train
X_test_major_best = X_test

Features: 159892	Accuracy: 0.89047 (+/- 0.02523)


In [32]:
X_train_major = X_train
X_test_major = X_test

In [33]:
from sklearn.feature_selection import SelectFromModel
while X_train_major.shape[1] > 100:
    model = SelectFromModel(RF, prefit=True, threshold = "0.5*mean")
    X_train_major = model.transform(X_train_major)
    ## trick: break if we didn't remove any feature
    if X_train_major.shape[1] == X_test_major.shape[1]:
        break
    X_test_major = model.transform(X_test_major)
    
    RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
    RF.fit(X_train_major, y_train_major)
    scores = cross_val_score(RF, X_train_major, y_train_major, cv=5)
    mean_score = scores.mean()
    print "Features: " + str(RF.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (mean_score, scores.std() * 2))
    
    if score_best <= mean_score:
        del X_train_major_best
        del X_test_major_best
        RF_best = RF
        score_best = mean_score
        X_train_major_best = X_train_major
        X_test_major_best = X_test_major

Features: 1656	Accuracy: 0.90278 (+/- 0.03174)
Features: 470	Accuracy: 0.90019 (+/- 0.02427)
Features: 224	Accuracy: 0.89954 (+/- 0.02118)
Features: 131	Accuracy: 0.90570 (+/- 0.03117)
Features: 92	Accuracy: 0.90440 (+/- 0.03756)


In [34]:
sparse.save_npz("tfidf_train_major_features.npz", X_train_major_best)
sparse.save_npz("tfidf_test_major_features.npz", X_test_major_best)

In [35]:
# X_train_major_best = sparse.load_npz("tfidf_train_major_features.npz")
# X_test_major_best = sparse.load_npz("tfidf_test_major_features.npz")

In [36]:
from sklearn.ensemble import RandomForestClassifier
RF_major = RandomForestClassifier(n_estimators = 100, n_jobs = -1, max_features = None)
RF_major.fit(X_train_major_best, y_train_major)
scores = cross_val_score(RF_major, X_train_major_best, y_train_major, cv=5)
print "Features: " + str(RF_major.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

Features: 131	Accuracy: 0.89987 (+/- 0.03900)


In [37]:
preds_RF_major = RF_major.predict(X_test_major_best)
results_RF_major = np.argmax(preds_RF_major, axis=1)
print results_RF_major

[1 0 0 ..., 3 0 0]


In [38]:
model_DL_major = train_model_DL(X_train_major_best,y_train_major,[100], 4)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               13200     
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 404       
Total params: 13,604
Trainable params: 13,604
Non-trainable params: 0
_________________________________________________________________
Train on 2468 samples, validate on 618 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Ep

Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200


Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200


Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [39]:
Y_train_DLpred = model_DL_major.predict(X_train_major_best)
results_train = np.argmax(Y_train_DLpred, axis=1)
acc_DL = accuracy_score(np.argmax(y_train_major, axis=1), results_train)
print('DONE. Accuracy: preds_DL_major = model_DL_m', acc_DL)

('DONE. Accuracy: preds_DL_major = model_DL_m', 0.89630589760207391)


In [40]:
preds_DL_major = model_DL_major.predict(X_test_major_best)
results_DL_major = np.argmax(preds_DL_major, axis=1)
print results_DL_major

[1 0 0 ..., 3 0 3]


In [41]:
#### second step ####

In [42]:
print y_train_minor.shape

(3086, 12)


In [43]:
y_train_minor_bkup = y_train_minor

In [44]:
# remove 0 rows (None, Swizzor, VB)
ix = np.sum(y_train_minor, axis=1) > 0

X_train_minor = X_train.tocsr()[ix, :]
y_train_minor = y_train_minor[ix, :]
X_test_minor = X_test

In [45]:
print X_train_minor.shape
print y_train_minor.shape
print X_test_minor.shape

(559, 159892)
(559, 12)
(3724, 159892)


In [46]:
RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1, max_features = None)
RF.fit(X_train_minor, y_train_minor)
scores = cross_val_score(RF, X_train_minor, y_train_minor, cv=5)
print "Features: " + str(RF.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
RF_best = RF
score_best = scores.mean()
X_train_minor_best = X_train_minor
X_test_minor_best = X_test

Features: 159892	Accuracy: 0.66366 (+/- 0.02799)


In [47]:
while X_train_minor.shape[1] > 100:
    model = SelectFromModel(RF, prefit=True, threshold = "0.5*mean")
    X_train_minor = model.transform(X_train_minor)
    ## trick: break if we didn't remove any feature
    if X_train_minor.shape[1] == X_test_minor.shape[1]:
        break
    X_test_minor = model.transform(X_test_minor)
    
    RF = RandomForestClassifier(n_estimators = 100, n_jobs = -1, max_features = None)
    RF.fit(X_train_minor, y_train_minor)
    scores = cross_val_score(RF, X_train_minor, y_train_minor, cv=5)
    mean_score = scores.mean()
    print "Features: " + str(RF.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (mean_score, scores.std() * 2))
    
    if score_best < mean_score:
        del X_train_minor_best
        del X_test_minor_best
        RF_best = RF
        score_best = mean_score
        X_train_minor_best = X_train_minor
        X_test_minor_best = X_test_minor

Features: 2706	Accuracy: 0.67799 (+/- 0.02997)
Features: 739	Accuracy: 0.68156 (+/- 0.03345)
Features: 369	Accuracy: 0.67262 (+/- 0.01882)
Features: 238	Accuracy: 0.67798 (+/- 0.02363)
Features: 184	Accuracy: 0.67978 (+/- 0.01386)
Features: 146	Accuracy: 0.67439 (+/- 0.03029)
Features: 124	Accuracy: 0.66009 (+/- 0.02369)
Features: 112	Accuracy: 0.67083 (+/- 0.01388)
Features: 98	Accuracy: 0.66902 (+/- 0.03144)


In [48]:
sparse.save_npz("tfidf_train_minor_features.npz", X_train_minor_best)
sparse.save_npz("tfidf_test_minor_features.npz", X_test_minor_best)

In [49]:
RF_minor = RandomForestClassifier(n_estimators = 1000, n_jobs = -1, max_features = None)
RF_minor.fit(X_train_minor_best, y_train_minor)
scores = cross_val_score(RF_minor, X_train_minor_best, y_train_minor, cv=4)
print "Features: " + str(RF_minor.n_features_) + ("\tAccuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

Features: 739	Accuracy: 0.67797 (+/- 0.02194)


In [50]:
results_RF_minor.shape

NameError: name 'results_RF_minor' is not defined

In [None]:
preds_RF_minor = RF_minor.predict(X_test_minor_best)
results_RF_minor = np.argmax(preds_RF_minor, axis=1)
print results_RF_minor

In [None]:
# X_train_minor_best = sparse.load_npz("tfidf_train_minor_features.npz")
# X_test_minor_best = sparse.load_npz("tfidf_test_minor_features.npz")

In [None]:
print X_train_minor_best.shape
print X_test_minor_best.shape

In [None]:
results_RF = []
for i in range(0, len(results_RF_major)):
    if results_RF_major[i] == 3:
        results_RF.append(results_RF_minor[i] + 3)
    else:
        results_RF.append(results_RF_major[i])

# resume the index
for i in range(0, len(results_RF)):
    results_RF[i] = util.origin_malware_classes.index(util.malware_classes[results_RF[i]])
print len(results_RF)

In [None]:
util.write_predictions(results_RF, ids, "tdidf_2steps_rf.csv")
reorder_submission("tdidf_2steps_rf.csv", "all_seleted_2steps_rf_results_final.csv")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(results_RF, bins = range(0, 15), normed=True)
plt.show()

In [None]:
model_DL_minor = train_model_DL(X_train_minor_best,y_train_minor,[128], 12)

In [None]:
Y_train_DLpred = model_DL_minor.predict(X_train_minor_best)
results_train = np.argmax(Y_train_DLpred, axis=1)
acc_DL = accuracy_score(np.argmax(y_train_minor, axis=1), results_train)
print('DONE. Accuracy: ', acc_DL)

In [None]:
preds_DL_minor = model_DL_minor.predict(X_test_minor_best)
results_DL_minor = np.argmax(preds_DL_minor, axis=1)
print results_DL_minor

In [None]:
results_DL = []
for i in range(0, len(results_RF_major)):
    if results_RF_major[i] == 3:
        results_DL.append(results_DL_minor[i] + 3)
    else:
        results_DL.append(results_RF_major[i])

# resume the index
for i in range(0, len(results_DL)):
    results_DL[i] = util.origin_malware_classes.index(util.malware_classes[results_DL[i]])
print len(results_DL)

In [None]:
util.write_predictions(results_DL, ids, "tdidf_2steps_dl.csv")
reorder_submission("tdidf_2steps_dl.csv", "all_seleted_2steps_dl_results_final.csv")

In [None]:
plt.hist(results_DL, bins = range(0, 15), normed=True)
plt.show()

In [None]:
model_DL_major.save('model_DL_major.h5')
model_DL_minor.save('model_DL_minor.h5')