In [1]:
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample, shuffle
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit, train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_score, recall_score, classification_report

# Build the "mix" dynamic-feature dataset: a stratified 70/30 split of the
# labelled `name` dataset plus a stratified 70/30 split of the packed /
# not-packed dataset, concatenated per side (train/test) and then shuffled.
name = "wild"

# Columns that exist in the dynamic-feature CSVs but are excluded from the
# model inputs.
# NOTE(review): "number_add_write_permisison" looks misspelled, but it has to
# match the CSV header exactly -- confirm before renaming.
dropped_features = [
    "write_execute_size",
    "initial_iat_dll",
    "initial_iat_func",
    "initial_iat_malicious_func",
    "number_add_exec_permission",
    "number_add_write_permisison",
]

with open("features/features_" + name + ".csv", "r") as f:
    line = f.readline()
# readline() keeps the line terminator; strip it so the last column name is
# not stored as "<column>\n".
feature_names = line.rstrip("\r\n").split(",")
feature_names.remove("name")
for dropped in dropped_features:
    feature_names.remove(dropped)

# Labelled dataset: features joined to their labels on the sample name.
data = pd.read_csv("features/features_" + name + ".csv")
labels = pd.read_csv("labels/labels_" + name + ".csv", header=None, names=["name", "label"])
data = data.merge(labels, on="name")

names = data.loc[:, "name"]
y = data.loc[:, "label"]
X = data.drop(["name", "label"] + dropped_features, axis=1)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train1, X_test1 = X.iloc[train_index], X.iloc[test_index]
y_train1, y_test1 = y.iloc[train_index], y.iloc[test_index]

# Packed / not-packed dataset. Only samples that also appear in the matching
# static-feature file are kept (merge on "name" against the name column only).
packed = pd.read_csv("features/features_packed.csv")
notpacked = pd.read_csv("features/features_notpacked.csv")
packed1 = pd.read_csv("features/static_features_packed.csv")
notpacked1 = pd.read_csv("features/static_features_notpacked.csv")
packed = packed.merge(packed1[["name"]], on="name")
notpacked = notpacked.merge(notpacked1[["name"]], on="name")
X_clean = pd.concat([packed, notpacked], ignore_index=True)
X_clean = X_clean.drop(["name"] + dropped_features, axis=1)
# Labels follow the concatenation order: packed rows are True, the rest False.
y_clean = pd.Series([True] * len(packed) + [False] * len(notpacked))

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X_clean, y_clean))
X_train2, X_test2 = X_clean.iloc[train_index], X_clean.iloc[test_index]
y_train2, y_test2 = y_clean.iloc[train_index], y_clean.iloc[test_index]

# Combine both sources, then shuffle features and labels together so the two
# sources are interleaved (StratifiedKFold below is used without shuffling).
X_train = pd.concat([X_train1, X_train2], ignore_index=True)
y_train = pd.concat([y_train1, y_train2], ignore_index=True)
X_test = pd.concat([X_test1, X_test2], ignore_index=True)
y_test = pd.concat([y_test1, y_test2], ignore_index=True)
X_train, y_train = shuffle(X_train, y_train, random_state=0)
X_test, y_test = shuffle(X_test, y_test, random_state=0)

In [2]:
# Exhaustive grid search over decision-tree hyper-parameters on the dynamic
# features, scored by balanced accuracy under 10-fold stratified CV.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_dynamic_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 3} with a score of 0.90143


In [3]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.96655
The balance accuracy is 0.89661
The precision is 0.97229
The recall is 0.96088
              precision    recall  f1-score   support

       False       0.78      0.83      0.80       334
        True       0.97      0.96      0.97      2045

    accuracy                           0.94      2379
   macro avg       0.87      0.90      0.89      2379
weighted avg       0.94      0.94      0.94      2379

The F1-score is 0.95916
The balance accuracy is 0.88182
The precision is 0.97039
The recall is 0.94818
              precision    recall  f1-score   support

       False       0.71      0.82      0.76       233
        True       0.97      0.95      0.96      1486

    accuracy                           0.93      1719
   macro avg       0.84      0.88      0.86      1719
weighted avg       0.94      0.93      0.93      1719

The F1-score is 0.98582
The balance accuracy is 0.93296
The precision is 0.97715
The recall is 0.99463
              precision    recall  f1

In [4]:
# Exhaustive grid search over random-forest hyper-parameters on the dynamic
# features, scored by balanced accuracy under 10-fold stratified CV.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": list(range(10, 40)),
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_dynamic_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 20} with a score of 0.91087


In [5]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.97517
The balance accuracy is 0.89991
The precision is 0.97092
The recall is 0.97946
              precision    recall  f1-score   support

       False       0.87      0.82      0.84       334
        True       0.97      0.98      0.98      2045

    accuracy                           0.96      2379
   macro avg       0.92      0.90      0.91      2379
weighted avg       0.96      0.96      0.96      2379

The F1-score is 0.96815
The balance accuracy is 0.87213
The precision is 0.96460
The recall is 0.97174
              precision    recall  f1-score   support

       False       0.81      0.77      0.79       233
        True       0.96      0.97      0.97      1486

    accuracy                           0.94      1719
   macro avg       0.89      0.87      0.88      1719
weighted avg       0.94      0.94      0.94      1719

The F1-score is 0.99378
The balance accuracy is 0.96535
The precision is 0.98763
The recall is 1.00000
              precision    recall  f1

In [6]:
# Exhaustive grid search over gradient-boosting hyper-parameters on the dynamic
# features (number of estimators fixed at 40), scored by balanced accuracy
# under 10-fold stratified CV.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_dynamic_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 12, 'min_samples_leaf': 10, 'n_estimators': 40} with a score of 0.90770


In [7]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.97785
The balance accuracy is 0.90886
The precision is 0.97335
The recall is 0.98240
              precision    recall  f1-score   support

       False       0.89      0.84      0.86       334
        True       0.97      0.98      0.98      2045

    accuracy                           0.96      2379
   macro avg       0.93      0.91      0.92      2379
weighted avg       0.96      0.96      0.96      2379

The F1-score is 0.97283
The balance accuracy is 0.89132
The precision is 0.96990
The recall is 0.97577
              precision    recall  f1-score   support

       False       0.84      0.81      0.82       233
        True       0.97      0.98      0.97      1486

    accuracy                           0.95      1719
   macro avg       0.90      0.89      0.90      1719
weighted avg       0.95      0.95      0.95      1719

The F1-score is 0.99113
The balance accuracy is 0.95050
The precision is 0.98243
The recall is 1.00000
              precision    recall  f1

In [8]:
# Build the "mix" static-feature dataset: same two-source 70/30 stratified
# split as the dynamic variant, but using the static feature files.
with open("features/static_features_" + name + ".csv", "r") as f:
    line = f.readline()
# readline() keeps the line terminator; strip it so the last column name is
# not stored as "<column>\n".
feature_names = line.rstrip("\r\n").split(",")
# Drop the first header column (presumably "name" -- verify against the CSV).
feature_names.pop(0)

# Labels are attached via the dynamic-feature table, so only samples present
# in the dynamic features, the labels and the static features are kept.
data = pd.read_csv("features/features_" + name + ".csv")
labels = pd.read_csv("labels/labels_" + name + ".csv", header=None, names=["name", "label"])
data = data.merge(labels, on="name")
data_static = pd.read_csv("features/static_features_" + name + ".csv")
data_static = data_static.merge(data[["name", "label"]], on="name")
y = data_static.loc[:, "label"]
X = data_static.drop(["name", "label"], axis=1)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train1, X_test1 = X.iloc[train_index], X.iloc[test_index]
y_train1, y_test1 = y.iloc[train_index], y.iloc[test_index]

# Note: in this cell `packed1` / `notpacked1` hold the DYNAMIC feature tables
# and `packed` / `notpacked` the STATIC ones; the dynamic tables are only used
# to restrict the static rows to names present in both.
packed1 = pd.read_csv("features/features_packed.csv")
notpacked1 = pd.read_csv("features/features_notpacked.csv")
packed = pd.read_csv("features/static_features_packed.csv")
notpacked = pd.read_csv("features/static_features_notpacked.csv")
packed = packed.merge(packed1[["name"]], on="name")
notpacked = notpacked.merge(notpacked1[["name"]], on="name")
X_clean = pd.concat([packed, notpacked], ignore_index=True)
X_clean = X_clean.drop(["name"], axis=1)
# Labels follow the concatenation order: packed rows are True, the rest False.
y_clean = pd.Series([True] * len(packed) + [False] * len(notpacked))

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X_clean, y_clean))
X_train2, X_test2 = X_clean.iloc[train_index], X_clean.iloc[test_index]
y_train2, y_test2 = y_clean.iloc[train_index], y_clean.iloc[test_index]

# Combine both sources, then shuffle features and labels together.
X_train = pd.concat([X_train1, X_train2], ignore_index=True)
y_train = pd.concat([y_train1, y_train2], ignore_index=True)
X_test = pd.concat([X_test1, X_test2], ignore_index=True)
y_test = pd.concat([y_test1, y_test2], ignore_index=True)
X_train, y_train = shuffle(X_train, y_train, random_state=0)
X_test, y_test = shuffle(X_test, y_test, random_state=0)

In [9]:
# Exhaustive grid search over decision-tree hyper-parameters on the static
# features, scored by balanced accuracy under 10-fold stratified CV.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_static_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 2} with a score of 0.86409


In [10]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.95691
The balance accuracy is 0.86578
The precision is 0.96379
The recall is 0.95012
              precision    recall  f1-score   support

       False       0.72      0.78      0.75       334
        True       0.96      0.95      0.96      2045

    accuracy                           0.93      2379
   macro avg       0.84      0.87      0.85      2379
weighted avg       0.93      0.93      0.93      2379

The F1-score is 0.94568
The balance accuracy is 0.84336
The precision is 0.96044
The recall is 0.93136
              precision    recall  f1-score   support

       False       0.63      0.76      0.69       233
        True       0.96      0.93      0.95      1486

    accuracy                           0.91      1719
   macro avg       0.80      0.84      0.82      1719
weighted avg       0.92      0.91      0.91      1719

The F1-score is 0.98589
The balance accuracy is 0.92079
The precision is 0.97217
The recall is 1.00000
              precision    recall  f1

In [11]:
# Exhaustive grid search over random-forest hyper-parameters on the static
# features, scored by balanced accuracy under 10-fold stratified CV.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": list(range(10, 40)),
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_static_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 2, 'n_estimators': 39} with a score of 0.84392


In [12]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.96863
The balance accuracy is 0.85298
The precision is 0.95617
The recall is 0.98142
              precision    recall  f1-score   support

       False       0.86      0.72      0.79       334
        True       0.96      0.98      0.97      2045

    accuracy                           0.95      2379
   macro avg       0.91      0.85      0.88      2379
weighted avg       0.94      0.95      0.94      2379

The F1-score is 0.96277
The balance accuracy is 0.82842
The precision is 0.95138
The recall is 0.97443
              precision    recall  f1-score   support

       False       0.81      0.68      0.74       233
        True       0.95      0.97      0.96      1486

    accuracy                           0.93      1719
   macro avg       0.88      0.83      0.85      1719
weighted avg       0.93      0.93      0.93      1719

The F1-score is 0.98415
The balance accuracy is 0.91089
The precision is 0.96880
The recall is 1.00000
              precision    recall  f1

In [13]:
# Exhaustive grid search over gradient-boosting hyper-parameters on the static
# features (number of estimators fixed at 40), scored by balanced accuracy
# under 10-fold stratified CV.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_static_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 11, 'min_samples_leaf': 9, 'n_estimators': 40} with a score of 0.86813


In [14]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.96830
The balance accuracy is 0.85900
The precision is 0.95833
The recall is 0.97848
              precision    recall  f1-score   support

       False       0.85      0.74      0.79       334
        True       0.96      0.98      0.97      2045

    accuracy                           0.94      2379
   macro avg       0.90      0.86      0.88      2379
weighted avg       0.94      0.94      0.94      2379

The F1-score is 0.96197
The balance accuracy is 0.83498
The precision is 0.95370
The recall is 0.97039
              precision    recall  f1-score   support

       False       0.79      0.70      0.74       233
        True       0.95      0.97      0.96      1486

    accuracy                           0.93      1719
   macro avg       0.87      0.83      0.85      1719
weighted avg       0.93      0.93      0.93      1719

The F1-score is 0.98502
The balance accuracy is 0.91584
The precision is 0.97049
The recall is 1.00000
              precision    recall  f1

In [15]:
# Build the "mix" dataset using BOTH dynamic and static features: the two
# feature tables are joined on the sample name before the same two-source
# 70/30 stratified split used for the single-feature-set variants.

# Columns that exist in the dynamic-feature CSVs but are excluded from the
# model inputs.
# NOTE(review): "number_add_write_permisison" looks misspelled, but it has to
# match the CSV header exactly -- confirm before renaming.
dropped_features = [
    "write_execute_size",
    "initial_iat_dll",
    "initial_iat_func",
    "initial_iat_malicious_func",
    "number_add_exec_permission",
    "number_add_write_permisison",
]

with open("features/static_features_" + name + ".csv", "r") as f:
    line = f.readline()
# readline() keeps the line terminator; strip it so the last column name is
# not stored as "<column>\n".
static_feature_names = line.rstrip("\r\n").split(",")
# Drop the first header column (presumably "name" -- verify against the CSV).
static_feature_names.pop(0)

with open("features/features_" + name + ".csv", "r") as f:
    line = f.readline()
# Stripping matters most here: the last dynamic column ends up in the middle
# of the combined name list, where a stray "\n" is easy to miss.
feature_names = line.rstrip("\r\n").split(",")
feature_names.remove("name")
for dropped in dropped_features:
    feature_names.remove(dropped)

# Dynamic names first, then static names.
feature_names = np.concatenate((feature_names, static_feature_names), axis=None)

data = pd.read_csv("features/features_" + name + ".csv")
data_static = pd.read_csv("features/static_features_" + name + ".csv")
data = data.merge(data_static, on="name")

labels = pd.read_csv("labels/labels_" + name + ".csv", header=None, names=["name", "label"])
data = data.merge(labels, on="name")

names = data.loc[:, "name"]
y = data.loc[:, "label"]
X = data.drop(["name", "label"] + dropped_features, axis=1)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train1, X_test1 = X.iloc[train_index], X.iloc[test_index]
y_train1, y_test1 = y.iloc[train_index], y.iloc[test_index]

# Packed / not-packed dataset: join dynamic and static features per sample.
packed = pd.read_csv("features/features_packed.csv")
notpacked = pd.read_csv("features/features_notpacked.csv")
packed1 = pd.read_csv("features/static_features_packed.csv")
notpacked1 = pd.read_csv("features/static_features_notpacked.csv")
packed = packed.merge(packed1, on="name")
notpacked = notpacked.merge(notpacked1, on="name")
X_clean = pd.concat([packed, notpacked], ignore_index=True)
X_clean = X_clean.drop(["name"] + dropped_features, axis=1)
# Labels follow the concatenation order: packed rows are True, the rest False.
y_clean = pd.Series([True] * len(packed) + [False] * len(notpacked))

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X_clean, y_clean))
X_train2, X_test2 = X_clean.iloc[train_index], X_clean.iloc[test_index]
y_train2, y_test2 = y_clean.iloc[train_index], y_clean.iloc[test_index]

# Combine both sources, then shuffle features and labels together.
X_train = pd.concat([X_train1, X_train2], ignore_index=True)
y_train = pd.concat([y_train1, y_train2], ignore_index=True)
X_test = pd.concat([X_test1, X_test2], ignore_index=True)
y_test = pd.concat([y_test1, y_test2], ignore_index=True)
X_train, y_train = shuffle(X_train, y_train, random_state=0)
X_test, y_test = shuffle(X_test, y_test, random_state=0)

In [16]:
# Exhaustive grid search over decision-tree hyper-parameters on the combined
# dynamic + static features, scored by balanced accuracy under 10-fold
# stratified CV.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_both_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 10} with a score of 0.92660


In [17]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.96695
The balance accuracy is 0.90586
The precision is 0.97561
The recall is 0.95844
              precision    recall  f1-score   support

       False       0.77      0.85      0.81       334
        True       0.98      0.96      0.97      2045

    accuracy                           0.94      2379
   macro avg       0.87      0.91      0.89      2379
weighted avg       0.95      0.94      0.94      2379

The F1-score is 0.95664
The balance accuracy is 0.88127
The precision is 0.97089
The recall is 0.94280
              precision    recall  f1-score   support

       False       0.69      0.82      0.75       233
        True       0.97      0.94      0.96      1486

    accuracy                           0.93      1719
   macro avg       0.83      0.88      0.85      1719
weighted avg       0.93      0.93      0.93      1719

The F1-score is 0.99378
The balance accuracy is 0.96535
The precision is 0.98763
The recall is 1.00000
              precision    recall  f1

In [18]:
# Exhaustive grid search over random-forest hyper-parameters on the combined
# dynamic + static features, scored by balanced accuracy under 10-fold
# stratified CV.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": list(range(10, 40)),
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_both_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 2, 'n_estimators': 28} with a score of 0.92429


In [19]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.97923
The balance accuracy is 0.92411
The precision is 0.97852
The recall is 0.97995
              precision    recall  f1-score   support

       False       0.88      0.87      0.87       334
        True       0.98      0.98      0.98      2045

    accuracy                           0.96      2379
   macro avg       0.93      0.92      0.93      2379
weighted avg       0.96      0.96      0.96      2379

The F1-score is 0.97306
The balance accuracy is 0.90251
The precision is 0.97372
The recall is 0.97241
              precision    recall  f1-score   support

       False       0.83      0.83      0.83       233
        True       0.97      0.97      0.97      1486

    accuracy                           0.95      1719
   macro avg       0.90      0.90      0.90      1719
weighted avg       0.95      0.95      0.95      1719

The F1-score is 0.99555
The balance accuracy is 0.97525
The precision is 0.99113
The recall is 1.00000
              precision    recall  f1

In [20]:
# Exhaustive grid search over gradient-boosting hyper-parameters on the
# combined dynamic + static features (number of estimators fixed at 40),
# scored by balanced accuracy under 10-fold stratified CV.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f" % (search.best_params_, search.best_score_))
model = search.best_estimator_
# Use a context manager so the pickle file is flushed and closed immediately.
with open("models/mix_both_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 9, 'min_samples_leaf': 9, 'n_estimators': 40} with a score of 0.93351


In [21]:
# Report F1, balanced accuracy, precision, recall and the full classification
# report for the current `model` on three held-out sets, in this order:
# the combined test set, then the two splits it was concatenated from.
metric_lines = (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
)
predictions = []
for features, truth in ((X_test, y_test), (X_test1, y_test1), (X_test2, y_test2)):
    predicted = model.predict(features)
    predictions.append(predicted)
    for template, metric in metric_lines:
        print(template % metric(truth, predicted))
    print(classification_report(truth, predicted))
# Keep the per-set predictions available under their original names.
y_pred, y_pred1, y_pred2 = predictions

The F1-score is 0.98143
The balance accuracy is 0.93257
The precision is 0.98095
The recall is 0.98191
              precision    recall  f1-score   support

       False       0.89      0.88      0.89       334
        True       0.98      0.98      0.98      2045

    accuracy                           0.97      2379
   macro avg       0.93      0.93      0.93      2379
weighted avg       0.97      0.97      0.97      2379

The F1-score is 0.97642
The balance accuracy is 0.91674
The precision is 0.97773
The recall is 0.97510
              precision    recall  f1-score   support

       False       0.84      0.86      0.85       233
        True       0.98      0.98      0.98      1486

    accuracy                           0.96      1719
   macro avg       0.91      0.92      0.91      1719
weighted avg       0.96      0.96      0.96      1719

The F1-score is 0.99466
The balance accuracy is 0.97030
The precision is 0.98938
The recall is 1.00000
              precision    recall  f1