In [6]:
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample, shuffle
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit, train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_score, recall_score, classification_report

# Load the dynamic features for the selected dataset, attach the labels, and
# build a single stratified 70/30 train/test split.
name = "wild"

# Columns present in the feature CSV that are left out of the training matrix.
excluded_columns = [
    "write_execute_size",
    "initial_iat_dll",
    "initial_iat_func",
    "initial_iat_malicious_func",
    "number_add_exec_permission",
    "number_add_write_permisison",
]

data = pd.read_csv("features/features_" + name + ".csv")
labels = pd.read_csv("labels/labels_" + name + ".csv", header=None, names=["name", "label"])
# Default inner join: only samples present in both files are kept.
data = data.merge(labels, on="name")

names = data.loc[:, "name"]
y = data.loc[:, "label"]
X = data.drop(["name", "label"] + excluded_columns, axis=1)

# Take the feature names from the training matrix itself. Splitting the raw
# header line by hand left a trailing newline on the last name and duplicated
# the exclusion list above.
feature_names = list(X.columns)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [7]:
# Tune a decision tree on the training split with an exhaustive grid search
# (10-fold stratified CV, scored by balanced accuracy) and persist the winner.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_dynamic_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 3} with a score of 0.86314


In [8]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.95000
The balance accuracy is 0.86583
The precision is 0.96722
The recall is 0.93338
              precision    recall  f1-score   support

       False       0.65      0.80      0.72       233
        True       0.97      0.93      0.95      1486

    accuracy                           0.92      1719
   macro avg       0.81      0.87      0.83      1719
weighted avg       0.92      0.92      0.92      1719



In [9]:
# Tune a random forest on the training split with an exhaustive grid search
# (10-fold stratified CV, scored by balanced accuracy) and persist the winner.
# NOTE: this grid is 2 * 11 * 12 * 30 = 7920 candidates, each fitted 10 times.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": list(range(10, 40)),
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_dynamic_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'gini', 'max_depth': 12, 'min_samples_leaf': 2, 'n_estimators': 24} with a score of 0.87656


In [10]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.96798
The balance accuracy is 0.85732
The precision is 0.95966
The recall is 0.97645
              precision    recall  f1-score   support

       False       0.83      0.74      0.78       233
        True       0.96      0.98      0.97      1486

    accuracy                           0.94      1719
   macro avg       0.90      0.86      0.87      1719
weighted avg       0.94      0.94      0.94      1719



In [11]:
# Tune a gradient-boosted tree ensemble on the training split with an
# exhaustive grid search (10-fold stratified CV, scored by balanced accuracy)
# and persist the winner. The number of boosting stages is fixed at 40.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_dynamic_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 7, 'min_samples_leaf': 8, 'n_estimators': 40} with a score of 0.87629


In [12]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.97191
The balance accuracy is 0.87945
The precision is 0.96609
The recall is 0.97779
              precision    recall  f1-score   support

       False       0.85      0.78      0.81       233
        True       0.97      0.98      0.97      1486

    accuracy                           0.95      1719
   macro avg       0.91      0.88      0.89      1719
weighted avg       0.95      0.95      0.95      1719



In [13]:
# Load the static features, attach labels via the labelled dynamic table, and
# build a single stratified 70/30 train/test split.
data = pd.read_csv("features/features_" + name + ".csv")
labels = pd.read_csv("labels/labels_" + name + ".csv", header=None, names=["name", "label"])
data = data.merge(labels, on="name")
data_static = pd.read_csv("features/static_features_" + name + ".csv")
# Labels are joined through `data` (inner join), so only samples that also
# appear in the dynamic feature file are kept.
data_static = data_static.merge(data[["name", "label"]], on="name")

# Rebind `names` so it stays row-aligned with the X / y built in this cell
# rather than pointing at the frame from an earlier cell.
names = data_static.loc[:, "name"]
y = data_static.loc[:, "label"]
X = data_static.drop(["name", "label"], axis=1)

# Take the feature names from the training matrix itself. Splitting the raw
# header line by hand left a trailing newline on the last name.
feature_names = list(X.columns)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [14]:
# Tune a decision tree on the training split with an exhaustive grid search
# (10-fold stratified CV, scored by balanced accuracy) and persist the winner.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_static_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 2} with a score of 0.82573


In [15]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.94049
The balance accuracy is 0.82746
The precision is 0.95619
The recall is 0.92530
              precision    recall  f1-score   support

       False       0.60      0.73      0.66       233
        True       0.96      0.93      0.94      1486

    accuracy                           0.90      1719
   macro avg       0.78      0.83      0.80      1719
weighted avg       0.91      0.90      0.90      1719



In [16]:
# Tune a random forest on the training split with an exhaustive grid search
# (10-fold stratified CV, scored by balanced accuracy) and persist the winner.
# NOTE: this grid is 2 * 11 * 12 * 30 = 7920 candidates, each fitted 10 times.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": list(range(10, 40)),
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_static_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 3, 'n_estimators': 10} with a score of 0.78256


In [17]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.95458
The balance accuracy is 0.76900
The precision is 0.93428
The recall is 0.97577
              precision    recall  f1-score   support

       False       0.78      0.56      0.65       233
        True       0.93      0.98      0.95      1486

    accuracy                           0.92      1719
   macro avg       0.86      0.77      0.80      1719
weighted avg       0.91      0.92      0.91      1719



In [18]:
# Tune a gradient-boosted tree ensemble on the training split with an
# exhaustive grid search (10-fold stratified CV, scored by balanced accuracy)
# and persist the winner. The number of boosting stages is fixed at 40.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_static_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 9, 'min_samples_leaf': 8, 'n_estimators': 40} with a score of 0.82332


In [19]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.96316
The balance accuracy is 0.82513
The precision is 0.95023
The recall is 0.97645
              precision    recall  f1-score   support

       False       0.82      0.67      0.74       233
        True       0.95      0.98      0.96      1486

    accuracy                           0.94      1719
   macro avg       0.88      0.83      0.85      1719
weighted avg       0.93      0.94      0.93      1719



In [20]:
# Load dynamic and static features, join them per sample, attach the labels,
# and build a single stratified 70/30 train/test split.

# Columns from the dynamic feature CSV that are left out of the training matrix.
excluded_columns = [
    "write_execute_size",
    "initial_iat_dll",
    "initial_iat_func",
    "initial_iat_malicious_func",
    "number_add_exec_permission",
    "number_add_write_permisison",
]

data = pd.read_csv("features/features_" + name + ".csv")
data_static = pd.read_csv("features/static_features_" + name + ".csv")
# Static feature names come from the parsed frame; splitting the raw header
# line by hand left a trailing newline on the last name.
static_feature_names = [column for column in data_static.columns if column != "name"]
# Default inner join: only samples present in both feature files are kept.
data = data.merge(data_static, on="name")

labels = pd.read_csv("labels/labels_" + name + ".csv", header=None, names=["name", "label"])
data = data.merge(labels, on="name")

names = data.loc[:, "name"]
y = data.loc[:, "label"]
X = data.drop(["name", "label"] + excluded_columns, axis=1)

# Derive the combined feature names from the training matrix so they always
# match the columns the models are fitted on (including any renaming pandas
# applies during the merge). Kept as a NumPy string array, as before.
feature_names = np.asarray(X.columns, dtype=str)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [21]:
# Tune a decision tree on the training split with an exhaustive grid search
# (10-fold stratified CV, scored by balanced accuracy) and persist the winner.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_both_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 4} with a score of 0.87534


In [22]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.95970
The balance accuracy is 0.89335
The precision is 0.97434
The recall is 0.94549
              precision    recall  f1-score   support

       False       0.71      0.84      0.77       233
        True       0.97      0.95      0.96      1486

    accuracy                           0.93      1719
   macro avg       0.84      0.89      0.86      1719
weighted avg       0.94      0.93      0.93      1719



In [23]:
# Tune a random forest on the training split with an exhaustive grid search
# (10-fold stratified CV, scored by balanced accuracy) and persist the winner.
# NOTE: this grid is 2 * 11 * 12 * 30 = 7920 candidates, each fitted 10 times.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": list(range(10, 40)),
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_both_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 2, 'n_estimators': 35} with a score of 0.89332


In [24]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.97615
The balance accuracy is 0.90735
The precision is 0.97451
The recall is 0.97779
              precision    recall  f1-score   support

       False       0.86      0.84      0.85       233
        True       0.97      0.98      0.98      1486

    accuracy                           0.96      1719
   macro avg       0.91      0.91      0.91      1719
weighted avg       0.96      0.96      0.96      1719



In [25]:
# Tune a gradient-boosted tree ensemble on the training split with an
# exhaustive grid search (10-fold stratified CV, scored by balanced accuracy)
# and persist the winner. The number of boosting stages is fixed at 40.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),
    "max_depth": list(range(1, 13)),
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
# With the default refit=True, best_estimator_ is retrained on all of X_train.
model = search.best_estimator_
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open("models/wild_both_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 12, 'min_samples_leaf': 8, 'n_estimators': 40} with a score of 0.89398


In [26]:
# Score the current `model` on the held-out test split and print each metric.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
# Per-class breakdown.
print(classification_report(y_test, y_pred))

The F1-score is 0.97577
The balance accuracy is 0.91063
The precision is 0.97577
The recall is 0.97577
              precision    recall  f1-score   support

       False       0.85      0.85      0.85       233
        True       0.98      0.98      0.98      1486

    accuracy                           0.96      1719
   macro avg       0.91      0.91      0.91      1719
weighted avg       0.96      0.96      0.96      1719

