In [2]:
import sys
import shap
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample, shuffle
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_score, recall_score, classification_report

# --- Dynamic-feature experiment: load data and build the train/test split ---

# Columns of the dynamic feature CSVs that are excluded from the model input.
# (The spelling "permisison" matches the column name used in the CSV header.)
dropped_columns = [
    "name",
    "write_execute_size",
    "initial_iat_dll",
    "initial_iat_func",
    "initial_iat_malicious_func",
    "number_add_exec_permission",
    "number_add_write_permisison",
]

# Feature names come from the CSV header row. readline() keeps the line
# terminator, so strip it first; otherwise the last name ends in "\n".
with open("features/features_packed.csv", "r") as f:
  line = f.readline().rstrip("\r\n")
  feature_names = line.split(",")
  for column in dropped_columns:
    # list.remove raises ValueError if an expected column is missing.
    feature_names.remove(column)

packed = pd.read_csv("features/features_packed.csv")
notpacked = pd.read_csv("features/features_notpacked.csv")

packed1 = pd.read_csv("features/static_features_packed.csv")
notpacked1 = pd.read_csv("features/static_features_notpacked.csv")

# Keep only the samples whose `name` also appears in the static feature files
# (merge defaults to an inner join), so every experiment uses the same samples.
packed = packed.merge(packed1[["name"]], on='name')
notpacked = notpacked.merge(notpacked1[["name"]], on='name')

X = pd.concat([packed, notpacked], ignore_index=True)
X = X.drop(columns=dropped_columns)
# Labels follow the concat order: 1.0 for packed rows, 0.0 for not-packed rows.
y = np.append(np.ones(len(packed)), np.zeros(len(notpacked)))

# Single stratified 70/30 split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y[train_index], y[test_index]

In [3]:
# Grid-search a decision tree on the current training split, scored by
# balanced accuracy under 10-fold stratified cross-validation.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_dynamic_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 2} with a score of 0.98398


In [4]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 0.99465
The balance accuracy is 0.97435
The precision is 0.99112
The recall is 0.99821
              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97       101
         1.0       0.99      1.00      0.99       559

    accuracy                           0.99       660
   macro avg       0.99      0.97      0.98       660
weighted avg       0.99      0.99      0.99       660



In [5]:
# Grid-search a random forest on the current training split, scored by
# balanced accuracy under 10-fold stratified cross-validation.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
    "n_estimators": list(range(10, 40)),     # 10..39
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_dynamic_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 2, 'n_estimators': 39} with a score of 0.99366


In [6]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 0.99732
The balance accuracy is 0.98515
The precision is 0.99466
The recall is 1.00000
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.98       101
         1.0       0.99      1.00      1.00       559

    accuracy                           1.00       660
   macro avg       1.00      0.99      0.99       660
weighted avg       1.00      1.00      1.00       660



In [7]:
# Grid-search a gradient-boosted tree ensemble on the current training split,
# scored by balanced accuracy under 10-fold stratified cross-validation.
# The number of boosting stages is fixed at 40; only tree shape is tuned.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_dynamic_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 5, 'min_samples_leaf': 12, 'n_estimators': 40} with a score of 0.99101


In [8]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 0.99643
The balance accuracy is 0.98425
The precision is 0.99465
The recall is 0.99821
              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98       101
         1.0       0.99      1.00      1.00       559

    accuracy                           0.99       660
   macro avg       0.99      0.98      0.99       660
weighted avg       0.99      0.99      0.99       660



In [9]:
# --- Static-feature experiment: load data and build the train/test split ---

# Feature names come from the CSV header row. readline() keeps the line
# terminator, so strip it first; otherwise the last name ends in "\n".
with open("features/static_features_packed.csv", "r") as f:
  line = f.readline().rstrip("\r\n")
  feature_names = line.split(",")
  feature_names.remove("name")  # sample identifier, not a feature

packed = pd.read_csv("features/static_features_packed.csv")
notpacked = pd.read_csv("features/static_features_notpacked.csv")

packed1 = pd.read_csv("features/features_packed.csv")
notpacked1 = pd.read_csv("features/features_notpacked.csv")

# Keep only the samples whose `name` also appears in the dynamic feature files
# (merge defaults to an inner join), so every experiment uses the same samples.
packed = packed.merge(packed1[["name"]], on='name')
notpacked = notpacked.merge(notpacked1[["name"]], on='name')

X = pd.concat([packed, notpacked], ignore_index=True)
X = X.drop(columns=["name"])
# Labels follow the concat order: 1.0 for packed rows, 0.0 for not-packed rows.
y = np.append(np.ones(len(packed)), np.zeros(len(notpacked)))

# Single stratified 70/30 split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y[train_index], y[test_index]

In [10]:
# Grid-search a decision tree on the current training split, scored by
# balanced accuracy under 10-fold stratified cross-validation.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_static_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 2} with a score of 0.99092


In [11]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 0.99821
The balance accuracy is 0.99010
The precision is 0.99643
The recall is 1.00000
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99       101
         1.0       1.00      1.00      1.00       559

    accuracy                           1.00       660
   macro avg       1.00      0.99      0.99       660
weighted avg       1.00      1.00      1.00       660



In [12]:
# Grid-search a random forest on the current training split, scored by
# balanced accuracy under 10-fold stratified cross-validation.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
    "n_estimators": list(range(10, 40)),     # 10..39
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_static_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2, 'n_estimators': 22} with a score of 0.99783


In [13]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 0.99732
The balance accuracy is 0.98515
The precision is 0.99466
The recall is 1.00000
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.98       101
         1.0       0.99      1.00      1.00       559

    accuracy                           1.00       660
   macro avg       1.00      0.99      0.99       660
weighted avg       1.00      1.00      1.00       660



In [14]:
# Grid-search a gradient-boosted tree ensemble on the current training split,
# scored by balanced accuracy under 10-fold stratified cross-validation.
# The number of boosting stages is fixed at 40; only tree shape is tuned.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_static_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 3, 'min_samples_leaf': 10, 'n_estimators': 40} with a score of 0.99348


In [15]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 1.00000
The balance accuracy is 1.00000
The precision is 1.00000
The recall is 1.00000
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       101
         1.0       1.00      1.00      1.00       559

    accuracy                           1.00       660
   macro avg       1.00      1.00      1.00       660
weighted avg       1.00      1.00      1.00       660



In [16]:
# --- Combined (dynamic + static) experiment: load data and build the split ---

# Columns of the dynamic feature CSVs that are excluded from the model input.
# (The spelling "permisison" matches the column name used in the CSV header.)
dropped_columns = [
    "name",
    "write_execute_size",
    "initial_iat_dll",
    "initial_iat_func",
    "initial_iat_malicious_func",
    "number_add_exec_permission",
    "number_add_write_permisison",
]

# Feature names come from the CSV header rows. readline() keeps the line
# terminator, so strip it first; otherwise the last name of each header ends
# in "\n", leaving a corrupted name in the middle and at the end of the
# combined list.
with open("features/static_features_packed.csv", "r") as f:
  line = f.readline().rstrip("\r\n")
  static_feature_names = line.split(",")
  # Remove the identifier column by name rather than by position, so this
  # stays correct even if `name` is not the first column.
  static_feature_names.remove("name")

with open("features/features_packed.csv", "r") as f:
  line = f.readline().rstrip("\r\n")
  feature_names = line.split(",")
  for column in dropped_columns:
    # list.remove raises ValueError if an expected column is missing.
    feature_names.remove(column)

# Dynamic names first, then static names, matching the merge order below.
feature_names = np.concatenate((feature_names, static_feature_names), axis=None)

# Join dynamic and static features per sample; merge defaults to an inner
# join, so only samples present in both files are kept.
packed = pd.read_csv("features/features_packed.csv")
packed_static = pd.read_csv("features/static_features_packed.csv")
packed = packed.merge(packed_static, on='name')

notpacked = pd.read_csv("features/features_notpacked.csv")
notpacked_static = pd.read_csv("features/static_features_notpacked.csv")
notpacked = notpacked.merge(notpacked_static, on='name')

X = pd.concat([packed, notpacked], ignore_index=True)
X = X.drop(columns=dropped_columns)
# Labels follow the concat order: 1.0 for packed rows, 0.0 for not-packed rows.
y = np.append(np.ones(len(packed)), np.zeros(len(notpacked)))

# Single stratified 70/30 split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y[train_index], y[test_index]

In [17]:
# Grid-search a decision tree on the current training split, scored by
# balanced accuracy under 10-fold stratified cross-validation.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_both_DT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 3} with a score of 0.99375


In [18]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 0.99732
The balance accuracy is 0.98515
The precision is 0.99466
The recall is 1.00000
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.98       101
         1.0       0.99      1.00      1.00       559

    accuracy                           1.00       660
   macro avg       1.00      0.99      0.99       660
weighted avg       1.00      1.00      1.00       660



In [19]:
# Grid-search a random forest on the current training split, scored by
# balanced accuracy under 10-fold stratified cross-validation.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
    "n_estimators": list(range(10, 40)),     # 10..39
}
rf = RandomForestClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_both_RF.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 2, 'n_estimators': 12} with a score of 0.99792


In [20]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 0.99821
The balance accuracy is 0.99010
The precision is 0.99643
The recall is 1.00000
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99       101
         1.0       1.00      1.00      1.00       559

    accuracy                           1.00       660
   macro avg       1.00      0.99      0.99       660
weighted avg       1.00      1.00      1.00       660



In [21]:
# Grid-search a gradient-boosted tree ensemble on the current training split,
# scored by balanced accuracy under 10-fold stratified cross-validation.
# The number of boosting stages is fixed at 40; only tree shape is tuned.
param_grid = {
    "min_samples_leaf": list(range(2, 13)),  # 2..12
    "max_depth": list(range(1, 13)),         # 1..12
    "n_estimators": [40],
}
gbdt = GradientBoostingClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv=cv, scoring="balanced_accuracy", n_jobs=-1)
search.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.5f"% (search.best_params_, search.best_score_))
model = search.best_estimator_

# Persist the selected estimator. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models/clean_both_GBDT.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

The best parameters are {'max_depth': 3, 'min_samples_leaf': 12, 'n_estimators': 40} with a score of 1.00000


In [22]:
# Score whichever estimator `model` currently refers to on the test split:
# four headline metrics, then the per-class classification report.
y_pred = model.predict(X_test)
for template, scorer in (
    ("The F1-score is %0.5f", f1_score),
    ("The balance accuracy is %0.5f", balanced_accuracy_score),
    ("The precision is %0.5f", precision_score),
    ("The recall is %0.5f", recall_score),
):
    print(template % scorer(y_test, y_pred))
print(classification_report(y_test, y_pred))

The F1-score is 0.99821
The balance accuracy is 0.99010
The precision is 0.99643
The recall is 1.00000
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99       101
         1.0       1.00      1.00      1.00       559

    accuracy                           1.00       660
   macro avg       1.00      0.99      0.99       660
weighted avg       1.00      1.00      1.00       660

