In [1]:
import pandas as pd

# Full data set; the 'label' column separates the two groups below.
data = pd.read_csv("../Datasets/MalDist_Dataset.csv", sep=",")

# Row masks on the binary label: 1 -> `mal`, 0 -> `legit`.
is_mal = data['label'] == 1
is_legit = data['label'] == 0

# Only 'label' is dropped here, so the column count reported as "features"
# still includes every other column of `data` (a later cell additionally
# drops 'file_name' and 'family' before modelling).
mal = data[is_mal].drop(columns=['label'])
legit = data[is_legit].drop(columns=['label'])

for group_name, group_frame in (("legit", legit), ("mal", mal)):
    n_samples, n_columns = group_frame.shape
    print(f"The shape of the {group_name} dataset is {n_samples} samples, {n_columns} features")

The shape of the legit dataset is 3050 samples, 336 features
The shape of the mal dataset is 67166 samples, 336 features


In [3]:
# One CSV per group; the dict keys double as the names used in the printed summary.
dataset_paths = {
    "dridex": "../Datasets/Dridex.csv",
    "emotet": "../Datasets/Emotet.csv",
    "hancitor": "../Datasets/Hancitor.csv",
    "valak": "../Datasets/Valak.csv",
    "benign": "../Datasets/Benign.csv",
}
family_frames = {
    family: pd.read_csv(csv_path, sep=",")
    for family, csv_path in dataset_paths.items()
}

# Keep the individual module-level names available for use elsewhere in the notebook.
dridex = family_frames["dridex"]
emotet = family_frames["emotet"]
hancitor = family_frames["hancitor"]
valak = family_frames["valak"]
benign = family_frames["benign"]

for family, frame in family_frames.items():
    n_samples, n_columns = frame.shape
    print(f"The shape of the {family} dataset is {n_samples} samples, {n_columns} features")

The shape of the dridex dataset is 1987 samples, 337 features
The shape of the emotet dataset is 5597 samples, 337 features
The shape of the hancitor dataset is 58431 samples, 337 features
The shape of the valak dataset is 1151 samples, 337 features
The shape of the benign dataset is 3050 samples, 337 features


In [3]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import numpy as np

# Feature matrix: every column of `data` except the identifier and target
# columns. `labels` is the binary target taken from the 'label' column.
data_in = data.drop(['file_name', 'label', 'family'], axis=1).values
labels = data['label'].values

# Tree-based feature selection: fit an extra-trees ensemble and keep only the
# features that SelectFromModel retains based on the fitted importances.
# random_state is fixed so the selected feature set (and everything that
# depends on it downstream) is reproducible across kernel restarts.
#
# NOTE(review): the selector is fitted on ALL rows, including those that are
# later held out as the test split, so the reported test scores are
# optimistically biased (feature-selection leakage). Fitting the selector on
# the training split only would remove this; left as-is here because the
# following cells consume `data_in_new` for the whole data set.
extratrees = ExtraTreesClassifier(random_state=42).fit(data_in, labels)
select = SelectFromModel(extratrees, prefit=True)
data_in_new = select.transform(data_in)

print(data_in.shape, data_in_new.shape)

(69576, 334) (69576, 87)


In [6]:
import numpy as np

# Column names of the model inputs, obtained with the same drop that produced
# `data_in`, so position i here is guaranteed to be the feature behind
# importances[i]. (Indexing `data.columns` with a fixed offset only works if
# the dropped columns happen to sit at specific positions in the CSV.)
feature_names = data.drop(['file_name', 'label', 'family'], axis=1).columns

# Number of features kept by the selector; the ranking below lists that many.
features = data_in_new.shape[1]
importances = extratrees.feature_importances_
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

for f in range(features):
    print(f"{f+1}, {feature_names[indices[f]]}, {importances[indices[f]]}")

1, packet_0_protocol_3, 0.07434112390795918
2, packet_2_protocol_3, 0.051606682397892935
3, packet_0_protocol_8, 0.048142857089074266
4, packet_3_protocol_3, 0.045298013963794535
5, packet_1_protocol_8, 0.027222018891014013
6, packet_8_protocol_3, 0.02516582105805351
7, packet_27_protocol_4, 0.018457024239617863
8, packet_7_protocol_3, 0.01758976690464767
9, packet_6_protocol_3, 0.016083557643283838
10, packet_0_protocol_5, 0.01461509376485666
11, packet_2_protocol_5, 0.01251061414897932
12, packet_2_protocol_8, 0.012348330836148806
13, packet_25_protocol_9, 0.01083402940704569
14, packet_5_protocol_3, 0.010491204562238256
15, packet_3_protocol_8, 0.010196399713088902
16, packet_9_protocol_3, 0.009789100093053296
17, packet_10_protocol_8, 0.009386943197082708
18, packet_1_protocol_3, 0.00935394905235704
19, packet_28_protocol_0, 0.009270948378605582
20, packet_30_protocol_9, 0.009039070266903194
21, packet_28_protocol_4, 0.00894768433576996
22, packet_27_protocol_0, 0.00877488678608168

In [7]:
from sklearn.ensemble import RandomForestClassifier

# NOTE: despite their names, these are NOT per-class subsets.
# train_test_split(X, y) returns (X_train, X_test, y_train, y_test), so:
#   legit_train / legit_test -> feature matrices (train / test)
#   mal_train   / mal_test   -> label vectors    (train / test)
# The names are kept because the following cells refer to them.
#
# stratify=labels keeps the class ratio identical in both splits (the two
# classes are heavily imbalanced), and random_state makes the split and the
# forest reproducible across runs.
legit_train, legit_test, mal_train, mal_test = train_test_split(
    data_in_new, labels, test_size=0.2, stratify=labels, random_state=42
)
classif = RandomForestClassifier(n_estimators=50, random_state=42)

classif.fit(legit_train, mal_train)

In [8]:
# Score of the fitted classifier on the held-out split, scaled to a percentage.
# (`legit_test` holds the test features and `mal_test` the test labels.)
rf_score_pct = classif.score(legit_test, mal_test) * 100
print(f"The score of the algorithm: {rf_score_pct}")

The score of the algorithm: 99.98562805403853


In [9]:
from sklearn.metrics import confusion_matrix

# Predict on the held-out features and tabulate true labels (first argument)
# against predicted labels (second argument).
result = classif.predict(legit_test)
conf_mat = confusion_matrix(mal_test, result)


for summary_item in (conf_mat.shape, type(conf_mat), conf_mat):
    print(summary_item)

# Row 0 holds the samples whose true label is 0, row 1 those whose true label
# is 1; the off-diagonal entry of each row is the misclassified count, reported
# here as a percentage of that row's total.
true_0_row = conf_mat[0]
false_positive_pct = true_0_row[1] / true_0_row.sum() * 100
print("False positives: ", false_positive_pct)

true_1_row = conf_mat[1]
false_negative_pct = true_1_row[0] / true_1_row.sum() * 100
print("False negatives: ", false_negative_pct)

(2, 2)
<class 'numpy.ndarray'>
[[  604     2]
 [    0 13310]]
False positives:  0.33003300330033003
False negatives:  0.0


In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Second model trained on the same split as the random forest.
# `legit_train` is the training feature matrix and `mal_train` the training
# labels (the names come from the earlier train_test_split unpacking).
# random_state is fixed so repeated runs build the same ensemble.
grad_boost = GradientBoostingClassifier(n_estimators=50, random_state=42)
grad_boost.fit(legit_train, mal_train)

In [11]:
# Score of the gradient boosting model on the held-out split, as a percentage.
grad_boost_score_pct = grad_boost.score(legit_test, mal_test) * 100
print("The score of the Gradient Boosting Classifier is: ", grad_boost_score_pct)

The score of the Gradient Boosting Classifier is:  99.95688416211556


## Maldist Model

Here we are going to follow the Maldist model by dividing the dataset into 4 classes: Benign, Dridex, Emotet and Hancitor.
But first we need to balance the data.
After that we can train several models and compare their Accuracy, Precision, Recall and F1-Score.
Then we can build a confusion matrix.