In [1]:
import pandas as pd

# Load the pipe-separated sample table.
dataset = pd.read_csv("MalwareData.csv", sep="|")

# Split by the label column instead of a hard-coded row position, so the
# split stays correct if the file is re-sorted, filtered, or extended.
# Assumes `legitimate` is 1 for benign files and 0 for malware -- TODO confirm
# against the dataset description.
is_legit = dataset["legitimate"] == 1
legit = dataset[is_legit].drop(columns=["legitimate"])
malware = dataset[~is_legit].drop(columns=["legitimate"])

# Report the size of each class (rows = samples, columns = features).
print("Keseluruhan dataset yang bukan malware: %s samples, %s features"%(legit.shape[0],legit.shape[1]))
print("Keseluruhan dataset yang malware: %s samples, %s features"%(malware.shape[0],malware.shape[1]))

Keseluruhan dataset yang bukan malware: 41323 samples, 56 features
Keseluruhan dataset yang malware: 96724 samples, 56 features


In [2]:
# Lift the column display limit so wide frames are shown in full.
pd.options.display.max_columns = None

In [3]:
# Plain-text preview of the first rows (head() defaults to five).
print(dataset.head())

           Name                               md5  Machine  \
0   memtest.exe  631ea355665f28d4707448e442fbf5b8      332   
1       ose.exe  9d10f99a6712e28f8acd5641e3a7ea6b      332   
2     setup.exe  4d92f518527353c0db88a70fddcfd390      332   
3      DW20.EXE  a41e524f8d45f0074fd07805ff0c9b12      332   
4  dwtrig20.exe  c87e561258f2f8650cef999bf643a731      332   

   SizeOfOptionalHeader  Characteristics  MajorLinkerVersion  \
0                   224              258                   9   
1                   224             3330                   9   
2                   224             3330                   9   
3                   224              258                   9   
4                   224              258                   9   

   MinorLinkerVersion  SizeOfCode  SizeOfInitializedData  \
0                   0      361984                 115712   
1                   0      130560                  19968   
2                   0      517120                 621568   
3 

In [5]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [6]:
# Feature matrix: every column except the two identifier columns and the target.
data_in = dataset.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = dataset['legitimate'].values

# Extra-trees is randomized; a fixed seed keeps the importances -- and hence
# the selected feature subset -- identical across Restart & Run All.
extratree = ExtraTreesClassifier(random_state=42).fit(data_in, labels)

# Reuse the already-fitted model (prefit=True) to keep only the columns it
# considers important (scikit-learn's default threshold applies).
# NOTE(review): the selector is fitted on all rows, including those that later
# end up in the test split, so downstream test scores are somewhat optimistic.
select = SelectFromModel(extratree, prefit=True)
data_in_new = select.transform(data_in)

# Shapes before and after selection.
print(data_in.shape, data_in_new.shape)

(138047, 54) (138047, 13)


In [7]:
import numpy as np
# Column names in exactly the order of the matrix the extra-trees model was
# fitted on. Deriving them from the same drop avoids a positional offset that
# would mislabel features if the column layout of the CSV ever changed.
feature_names = dataset.drop(columns=['Name', 'md5', 'legitimate']).columns

# Number of columns kept by the selector.
features = data_in_new.shape[1]
importances = extratree.feature_importances_
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

# Print rank, feature name, and importance for the top `features` entries.
for rank, idx in enumerate(indices[:features], start=1):
    print("%d" % rank, feature_names[idx], importances[idx])

1 DllCharacteristics 0.1647168705092954
2 Machine 0.09510621811707166
3 Characteristics 0.08344847799000482
4 Subsystem 0.06592151133879208
5 SectionsMaxEntropy 0.05884342159126579
6 MajorSubsystemVersion 0.057626228313038465
7 ImageBase 0.057364230676831135
8 ResourcesMaxEntropy 0.052708998195569415
9 VersionInformationSize 0.05158199368987126
10 SizeOfOptionalHeader 0.04573894063354777
11 ResourcesMinEntropy 0.029350220584065227
12 MajorOperatingSystemVersion 0.027640806219715416
13 SectionsMinEntropy 0.026605546803038508


In [9]:
from sklearn.ensemble import RandomForestClassifier

# NOTE: despite their names, legit_train / legit_test hold the FEATURE rows and
# malware_train / malware_test hold the LABELS of the train / test partitions
# (train_test_split returns X_train, X_test, y_train, y_test). The names are
# kept because the scoring cell below reads them.
#
# random_state makes the 80/20 split reproducible; stratify keeps the
# benign/malware ratio the same in both partitions.
legit_train, legit_test, malware_train, malware_test = train_test_split(
    data_in_new, labels, test_size=0.2, random_state=42, stratify=labels
)

# Seed the forest as well so the fitted model (and its score) is repeatable.
classif = RandomForestClassifier(n_estimators=50, random_state=42)

classif.fit(legit_train, malware_train)

RandomForestClassifier(n_estimators=50)

In [10]:
# Score the random forest on the test partition and show it as a percentage.
print(
    "Nilai akurasi algoritma random forest: ",
    100 * classif.score(legit_test, malware_test),
)

Nilai akurasi algoritma random forest:  99.42774357116987


In [11]:
from sklearn.tree import DecisionTreeClassifier

# This re-split overwrites the train/test variables of the random-forest cell.
# As there, legit_* are the feature rows and malware_* the labels; the names
# are kept because the scoring cell below reads them.
#
# A fixed random_state makes the partition deterministic (so models split with
# the same seed and arguments are evaluated on the same rows); stratify keeps
# the class ratio equal in both partitions.
legit_train, legit_test, malware_train, malware_test = train_test_split(
    data_in_new, labels, test_size=0.2, random_state=42, stratify=labels
)

# Seeded so tie-breaking between equally good splits is repeatable.
decisionif = DecisionTreeClassifier(random_state=42)

decisionif.fit(legit_train, malware_train)

DecisionTreeClassifier()

In [12]:
# Score the decision tree on the test partition and show it as a percentage.
print(
    "Nilai akurasi algoritma decision tree: ",
    100 * decisionif.score(legit_test, malware_test),
)

Nilai akurasi algoritma decision tree:  99.12350597609561
