# Classifier Comparison

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score,cross_validate

from xgboost import XGBClassifier

import warnings; warnings.simplefilter('ignore')

In [19]:
# Load the dataset; the ".zip" suffix lets pandas infer the compression.
df = pd.read_csv(filepath_or_buffer="malware_classification_tf_idf.csv.zip")

In [28]:
# Row labels for the results table. Order matters: the list is paired
# position-by-position with `classifiers` in the evaluation loop.
names = [
    "KNN",
    "Linear SVM",
    "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "MLP Classifier",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "XG Boost",
]

In [29]:
# Fixed seed shared by every estimator that draws random numbers, so the
# scores are identical on each Restart & Run All.
RANDOM_STATE = 42

# One estimator per entry of `names`, in the same order (the evaluation loop
# zips the two lists together).
classifiers = [
    KNeighborsClassifier(3),  # first positional argument is n_neighbors
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),  # no kernel given, so SVC falls back to its default (RBF)
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=RANDOM_STATE
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    XGBClassifier(random_state=RANDOM_STATE),
]


In [21]:
# Target vector: the 'Label' column.
Y1 = df["Label"]

# Feature matrix: every remaining column once 'FileName' and 'Label' are removed.
X1 = df.drop(["FileName", "Label"], axis=1)

In [32]:
# Show the feature matrix (df without 'FileName' and 'Label'); the notebook
# renders a truncated view of the frame.
X1

Unnamed: 0,00,01,02,03,04,06,10,20,8b,cc,code,data,db,dd,eax,ff,mov,rata,rdata,text
0,0.647438,0.022379,0.010803,0.006945,0.013119,0.000773,0.077168,0.044757,0.040127,0.019410,0.045584,0.135585,0.108807,0.081798,0.106884,0.083341,0.069435,0.000000,0.365760,0.611620
1,0.302425,0.019032,0.010945,0.008344,0.020346,0.008998,0.017775,0.008430,0.085415,0.002314,0.048811,0.091719,0.062382,0.045594,0.142130,0.140068,0.141046,0.000000,0.000000,0.907373
2,0.419734,0.022625,0.011212,0.004338,0.025295,0.005479,0.084427,0.010011,0.103916,0.092530,0.040762,0.123533,0.199088,0.061135,0.142441,0.065940,0.162954,0.000000,0.340417,0.754014
3,0.237458,0.023103,0.010835,0.008384,0.017643,0.006709,0.017124,0.007279,0.085094,0.002497,0.060484,0.100979,0.091358,0.018029,0.178064,0.195123,0.139903,0.000000,0.000000,0.907228
4,0.418235,0.037460,0.009134,0.015058,0.028882,0.006302,0.063812,0.010245,0.087325,0.053584,0.063148,0.217987,0.204705,0.016786,0.164531,0.065170,0.163720,0.000000,0.061497,0.807603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816,0.292281,0.001950,0.001802,0.001575,0.001410,0.001978,0.001673,0.001478,0.002573,0.001550,0.000490,0.678781,0.673569,0.001640,0.001744,0.002042,0.002261,0.000000,0.001470,0.009213
817,0.865998,0.043174,0.024708,0.022167,0.039768,0.027725,0.018983,0.014117,0.067391,0.034718,0.001863,0.001735,0.013363,0.316630,0.006870,0.160982,0.000250,0.000000,0.003771,0.334678
818,0.336364,0.003180,0.001766,0.001887,0.001926,0.001734,0.002504,0.001607,0.003626,0.001496,0.003160,0.668594,0.661004,0.003502,0.008713,0.006456,0.006802,0.000000,0.008104,0.051145
819,0.868732,0.042112,0.024526,0.021180,0.039345,0.029359,0.018765,0.012336,0.065833,0.010041,0.001649,0.003777,0.014541,0.314224,0.006335,0.157927,0.000234,0.000000,0.004994,0.333407


In [31]:
from beautifultable import BeautifulTable

# (scikit-learn scorer name, column title) for every metric in the report.
metric_columns = [
    ("accuracy", "Accuracy"),
    ("f1_macro", "F1"),
    ("precision_macro", "Precision"),
    ("recall_macro", "Recall"),
]
scorer_names = [scorer for scorer, _ in metric_columns]

table = BeautifulTable()
for model, name in zip(classifiers, names):
    # Scale inside the pipeline so the scaler is fitted on the training part
    # of each fold only, then fit the current model on the scaled features.
    classifier_pipeline = make_pipeline(preprocessing.StandardScaler(), model)

    # A single 10-fold run evaluates all metrics on the same fitted models,
    # instead of refitting the pipeline once per metric.
    cv_results = cross_validate(
        classifier_pipeline, X1, Y1, cv=10, scoring=scorer_names
    )

    # cross_validate stores each metric's per-fold scores under "test_<scorer>".
    table.rows.append(
        [cv_results["test_" + scorer].mean() for scorer in scorer_names]
    )

table.rows.header = names
table.columns.header = [title for _, title in metric_columns]
print(table)

+----------------+----------+-------+-----------+--------+
|                | Accuracy |  F1   | Precision | Recall |
+----------------+----------+-------+-----------+--------+
|      KNN       |  0.901   | 0.88  |   0.889   | 0.888  |
+----------------+----------+-------+-----------+--------+
|   Linear SVM   |  0.783   | 0.715 |   0.746   | 0.723  |
+----------------+----------+-------+-----------+--------+
|    RBF SVM     |  0.807   | 0.782 |   0.851   | 0.776  |
+----------------+----------+-------+-----------+--------+
| Decision Tree  |  0.776   | 0.723 |   0.749   | 0.741  |
+----------------+----------+-------+-----------+--------+
| Random Forest  |  0.868   | 0.828 |   0.852   | 0.831  |
+----------------+----------+-------+-----------+--------+
| MLP Classifier |  0.874   | 0.839 |   0.862   | 0.849  |
+----------------+----------+-------+-----------+--------+
|    AdaBoost    |  0.306   | 0.207 |   0.234   | 0.279  |
+----------------+----------+-------+-----------+-------