This Jupyter notebook represents an improved way to utilize our classifiers. It is created defining functions in Python that execute calculations on the dataframes just before performing the training and testing of the various classifiers. It requires, obviously, data analysis (that has not been included into the notebook because we did it for the other version of the file) but gives the possibility of a better and cleaner representation of the classifiers.

In [1]:
# import all libraries

# cleaning data (we already analyzed the data in the other notebooks)
# need to perform some operation

# import data (note that this time we need to perform splitting to obtain train and test sets) - convert it into pd.DataFrame
# work on the column -TelephonyManager.getSimCountryIso- 
# - transform its element into string object (so we solved different types in the column)
# - dummy the column to obtain 3 columns 

# - NB. drop the third (the last of the new modified dataset) column obtained to drop '?' values during the definition of X and y

# prepare X and y, splitting the data (0.7 train 0.3 test)

# define the classifiers (with best hyperparameters found in the other notebooks)
# print the results 

##### Libraries

In [2]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, recall_score, precision_score, confusion_matrix, classification_report, 
                            balanced_accuracy_score, f1_score)

#####  Data

In [3]:
dataset = pd.read_csv('../ASSIGNMENT/data/AndroidMalware/drebin-215-dataset-5560malware-9476-benign.csv')

  interactivity=interactivity, compiler=compiler, result=result)


##### Transformation

In [4]:
def problematic_col(data):
    # string 
    data['TelephonyManager.getSimCountryIso'] = data['TelephonyManager.getSimCountryIso'].astype('str')
    # dummy
    data = pd.get_dummies(data, columns=['TelephonyManager.getSimCountryIso'])
    
    return data



#####  Designer of X and y

In [5]:
def designer_X_y(data):
    
    global X_train,y_train,X_test,y_test
    
    X_train, X_test, y_train, y_test = train_test_split(data.drop(['TelephonyManager.getSimCountryIso','class'],axis=1), data['class'], test_size=0.3)
    
    return X_train,y_train,X_test,y_test



##### Data preparator

In [6]:
def data_preparator(data):
    
    problematic_col(data)
    designer_X_y(data)
    
    return data



##### Classifier

In [7]:
# Decision Trees
def decision_trees(data):
    print('\ndecision tree classifier:\n')
    global clf_dt, y_pred_dt
    data_preparator(data)
    
    start_time = time.time()


    clf_dt = DecisionTreeClassifier(criterion='entropy',max_depth=300,max_leaf_nodes=300,splitter='best').fit(X_train,y_train)
    y_pred_dt = clf_dt.predict(X_test)

    elapsed_time = time.time() - start_time
    print('predicting -class-:\n')
    print("elapsed for training and predict: %.2f seconds" % elapsed_time,'\n')
    print('classification report:\n',classification_report(y_pred_dt,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_dt))
    
    return clf_dt,y_pred_dt
    
        
# Random Forest
def random_forest(data):
    print('\nrandom forest classifier:\n')
    global clf_rf, y_pred_rf
    data_preparator(data)
    
    start_time = time.time()

    clf_rf = RandomForestClassifier(criterion='gini',verbose=1,warm_start=True,max_depth=340,max_leaf_nodes=None,n_estimators=200).fit(X_train,y_train)
    y_pred_rf = clf_rf.predict(X_test)

    elapsed_time = time.time() - start_time
    print('predicting -class-:\n')
    print("elapsed for training and predict: %.2f seconds" % elapsed_time,'\n')
    print('classification report:\n',classification_report(y_pred_rf,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_rf))
    
    return clf_rf, y_pred_rf


# MLP    
def mlp(data):
    print('\nmulti-layer perceptron classifier:\n')
    global clf_mlp, y_pred_mlp
    data_preparator(data)
    
    start_time = time.time()
    
    clf_mlp = MLPClassifier(hidden_layer_sizes=(300, ), activation='relu', solver='adam',learning_rate='adaptive',verbose=True,tol=0.000001).fit(X_train,y_train)
    y_pred_mlp = clf_mlp.predict(X_test)

    elapsed_time = time.time() - start_time
    print('predicting -class-:\n')
    print("elapsed for training and predict: %.2f seconds" % elapsed_time,'\n')
    print('classification report:\n',classification_report(y_pred_mlp,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_mlp))
    
    return clf_mlp,y_pred_mlp


# SVC 
def support_vector_classifier(data):
    print('\nsupport vector classifier:\n')
    global clf_svc, y_pred_svc
    data_preparator(data)
    
    start_time = time.time()

    clf_svc = SVC(verbose=1,kernel='rbf',gamma='scale',decision_function_shape='ovo').fit(X_train,y_train)
    y_pred_svc = clf_svc.predict(X_test)

    elapsed_time = time.time() - start_time
    print('predicting -class-:\n')
    print("elapsed for training and predict: %.2f seconds" % elapsed_time,'\n')
    print('classification report:\n',classification_report(y_pred_svc,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_svc))
    
    return clf_mlp,y_pred_svc



In [8]:
decision_trees(dataset)


decision tree classifier:

predicting -class-:

elapsed for training and predict: 0.31 seconds 

classification report:
               precision    recall  f1-score   support

           B    0.97793   0.98309   0.98050      2839
           S    0.97103   0.96232   0.96666      1672

    accuracy                        0.97539      4511
   macro avg    0.97448   0.97271   0.97358      4511
weighted avg    0.97537   0.97539   0.97537      4511
 

confusion matrix:
 [[2791   63]
 [  48 1609]]


(DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=300,
                        max_features=None, max_leaf_nodes=300,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 array(['S', 'S', 'B', ..., 'S', 'B', 'S'], dtype=object))

In [9]:
random_forest(dataset)


random forest classifier:



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    3.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


predicting -class-:

elapsed for training and predict: 3.75 seconds 

classification report:
               precision    recall  f1-score   support

           B    0.99439   0.98303   0.98868      2887
           S    0.97043   0.99015   0.98019      1624

    accuracy                        0.98559      4511
   macro avg    0.98241   0.98659   0.98443      4511
weighted avg    0.98577   0.98559   0.98562      4511
 

confusion matrix:
 [[2838   16]
 [  49 1608]]


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.1s finished


(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=340, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=200,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=1, warm_start=True),
 array(['B', 'S', 'S', ..., 'S', 'S', 'B'], dtype=object))

In [10]:
mlp(dataset)


multi-layer perceptron classifier:

Iteration 1, loss = 0.28739907
Iteration 2, loss = 0.11681529
Iteration 3, loss = 0.08290392
Iteration 4, loss = 0.06776637
Iteration 5, loss = 0.05800263
Iteration 6, loss = 0.05269036
Iteration 7, loss = 0.04769571
Iteration 8, loss = 0.04316603
Iteration 9, loss = 0.03996329
Iteration 10, loss = 0.03752062
Iteration 11, loss = 0.03504785
Iteration 12, loss = 0.03188459
Iteration 13, loss = 0.02965732
Iteration 14, loss = 0.02848511
Iteration 15, loss = 0.02591183
Iteration 16, loss = 0.02494573
Iteration 17, loss = 0.02251050
Iteration 18, loss = 0.02117154
Iteration 19, loss = 0.01962463
Iteration 20, loss = 0.01904531
Iteration 21, loss = 0.01758673
Iteration 22, loss = 0.01623596
Iteration 23, loss = 0.01514944
Iteration 24, loss = 0.01461647
Iteration 25, loss = 0.01377075
Iteration 26, loss = 0.01282131
Iteration 27, loss = 0.01224537
Iteration 28, loss = 0.01170088
Iteration 29, loss = 0.01147859
Iteration 30, loss = 0.01070725
Iteration 31

(MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=(300,), learning_rate='adaptive',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=None, shuffle=True, solver='adam', tol=1e-06,
               validation_fraction=0.1, verbose=True, warm_start=False),
 array(['B', 'B', 'S', ..., 'S', 'B', 'B'], dtype='<U1'))

In [11]:
support_vector_classifier(dataset)


support vector classifier:

[LibSVM]predicting -class-:

elapsed for training and predict: 6.96 seconds 

classification report:
               precision    recall  f1-score   support

           B    0.99161   0.97862   0.98507      2900
           S    0.96240   0.98510   0.97362      1611

    accuracy                        0.98094      4511
   macro avg    0.97701   0.98186   0.97935      4511
weighted avg    0.98118   0.98094   0.98098      4511
 

confusion matrix:
 [[2838   24]
 [  62 1587]]


(MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=(300,), learning_rate='adaptive',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=None, shuffle=True, solver='adam', tol=1e-06,
               validation_fraction=0.1, verbose=True, warm_start=False),
 array(['S', 'B', 'B', ..., 'B', 'S', 'B'], dtype=object))

###### @falble

###### @gussr

###### @FiloLafro