This Jupyter notebook represents an improved way to utilize our classifiers. It is created defining functions in Python that execute calculations on the dataframes just before performing the training and testing of the various classifiers. It requires, obviously, data analysis (that has not been included into the notebook because we did it for the other version of the file) but gives the possibility of a better and cleaner representation of the classifiers.

In [1]:
# import all libraries

# cleaning data (we already analyzed the data in the other notebooks)
# need to perform some operations:

# import data (both train and test) - convert it into pd.dataframe
# transform datetime add two new columns
# transform bytes
# create ip category
# create cum sum ip addr
# dummy variable
# (no split because we already have two datasets)
# dropping useless columns
# scaling data

# define the classifiers (with best hyperparameters found in the other notebooks)
# print the results

# adding the opportunity to predict also attackType feature
# define new classifiers 
# print again the results

##### Libraries

In [2]:
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, recall_score, precision_score, confusion_matrix, classification_report, 
                            balanced_accuracy_score)

##### Data

In [3]:
# importing data 
train_dataframe = pd.read_csv('../ASSIGNMENT/data/NetworkTraffic/CIDDS-001-internal-week1_10pcSample.csv')
test_dataframe = pd.read_csv('../ASSIGNMENT/data/NetworkTraffic/CIDDS-001-internal-week2_10pcSample.csv')

numeric_cols = ['Duration','final_bytes','Packets','Cum Count Src IP Addr (10 seconds)','Cum Count Dst IP Addr (10 seconds)']
nominal_cols = ['Proto','Flags','Src IP category','Dst IP category','day-of-week','time-of-day']
other_cols = ['Src IP Addr','Dst IP Addr','Src Pt','Dst Pt','Tos','Date first seen']
label_cols = ['class','attackType','attackID','attackDescription']

  interactivity=interactivity, compiler=compiler, result=result)


##### Transformation

In [4]:
# transforming

# datetime
def datetime_transformator(train,test):
    
    for dataset in (train,test):
        
        dataset['Date first seen'] = pd.to_datetime(dataset['Date first seen'])
        dataset['day-of-week'] = dataset['Date first seen'].dt.dayofweek
        dataset['time-of-day'] = dataset['Date first seen'].dt.hour
    
    return train,test


# datetime
def bytes_transformator(train,test):
    
    for dataset in (train,test):
    
        bytes_dataset_values = []
        eval_dataset_bytes = []
        
        for value in dataset['Bytes'].astype('str').values:
            
            if value[-1] == 'M':
                new = value.replace('M','* 1000000')
                bytes_dataset_values.append(new)
            else:
                bytes_dataset_values.append(value)
            
        for value in bytes_dataset_values:
            final = eval(value)
            eval_dataset_bytes.append(final)
            
        dataset['final_bytes'] = eval_dataset_bytes

    return train,test


# working with ip
def add_ip_columns(train,test):
    
    for dataset in (train,test):
        
        src_ip_category = []
        
        for ip_addr in dataset['Src IP Addr']:
            if ip_addr == 'DNS':
                #return 'DNS'
                src_ip_category.append('DNS')
            elif ip_addr == 'EXT_SERVER':
                #return 'EXT_SERVER'
                src_ip_category.append('EXT_SERVER')
            elif len(ip_addr.split('.')) == 4:
                #return 'private'
                src_ip_category.append('private')
            elif len(ip_addr.split('_')) == 2:
                #return 'public'
                src_ip_category.append('public')
            else:
                #return '-'
                src_ip_category.append('-')
        
        dataset['Src IP category'] = src_ip_category
        
        dst_ip_category = []

        for ip_addr in dataset['Dst IP Addr']:
            if ip_addr == 'DNS':
                #return 'DNS'
                dst_ip_category.append('DNS')
            elif ip_addr == 'EXT_SERVER':
                #return 'EXT_SERVER'
                    dst_ip_category.append('EXT_SERVER')
            elif len(ip_addr.split('.')) == 4:
                #return 'private'
                    dst_ip_category.append('private')
            elif len(ip_addr.split('_')) == 2:
                #return 'public'
                    dst_ip_category.append('public')
            else:
                #return '-'
                    dst_ip_category.append('-')

        dataset['Dst IP category'] = dst_ip_category
        
        dataset['Cum Count Src IP Addr (10 seconds)'] = dataset.groupby(['Src IP Addr', pd.Grouper(freq='10S',key='Date first seen')]).cumcount()
        dataset['Cum Count Dst IP Addr (10 seconds)'] = dataset.groupby(['Dst IP Addr', pd.Grouper(freq='10S',key='Date first seen')]).cumcount()        
        
    return train,test



##### Transformation administrator

In [5]:
# transformation administrator
def transformation_administrator(train,test):
    
    datetime_transformator(train,test)
    bytes_transformator(train,test)
    add_ip_columns(train,test)
    
    return train,test



##### Designer of X and y

In [6]:
# designer of X y for -class-

def designer_X_y_for_class(train,test):
    
    global X_train,y_train,X_test,y_test
            
    # dropping and dummy    
    # dropping useless columns

    train = pd.get_dummies(train, columns=nominal_cols)    
    test = pd.get_dummies(test, columns=nominal_cols)
        
    
    # dividing train test stuff
    X_train = train.drop(['Unnamed: 0','Bytes','Flows','Src IP Addr','Dst IP Addr','Src Pt','Dst Pt','Tos','Date first seen','class','attackType','attackID','attackDescription'],axis=1)
    y_train = train['class']
    X_test = test.drop(['Unnamed: 0','Bytes','Flows','Src IP Addr','Dst IP Addr','Src Pt','Dst Pt','Tos','Date first seen','class','attackType','attackID','attackDescription'], axis=1)
    y_test = test['class']

    # scaling numeric col
    standard_scaler = StandardScaler().fit(X_train[numeric_cols])
    X_train[numeric_cols] = standard_scaler.transform(X_train[numeric_cols])
    X_test[numeric_cols] = standard_scaler.transform(X_test[numeric_cols])
    
    
    return train,test,X_train,y_train,X_test,y_test



In [7]:
# designer of X y for -attackType-
def designer_X_y_for_type(train,test):
    
    global X_train_type,y_train_type,X_test_type,y_test_type
            
    # dropping and dummy    
    # dropping useless columns
   
    train = pd.get_dummies(train, columns=nominal_cols)
    test = pd.get_dummies(test, columns=nominal_cols)
    
    # dividing train test stuff
    X_train_type = train.drop(['Unnamed: 0','Bytes','Flows','Src IP Addr','Dst IP Addr','Src Pt','Dst Pt','Tos','Date first seen','class','attackType','attackID','attackDescription'],axis=1)
    y_train_type = train['attackType']
    X_test_type = test.drop(['Unnamed: 0','Bytes','Flows','Src IP Addr','Dst IP Addr','Src Pt','Dst Pt','Tos','Date first seen','class','attackType','attackID','attackDescription'], axis=1)
    y_test_type = test['attackType']

    # scaling numeric col
    standard_scaler = StandardScaler().fit(X_train_type[numeric_cols])
    X_train_type[numeric_cols] = standard_scaler.transform(X_train_type[numeric_cols])
    X_test_type[numeric_cols] = standard_scaler.transform(X_test_type[numeric_cols])
    
    
    return train,test,X_train_type,y_train_type,X_test_type,y_test_type



##### Data Preparator

In [8]:
# data preparator for -class-
def data_preparator_for_class(train,test):
    
    global X_train,y_train,X_test,y_test
    
    transformation_administrator(train,test)
    designer_X_y_for_class(train,test)
    
    return X_train,y_train,X_test,y_test



In [9]:
# data preparator for -attackType-
def data_preparator_for_type(train,test):
    
    global X_train_type,y_train_type,X_test_type,y_test_type
    
    transformation_administrator(train,test)
    designer_X_y_for_type(train,test)
    
    return X_train_type,y_train_type,X_test_type,y_test_type



##### Classifier

In [10]:
# Classifiers -class-

# Logistic Regression
def logistic_regression(train,test):
    print('\nlogistic regression classifier:\n')
    global clf_lr, y_pred_lr
    data_preparator_for_class(train,test)
    
    start_time = time.time()
    
    clf_lr = LogisticRegression(verbose=True).fit(X_train,y_train)
    y_pred_lr = clf_lr.predict(X_test)
    
    elapsed_time = time.time() - start_time  
    print('predicting -class-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_lr,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_lr),'\n')

    return clf_lr,y_pred_lr


# GaussianNB
def gaussian_nb(train,test):
    print('\ngaussian naive bayes classifier:\n')
    global clf_nb, y_pred_nb
    
    data_preparator_for_class(train,test)

    start_time = time.time()
    
    clf_nb = GaussianNB().fit(X_train,y_train)
    y_pred_nb = clf_nb.predict(X_test)
    
    elapsed_time = time.time() - start_time 
    print('predicting -class-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_nb,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_nb))
    
    return clf_nb,y_pred_nb


# Decision Trees
def decision_trees(train,test):
    print('\ndecision tree classifier:\n')
    global clf_dt, y_pred_dt
    
    data_preparator_for_class(train,test)
    
    start_time = time.time()
    
    clf_dt = DecisionTreeClassifier(max_depth=45, max_leaf_nodes=100, criterion='gini', splitter='best').fit(X_train,y_train)
    y_pred_dt = clf_dt.predict(X_test)
    
    elapsed_time = time.time() - start_time  
    print('predicting -class-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_dt,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_dt))
    
    return clf_dt,y_pred_dt


# Random Forest
def random_forest(train,test):
    print('\nrandom forest classifier:\n')
    global clf_rf, y_pred_rf

    data_preparator_for_class(train,test)
    
    start_time = time.time()
    
    clf_rf = RandomForestClassifier(max_depth=45, max_leaf_nodes=100, criterion='gini', warm_start=True).fit(X_train,y_train)
    y_pred_rf = clf_rf.predict(X_test)
    
    elapsed_time = time.time() - start_time  
    print('predicting -class-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_rf,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_rf))    
    
    return clf_rf,y_pred_rf


# Multi-layer Perceptron 
def mlp(train,test):
    print('\nmulti-layer perceptron:\n')
    global clf_mlp, y_pred_mlp
    
    data_preparator_for_class(train,test)
    
    start_time = time.time()
    
    clf_mlp = MLPClassifier(hidden_layer_sizes=(200, ), activation='relu', solver='sgd',learning_rate='adaptive',verbose=True).fit(X_train,y_train)
    y_pred_mlp = clf_mlp.predict(X_test)
    
    elapsed_time = time.time() - start_time    
    print('predicting -class-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_mlp,y_test,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test,y_pred_mlp)) 
    
    return clf_mlp, y_pred_mlp
    


In [11]:
# Classifiers -attackType- 

# Logistic Regression
def logistic_regression_type(train,test):
    print('\nlogistic regression classifier:\n')
    global clf_lr_type, y_pred_type_lr
    
    data_preparator_for_type(train,test)
    
    start_time = time.time()
    
    clf_lr_type = LogisticRegression(verbose=True).fit(X_train_type,y_train_type)
    y_pred_type_lr = clf_lr_type.predict(X_test_type)
    
    elapsed_time = time.time() - start_time  
    print('predicting -attackType-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_type_lr,y_test_type,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test_type,y_pred_type_lr),'\n')

    return clf_lr_type,y_pred_type_lr


# GaussianNB
def gaussian_nb_type(train,test):
    print('\ngaussian nb classifier:\n')
    global clf_nb_type, y_pred_type_nb
    
    data_preparator_for_type(train,test)

    start_time = time.time()
    
    clf_nb_type = GaussianNB().fit(X_train_type,y_train_type)
    y_pred_type_nb = clf_nb_type.predict(X_test_type)
    
    elapsed_time = time.time() - start_time 
    print('predicting -attackType-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_type_nb,y_test_type,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test_type,y_pred_type_nb))
    
    return clf_nb_type,y_pred_type_nb


# Decision Trees
def decision_trees_type(train,test):
    print('\ndecision trees classifier:\n')
    global clf_dt_type, y_pred_type_dt
    
    data_preparator_for_type(train,test)
    
    start_time = time.time()
    
    clf_dt_type = DecisionTreeClassifier(max_depth=45, max_leaf_nodes=100, criterion='gini', splitter='best').fit(X_train_type,y_train_type)
    y_pred_type_dt = clf_dt_type.predict(X_test_type)
    
    elapsed_time = time.time() - start_time  
    print('predicting -attackType-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_type_dt,y_test_type,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test_type,y_pred_type_dt))
    
    return clf_dt_type,y_pred_type_dt


# Random Forest
def random_forest_type(train,test):
    print('\nrandom forest classifier:\n')
    global clf_rf_type, y_pred_type_rf

    data_preparator_for_type(train,test)
    
    start_time = time.time()
    
    clf_rf_type = RandomForestClassifier(max_depth=45, max_leaf_nodes=100, criterion='gini', warm_start=True).fit(X_train_type,y_train_type)
    y_pred_type_rf = clf_rf_type.predict(X_test_type)
    
    elapsed_time = time.time() - start_time  
    print('predicting -attackType-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_type_rf,y_test_type,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test_type,y_pred_type_rf))    
    
    return clf_rf_type,y_pred_type_rf


# Multi-layer Perceptron 
def mlp_type(train,test):
    print('\nmulti-layer perceptron:\n')
    global clf_mlp_type, y_pred_type_mlp
    
    data_preparator_for_type(train,test)
    
    start_time = time.time()
    
    clf_mlp_type = MLPClassifier(hidden_layer_sizes=(200, ), activation='relu', solver='sgd',learning_rate='adaptive',verbose=True).fit(X_train_type,y_train_type)
    y_pred_type_mlp = clf_mlp_type.predict(X_test_type)
    
    elapsed_time = time.time() - start_time    
    print('predicting -attackType-:\n')
    print('elapsed time for training and predict:',elapsed_time,'seconds\n')
    print('classification report:\n',classification_report(y_pred_type_mlp,y_test_type,digits=5),'\n')
    print('confusion matrix:\n',confusion_matrix(y_test_type,y_pred_type_mlp)) 
    
    return clf_mlp_type, y_pred_type_mlp
    


In [12]:
gaussian_nb(train_dataframe,test_dataframe)


gaussian naive bayes classifier:

predicting -class-:

elapsed time for training and predict: 8.267875671386719 seconds

classification report:
               precision    recall  f1-score   support

    attacker    0.99573   0.99784   0.99678     90816
      normal    0.99992   0.99925   0.99958    852372
      victim    0.99466   0.99893   0.99679     87885

    accuracy                        0.99910   1031073
   macro avg    0.99677   0.99867   0.99772   1031073
weighted avg    0.99910   0.99910   0.99910   1031073
 

confusion matrix:
 [[ 90620    348     41]
 [    18 851731     53]
 [   178    293  87791]]


(GaussianNB(priors=None, var_smoothing=1e-09),
 array(['normal', 'normal', 'normal', ..., 'normal', 'normal', 'normal'],
       dtype='<U8'))

In [13]:
logistic_regression(train_dataframe,test_dataframe)


logistic regression classifier:





[LibLinear]predicting -class-:

elapsed time for training and predict: 23.580148935317993 seconds

classification report:
               precision    recall  f1-score   support

    attacker    0.99599   0.99752   0.99676     90869
      normal    0.99988   0.99950   0.99969    852123
      victim    0.99705   0.99910   0.99808     88081

    accuracy                        0.99929   1031073
   macro avg    0.99764   0.99871   0.99817   1031073
weighted avg    0.99929   0.99929   0.99929   1031073
 

confusion matrix:
 [[ 90644    355     10]
 [    36 851697     69]
 [   189     71  88002]] 



(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=0.0001, verbose=True,
                    warm_start=False),
 array(['normal', 'normal', 'normal', ..., 'normal', 'normal', 'normal'],
       dtype=object))

In [14]:
decision_trees(train_dataframe,test_dataframe)


decision tree classifier:

predicting -class-:

elapsed time for training and predict: 8.79945158958435 seconds

classification report:
               precision    recall  f1-score   support

    attacker    0.99318   0.99961   0.99638     90423
      normal    0.99996   0.99914   0.99955    852497
      victim    0.99858   0.99982   0.99920     88153

    accuracy                        0.99924   1031073
   macro avg    0.99724   0.99952   0.99838   1031073
weighted avg    0.99924   0.99924   0.99924   1031073
 

confusion matrix:
 [[ 90388    617      4]
 [    26 851764     12]
 [     9    116  88137]]


(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=45,
                        max_features=None, max_leaf_nodes=100,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 array(['normal', 'normal', 'normal', ..., 'normal', 'normal', 'normal'],
       dtype=object))

In [15]:
random_forest(train_dataframe,test_dataframe)


random forest classifier:





predicting -class-:

elapsed time for training and predict: 8.826380729675293 seconds

classification report:
               precision    recall  f1-score   support

    attacker    0.99301   0.99979   0.99639     90392
      normal    0.99999   0.99913   0.99956    852538
      victim    0.99862   0.99997   0.99929     88143

    accuracy                        0.99926   1031073
   macro avg    0.99721   0.99963   0.99841   1031073
weighted avg    0.99926   0.99926   0.99926   1031073
 

confusion matrix:
 [[ 90373    633      3]
 [     8 851794      0]
 [    11    111  88140]]


(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=45, max_features='auto', max_leaf_nodes=100,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=10,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=True),
 array(['normal', 'normal', 'normal', ..., 'normal', 'normal', 'normal'],
       dtype=object))

In [16]:
mlp(train_dataframe,test_dataframe)


multi-layer perceptron:

Iteration 1, loss = 0.11470936
Iteration 2, loss = 0.02083602
Iteration 3, loss = 0.01172246
Iteration 4, loss = 0.00859972
Iteration 5, loss = 0.00708244
Iteration 6, loss = 0.00619104
Iteration 7, loss = 0.00559910
Iteration 8, loss = 0.00517254
Iteration 9, loss = 0.00484813
Iteration 10, loss = 0.00458998
Iteration 11, loss = 0.00438191
Iteration 12, loss = 0.00420896
Iteration 13, loss = 0.00406204
Iteration 14, loss = 0.00393647
Iteration 15, loss = 0.00383012
Iteration 16, loss = 0.00373839
Iteration 17, loss = 0.00365971
Iteration 18, loss = 0.00359119
Iteration 19, loss = 0.00353017
Iteration 20, loss = 0.00347504
Iteration 21, loss = 0.00342539
Iteration 22, loss = 0.00338190
Iteration 23, loss = 0.00334176
Iteration 24, loss = 0.00330490
Iteration 25, loss = 0.00327129
Iteration 26, loss = 0.00324070
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Setting learning rate to 0.000200
Iteration 27, loss = 0.00322084
Itera

(MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=(200,), learning_rate='adaptive',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=None, shuffle=True, solver='sgd', tol=0.0001,
               validation_fraction=0.1, verbose=True, warm_start=False),
 array(['normal', 'normal', 'normal', ..., 'normal', 'normal', 'normal'],
       dtype='<U8'))

In [17]:
logistic_regression_type(train_dataframe,test_dataframe)


logistic regression classifier:





[LibLinear]predicting -attackType-:

elapsed time for training and predict: 31.440825700759888 seconds



  'recall', 'true', average, warn_for)


classification report:
               precision    recall  f1-score   support

         ---    0.99989   0.99948   0.99968    852152
  bruteForce    0.00000   0.00000   0.00000         0
         dos    0.99995   0.99984   0.99989    170530
    pingScan    0.36965   0.68345   0.47980       139
    portScan    0.95161   0.93892   0.94522      8252

    accuracy                        0.99901   1031073
   macro avg    0.66422   0.72434   0.68492   1031073
weighted avg    0.99943   0.99901   0.99921   1031073
 

confusion matrix:
 [[851707      0     15      0     80]
 [    64      0     12      0    284]
 [     7      0 170503      0      2]
 [    24      0      0     95    138]
 [   350      0      0     44   7748]] 



(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=0.0001, verbose=True,
                    warm_start=False),
 array(['---', '---', '---', ..., '---', '---', '---'], dtype=object))

In [18]:
decision_trees_type(train_dataframe,test_dataframe)


decision trees classifier:

predicting -attackType-:

elapsed time for training and predict: 8.987946510314941 seconds

classification report:
               precision    recall  f1-score   support

         ---    0.99997   0.99912   0.99955    852525
  bruteForce    0.00000   0.00000   0.00000         4
         dos    0.99960   0.99989   0.99974    170461
    pingScan    0.36576   0.61039   0.45742       154
    portScan    0.95407   0.97969   0.96671      7929

    accuracy                        0.99904   1031073
   macro avg    0.66388   0.71782   0.68468   1031073
weighted avg    0.99946   0.99904   0.99924   1031073
 

confusion matrix:
 [[851776      0     18      1      7]
 [   337      0      0      0     23]
 [    68      0 170443      0      1]
 [    33      0      0     94    130]
 [   311      4      0     59   7768]]


(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=45,
                        max_features=None, max_leaf_nodes=100,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 array(['---', '---', '---', ..., '---', '---', '---'], dtype=object))

In [19]:
random_forest_type(train_dataframe,test_dataframe)


random forest classifier:





predicting -attackType-:

elapsed time for training and predict: 9.360949277877808 seconds

classification report:
               precision    recall  f1-score   support

         ---    1.00000   0.99908   0.99954    852582
  bruteForce    0.00000   0.00000   0.00000         1
         dos    0.99938   0.99998   0.99968    170411
    pingScan    0.21012   0.58065   0.30857        93
    portScan    0.95677   0.97546   0.96602      7986

    accuracy                        0.99901   1031073
   macro avg    0.63325   0.71103   0.65476   1031073
weighted avg    0.99949   0.99901   0.99924   1031073
 

confusion matrix:
 [[851798      0      4      0      0]
 [   337      0      0      0     23]
 [   105      0 170407      0      0]
 [    30      0      0     54    173]
 [   312      1      0     39   7790]]


(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=45, max_features='auto', max_leaf_nodes=100,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=10,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=True),
 array(['---', '---', '---', ..., '---', '---', '---'], dtype=object))

In [20]:
mlp_type(train_dataframe,test_dataframe)


multi-layer perceptron:

Iteration 1, loss = 0.11059586
Iteration 2, loss = 0.02399720
Iteration 3, loss = 0.01508520
Iteration 4, loss = 0.01166479
Iteration 5, loss = 0.00985923
Iteration 6, loss = 0.00872458
Iteration 7, loss = 0.00792969
Iteration 8, loss = 0.00733232
Iteration 9, loss = 0.00686157
Iteration 10, loss = 0.00647651
Iteration 11, loss = 0.00615762
Iteration 12, loss = 0.00588751
Iteration 13, loss = 0.00565667
Iteration 14, loss = 0.00545491
Iteration 15, loss = 0.00527942
Iteration 16, loss = 0.00512430
Iteration 17, loss = 0.00498924
Iteration 18, loss = 0.00487086
Iteration 19, loss = 0.00476591
Iteration 20, loss = 0.00467072
Iteration 21, loss = 0.00458559
Iteration 22, loss = 0.00450655
Iteration 23, loss = 0.00443461
Iteration 24, loss = 0.00436843
Iteration 25, loss = 0.00430643
Iteration 26, loss = 0.00424874
Iteration 27, loss = 0.00419529
Iteration 28, loss = 0.00414486
Iteration 29, loss = 0.00409721
Iteration 30, loss = 0.00405301
Training loss did not i

  'recall', 'true', average, warn_for)


classification report:
               precision    recall  f1-score   support

         ---    0.99986   0.99948   0.99967    852119
  bruteForce    0.00000   0.00000   0.00000         0
         dos    0.99996   0.99965   0.99981    170565
    pingScan    0.01946   1.00000   0.03817         5
    portScan    0.95714   0.92951   0.94312      8384

    accuracy                        0.99894   1031073
   macro avg    0.59528   0.78573   0.59615   1031073
weighted avg    0.99952   0.99894   0.99923   1031073
 

confusion matrix:
 [[851679      0     47      0     76]
 [    64      0     12      0    284]
 [     3      0 170506      0      3]
 [    24      0      0      5    228]
 [   349      0      0      0   7793]]


(MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
               beta_2=0.999, early_stopping=False, epsilon=1e-08,
               hidden_layer_sizes=(200,), learning_rate='adaptive',
               learning_rate_init=0.001, max_iter=200, momentum=0.9,
               n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
               random_state=None, shuffle=True, solver='sgd', tol=0.0001,
               validation_fraction=0.1, verbose=True, warm_start=False),
 array(['---', '---', '---', ..., '---', '---', '---'], dtype='<U10'))

In [21]:
gaussian_nb_type(train_dataframe,test_dataframe)


gaussian nb classifier:

predicting -attackType-:

elapsed time for training and predict: 8.655802011489868 seconds

classification report:
               precision    recall  f1-score   support

         ---    0.98679   0.96815   0.97738    868206
  bruteForce    0.00000   0.00000   0.00000      2865
         dos    0.83777   1.00000   0.91173    142850
    pingScan    0.84825   0.09620   0.17281      2266
    portScan    0.91943   0.50289   0.65017     14886

    accuracy                        0.96124   1031073
   macro avg    0.71845   0.51345   0.54242   1031073
weighted avg    0.96213   0.96124   0.95908   1031073
 

confusion matrix:
 [[840550   2473      0   1739   7040]
 [    17      0      0     41    302]
 [ 27612      0 142850     31     19]
 [     0      0      0    218     39]
 [    27    392      0    237   7486]]


(GaussianNB(priors=None, var_smoothing=1e-09),
 array(['---', '---', '---', ..., '---', '---', '---'], dtype='<U10'))

###### @falble

###### @gussr

###### @FiloLafro