# Involving libraries


In [1]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
from sklearn.naive_bayes import BernoulliNB 
from sklearn import tree
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Data set -INFO


In [2]:
dataset_path = '/home/bot/myprojectdir/dataset/'

In [3]:
df = pd.read_csv(dataset_path+'processed/cleanDatasetWithEncoding.csv')

In [4]:
df.columns

Index(['DestinationPort', 'FlowDuration', 'TotalFwdPackets',
       'TotalBackwardPackets', 'TotalLengthofFwdPackets',
       'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin',
       'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax',
       'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
       'FlowBytess', 'FlowPacketss', 'FlowIATMean', 'FlowIATStd', 'FlowIATMax',
       'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax',
       'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax',
       'BwdIATMin', 'FwdPSHFlags', 'FwdURGFlags', 'FwdHeaderLength',
       'BwdHeaderLength', 'FwdPacketss', 'BwdPacketss', 'MinPacketLength',
       'MaxPacketLength', 'PacketLengthMean', 'PacketLengthStd',
       'PacketLengthVariance', 'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount',
       'PSHFlagCount', 'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount',
       'ECEFlagCount', 'DownUpRatio', 'AveragePacketSize', 'AvgFw

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520911 entries, 0 to 2520910
Data columns (total 71 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   DestinationPort          float64
 1   FlowDuration             float64
 2   TotalFwdPackets          float64
 3   TotalBackwardPackets     float64
 4   TotalLengthofFwdPackets  float64
 5   TotalLengthofBwdPackets  float64
 6   FwdPacketLengthMax       float64
 7   FwdPacketLengthMin       float64
 8   FwdPacketLengthMean      float64
 9   FwdPacketLengthStd       float64
 10  BwdPacketLengthMax       float64
 11  BwdPacketLengthMin       float64
 12  BwdPacketLengthMean      float64
 13  BwdPacketLengthStd       float64
 14  FlowBytess               float64
 15  FlowPacketss             float64
 16  FlowIATMean              float64
 17  FlowIATStd               float64
 18  FlowIATMax               float64
 19  FlowIATMin               float64
 20  FwdIATTotal              float64
 21  FwdIATMe

# Data classification
Preparation of training data

In [6]:
df['Label'].value_counts()

14    2095170
3      172846
1      128014
9       90694
2       10286
6        5931
5        5385
4        5228
10       3219
0        1948
11       1470
13        652
8          36
12         21
7          11
Name: Label, dtype: int64

In [7]:
old_memory_usage = df.memory_usage().sum()
#change the variable types for low memory usage
#int64 to int32,,, float64 to float32
integer = []
f = []
for i in df.columns[:-1]:
    if df[i].dtype == "int64": integer.append(i)
    else : f.append(i)

df[integer] = df[integer].astype("int32")
df[f] = df[f].astype("float32")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520911 entries, 0 to 2520910
Data columns (total 71 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   DestinationPort          float32
 1   FlowDuration             float32
 2   TotalFwdPackets          float32
 3   TotalBackwardPackets     float32
 4   TotalLengthofFwdPackets  float32
 5   TotalLengthofBwdPackets  float32
 6   FwdPacketLengthMax       float32
 7   FwdPacketLengthMin       float32
 8   FwdPacketLengthMean      float32
 9   FwdPacketLengthStd       float32
 10  BwdPacketLengthMax       float32
 11  BwdPacketLengthMin       float32
 12  BwdPacketLengthMean      float32
 13  BwdPacketLengthStd       float32
 14  FlowBytess               float32
 15  FlowPacketss             float32
 16  FlowIATMean              float32
 17  FlowIATStd               float32
 18  FlowIATMax               float32
 19  FlowIATMin               float32
 20  FwdIATTotal              float32
 21  FwdIATMe

In [8]:
new_memory_usage = df.memory_usage().sum()
old_vs_new = (old_memory_usage - new_memory_usage) / old_memory_usage * 100
print(f"%{old_vs_new} lower memory usage")

%49.295770241184364 lower memory usage


In [9]:
# drop one variable features 
one_variable_list = []
for i in df.columns:
    if df[i].value_counts().nunique() < 2:
        one_variable_list.append(i)
df.drop(one_variable_list,axis=1,inplace=True)
df.columns =  df.columns.str.strip()

In [10]:
# drop nan and infinite rows
df = df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]

In [11]:
# drop duplicate rows
df =  df.drop_duplicates(keep="first")
df.reset_index(drop=True,inplace=True)

In [12]:
#feature reduction 
#dropping very high correlated features 
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Drop features 
df =  df.drop(to_drop, axis=1)
df.shape

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


(2497928, 48)

In [13]:
x = np.array(df.drop(["Label"],axis=1))
y = np.array(df["Label"])

# Model training

In [14]:
#Fitting Models
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=42,max_depth=15)

In [15]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [21]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report,confusion_matrix,precision_recall_fscore_support,balanced_accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=10)

scores = []
f1 = []
precision = []
recall = []

for train_index, test_index in folds.split(x,y):
    X_train, X_test, y_train, y_test = x[train_index], x[test_index], \
                                       y[train_index], y[test_index]
    DTC_Classifier.fit(X_train, y_train)
    predictions = DTC_Classifier.predict(X_test)
    f1.append(f1_score(predictions, y_test, average='weighted'))
    precision.append(precision_score(predictions, y_test, average='weighted'))
    recall.append(recall_score(predictions, y_test, average='weighted'))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
scores

[]

In [23]:
f1

[0.9959569731741943,
 0.9990158039120844,
 0.9942649455656892,
 0.999152442149916,
 0.9960137536744577,
 0.9980249222947251,
 0.9995433705757383,
 0.9985445737423977,
 0.9992141259835632,
 0.9398437672278583]

In [24]:
precision

[0.996479866611449,
 0.9991623586032322,
 0.9950377211878634,
 0.9992948819868055,
 0.996650495476963,
 0.9985599807153992,
 0.9996900438945938,
 0.9988439799460549,
 0.9997071308639519,
 0.9494380487659684]

In [25]:
recall

[0.9957284631675027,
 0.9989191050189556,
 0.9941831836760838,
 0.9990632243497616,
 0.9966532288735073,
 0.9977461337987854,
 0.9994515458799886,
 0.9983986741021565,
 0.9988190174224956,
 0.9456908147578785]

In [26]:
recall

[0.9957284631675027,
 0.9989191050189556,
 0.9941831836760838,
 0.9990632243497616,
 0.9966532288735073,
 0.9977461337987854,
 0.9994515458799886,
 0.9983986741021565,
 0.9988190174224956,
 0.9456908147578785]

In [27]:
np.mean(recall)

0.9924653391047116

In [28]:
np.mean(precision)

0.9932864508052281

In [29]:
np.mean(f1)

0.9919574678300623