In [1]:
%pip install pymongo

Note: you may need to restart the kernel to use updated packages.


# Crate Directory and Output File

In [2]:
#Use '/resource' to syncronize folder with host

!mkdir -p ~/output/sklearn-model/naive-bayes

In [3]:
input_path  = "/home/jovyan/output/renamed-data/binary/TrainDataUnderSampling.csv"
output_path = "/home/jovyan/output/sklearn-model/naive-bayes"

# Import Package

In [4]:
import time
import pickle
import numpy as np
import pandas as pd
from gridfs import GridFS
from statistics import mean
from datetime import datetime
from bson.binary import Binary
import matplotlib.pyplot as plt
from pymongo import MongoClient

from sklearn.naive_bayes import GaussianNB

from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

# DB Connections

In [5]:
client = MongoClient("mongodb://mongodb:27017")
db = client['mataelanglab']

result_col = db['sklearn_result']
cv_col = db['sklearn_cv']
model_col = GridFS(db, 'sklearn_model')

# Read Data

In [6]:
df = pd.read_csv(input_path)

# Normalisasi

In [7]:
xNorm = df.drop(['Unnamed: 0','flow_id','src_ip','src_port','dst_ip','dst_port','protocol','timestamp','label'],axis=1)
yNorm = df[['label']]

In [8]:
stdScaler = StandardScaler()
normData = pd.DataFrame(stdScaler.fit_transform(xNorm), columns=xNorm.columns)
normData.head(5)

Unnamed: 0,flow_duration,total_fwd_packet,total_bwd_packets,total_length_of_fwd_packet,total_length_of_bwd_packet,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,bwd_packet_length_max,...,fwd_act_data_pkts,fwd_seg_size_min,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
0,-0.377311,-0.044437,-0.053995,-0.003726,-0.04656,3.075022,-0.474292,1.190075,3.450602,-0.380966,...,-0.036528,0.237673,-0.314924,-0.057004,-0.293148,-0.313532,0.63293,-0.659378,-0.463769,0.650771
1,1.621775,-0.062786,-0.065338,-0.018863,-0.046926,-0.103881,-0.474292,-0.154518,0.089382,-0.522705,...,-0.036528,0.237673,-0.314924,-0.057004,-0.293148,-0.313532,-1.465506,1.506739,-0.46353,-1.509516
2,-0.290783,0.295013,0.059432,1.548795,-0.046611,8.110218,-0.474292,17.184021,5.072583,-0.400652,...,1.264708,0.237673,-0.314924,-0.057004,-0.293148,-0.313532,0.633043,-0.659378,-0.462282,0.65083
3,-0.410054,0.038132,0.082118,-0.010714,-0.028506,0.638636,-0.474292,-0.030761,0.348651,0.394661,...,0.305903,0.237673,-0.314924,-0.057004,-0.293148,-0.313532,0.632809,-0.659378,-0.465364,0.650709
4,1.405972,0.010609,-0.008624,-0.013477,-0.045711,-0.115483,-0.474292,-0.062998,0.0104,-0.440024,...,0.168931,0.237673,0.448076,-0.057004,0.370849,0.484196,-1.604441,1.363325,-4.12398,-1.509516


# All Data without Feature Selection

## Processing

In [9]:
start_time = time.time()
# Split dataset into training set and test set
xtrain, xtest, ytrain, ytest = train_test_split(normData, yNorm, test_size=0.3, random_state=42)

# Mengaktifkan/memanggil/membuat fungsi klasifikasi Naive Bayes
nb = GaussianNB()

# Memasukkan data training pada fungsi klasifikasi Naive Bayes
nbModel = nb.fit(xtrain, ytrain)

# Menentukan hasil prediksi dari x_test
ypred = nbModel.predict(xtest)

duration = (time.time() - start_time)
print("--- %s seconds ---" % duration)

  y = column_or_1d(y, warn=True)


--- 0.4953761100769043 seconds ---


In [10]:
print("confusion matrix :",confusion_matrix(ytest, ypred).ravel())
tn, fp, fn, tp = confusion_matrix(ytest, ypred).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / ( tp + fn)
precision = tp / (tp + fp)
f1_score = 2 * (precision * recall) / (precision + recall)

print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
      "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [12068 20601  2862 29910]
accuracy :  0.6415
recall : 0.9127
precision :  0.5921
f1-measure : 0.7183


In [11]:
# Save Pickle
path = output_path+"/all-feature.pkl"
with open(path, "wb") as f:
    pickle_model = pickle.dump(nbModel, f)

In [12]:
# Store to MongoDB

with open(path, "rb") as f:
    model_col.put(f, filename="NaiveBayes-AllFeature.pkl")
    
result_col.insert_one({
    'machine_learning': "Naive Bayes",
    'feature': "All Feature",
    'label': "Binary-Label",
    'duration': duration,
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1_score': f1_score,
    'created_at': datetime.fromtimestamp(time.time())
})

<pymongo.results.InsertOneResult at 0x7f5e7a9cd430>

## Cross Validation Process

In [13]:
X=np.array(normData)
y=np.array(yNorm.values.ravel())

accList = []
recList = []
precList=[]
f1List=[]

kf= KFold(n_splits=10,shuffle=True)
kf.get_n_splits(X)
for train_index, test_index in kf.split(X) :
    X_train,X_test = X[train_index],X[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    nb = GaussianNB()
    nbModel = nb.fit(X_train, y_train)
    
    y_pred = nbModel.predict(X_test)
    print("confusion matrix :", confusion_matrix(y_test,y_pred).ravel())
    tn,fp,fn,tp = confusion_matrix(y_test,y_pred).ravel()
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    accList.append(accuracy)
    recList.append(recall)
    precList.append(precision)
    f1List.append(f1_score)
    
    print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
          "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [ 3674  7192   849 10099]
accuracy :  0.6314
recall : 0.9225
precision :  0.5841
f1-measure : 0.7153
confusion matrix : [ 3755  7165   882 10012]
accuracy :  0.6311
recall : 0.9190
precision :  0.5829
f1-measure : 0.7133
confusion matrix : [ 3765  7132   822 10095]
accuracy :  0.6354
recall : 0.9247
precision :  0.5860
f1-measure : 0.7174
confusion matrix : [ 3721  7161   872 10060]
accuracy :  0.6318
recall : 0.9202
precision :  0.5842
f1-measure : 0.7147
confusion matrix : [ 3721  7240   802 10050]
accuracy :  0.6313
recall : 0.9261
precision :  0.5813
f1-measure : 0.7142
confusion matrix : [ 3612  7238   831 10132]
accuracy :  0.6301
recall : 0.9242
precision :  0.5833
f1-measure : 0.7152
confusion matrix : [ 3672  7268   850 10023]
accuracy :  0.6278
recall : 0.9218
precision :  0.5797
f1-measure : 0.7118
confusion matrix : [3730 7282  885 9916]
accuracy :  0.6256
recall : 0.9181
precision :  0.5766
f1-measure : 0.7083
confusion matrix : [ 3743  7126   814 10130]

In [14]:
print('accuracy : ',(mean(accList)))
print('recall : ',(mean(recList)))
print('precision : ',(mean(precList)))
print('f1_score : ',(mean(f1List)))

cv_col.insert_one({
    'machine_learning': "Naive Bayes",
    'feature': "All Feature",
    'label': "Binary-Label",
    'accuracy': (mean(accList)),
    'recall': (mean(recList)),
    'precision': (mean(precList)),
    'f1_score': (mean(f1List)),
    'created_at': datetime.fromtimestamp(time.time())
})

accuracy :  0.631212902333428
recall :  0.9225746447041068
precision :  0.5829015272636625
f1_score :  0.7144158457782008


<pymongo.results.InsertOneResult at 0x7f5e72769e80>

# Using Feature Importance

## Processing

Fitur yang digunakan dihitung pada proses machine learning pada Decission Tree

In [15]:
importance = normData[[
    'idle_max',
    'fwd_init_win_bytes',
    'fwd_header_length',
    'bwd_packets_per_s',
    'flow_bytes_per_s',
    'bwd_init_win_bytes',
    'flow_iat_max'
]]

In [16]:
start_time = time.time()
# Split dataset into training set and test set
xftrain, xftest, yftrain, yftest = train_test_split(importance, yNorm, test_size=0.3, random_state=42)

# Mengaktifkan/memanggil/membuat fungsi klasifikasi Naive Bayes
nb = GaussianNB()

# Memasukkan data training pada fungsi klasifikasi Naive Bayes
nbModel = nb.fit(xftrain, yftrain)

# Menentukan hasil prediksi dari x_test
yfpred = nbModel.predict(xftest)

duration = (time.time() - start_time)
print("--- %s seconds ---" % duration)

--- 0.09663105010986328 seconds ---


  y = column_or_1d(y, warn=True)


In [17]:
print("confusion matrix :",confusion_matrix(yftest, yfpred).ravel())
tn, fp, fn, tp = confusion_matrix(yftest, yfpred).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / ( tp + fn)
precision = tp / (tp + fp)
f1_score = 2 * (precision * recall) / (precision + recall)

print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
      "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [ 7763 24906  1551 31221]
accuracy :  0.5957
recall : 0.9527
precision :  0.5563
f1-measure : 0.7024


In [18]:
# Save Pickle
path = output_path+"/feature-importance.pkl"
with open(path, "wb") as f:
    pickle_model = pickle.dump(nbModel, f)

In [19]:
# Store to MongoDB

with open(path, "rb") as f:
    model_col.put(f, filename="NaiveBayes-FeatureImportance.pkl")
    
result_col.insert_one({
    'machine_learning': "Naive Bayes",
    'feature': "Feature Importance",
    'label': "Binary-Label",
    'duration': duration,
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1_score': f1_score,
    'created_at': datetime.fromtimestamp(time.time())
})

<pymongo.results.InsertOneResult at 0x7f5e80ab2880>

## Cross Validation Process

In [20]:
X=np.array(importance)
y=np.array(yNorm.values.ravel())

accList = []
recList = []
precList=[]
f1List=[]

kf= KFold(n_splits=10,shuffle=True)
kf.get_n_splits(X)
for train_index, test_index in kf.split(X) :
    X_train,X_test = X[train_index],X[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    nb = GaussianNB()
    nbModel = nb.fit(X_train, y_train)
    
    y_pred = nbModel.predict(X_test)
    print("confusion matrix :", confusion_matrix(y_test,y_pred).ravel())
    tn,fp,fn,tp = confusion_matrix(y_test,y_pred).ravel()
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    accList.append(accuracy)
    recList.append(recall)
    precList.append(precision)
    f1List.append(f1_score)
    
    print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
          "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [ 2600  8301   511 10402]
accuracy :  0.5960
recall : 0.9532
precision :  0.5562
f1-measure : 0.7025
confusion matrix : [ 2590  8312   515 10397]
accuracy :  0.5954
recall : 0.9528
precision :  0.5557
f1-measure : 0.7020
confusion matrix : [ 2601  8149   510 10554]
accuracy :  0.6031
recall : 0.9539
precision :  0.5643
f1-measure : 0.7091
confusion matrix : [ 2652  8246   530 10386]
accuracy :  0.5977
recall : 0.9514
precision :  0.5574
f1-measure : 0.7030
confusion matrix : [ 2599  8353   514 10347]
accuracy :  0.5935
recall : 0.9527
precision :  0.5533
f1-measure : 0.7000
confusion matrix : [ 2475  8485   673 10180]
accuracy :  0.5802
recall : 0.9380
precision :  0.5454
f1-measure : 0.6897
confusion matrix : [ 2626  8300   532 10355]
accuracy :  0.5951
recall : 0.9511
precision :  0.5551
f1-measure : 0.7010
confusion matrix : [ 2687  8303   505 10318]
accuracy :  0.5962
recall : 0.9533
precision :  0.5541
f1-measure : 0.7009
confusion matrix : [ 2579  8289   576 10

In [21]:
print('accuracy : ',(mean(accList)))
print('recall : ',(mean(recList)))
print('precision : ',(mean(precList)))
print('f1_score : ',(mean(f1List)))

cv_col.insert_one({
    'machine_learning': "Naive Bayes",
    'feature': "Feature Importance",
    'label': "Binary-Label",
    'accuracy': (mean(accList)),
    'recall': (mean(recList)),
    'precision': (mean(precList)),
    'f1_score': (mean(f1List)),
    'created_at': datetime.fromtimestamp(time.time())
})

accuracy :  0.5940980510086084
recall :  0.9493253890009423
precision :  0.5550094348398613
f1_score :  0.7004832043133715


<pymongo.results.InsertOneResult at 0x7f5e7a9f5cd0>

# Using PCA

## Processing

In [22]:
start_time = time.time()
pca = PCA(n_components=2)
pca_dataset = pd.DataFrame(pca.fit_transform(normData), columns=[
    "Feature 1",
    "Feature 2"
])
#pca_dataset["Label"] = pca_label["Label"]
print("--- %s seconds ---" % (time.time() - start_time))

--- 2.735572338104248 seconds ---


In [23]:
print(pca.explained_variance_)

[10.88449402  9.09396116]


In [24]:
a = 0
for i in pca.explained_variance_:
  a = a+i
print(a)

19.978455173533803


In [25]:
start_time = time.time()
# Split dataset into training set and test set
xptrain, xptest, yptrain, yptest = train_test_split(pca_dataset, yNorm, test_size=0.3, random_state=42)

# Mengaktifkan/memanggil/membuat fungsi klasifikasi Naive Bayes
nb = GaussianNB()

# Memasukkan data training pada fungsi klasifikasi Naive Bayes
nbModel = nb.fit(xptrain, yptrain)

# Menentukan hasil prediksi dari x_test
yppred = nbModel.predict(xptest)

duration = (time.time() - start_time)
print("--- %s seconds ---" % duration)

--- 0.07073616981506348 seconds ---


  y = column_or_1d(y, warn=True)


In [26]:
print("confusion matrix :",confusion_matrix(yptest, yppred).ravel())
tn, fp, fn, tp = confusion_matrix(yptest, yppred).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / ( tp + fn)
precision = tp / (tp + fp)
f1_score = 2 * (precision * recall) / (precision + recall)

print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
      "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [ 9928 22741  2541 30231]
accuracy :  0.6137
recall : 0.9225
precision :  0.5707
f1-measure : 0.7051


In [27]:
# Save Pickle
path = output_path+"/pca.pkl"
with open(path, "wb") as f:
    pickle_model = pickle.dump(nbModel, f)

In [28]:
# Store to MongoDB

with open(path, "rb") as f:
    model_col.put(f, filename="NaiveBayes-PCA.pkl")
    
result_col.insert_one({
    'machine_learning': "Naive Bayes",
    'feature': "PCA",
    'label': "Binary-Label",
    'duration': duration,
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1_score': f1_score,
    'created_at': datetime.fromtimestamp(time.time())
})

<pymongo.results.InsertOneResult at 0x7f5e80ab2a90>

## Cross Validation Process

In [29]:
X=np.array(pca_dataset)
y=np.array(yNorm.values.ravel())

accList = []
recList = []
precList=[]
f1List=[]

kf= KFold(n_splits=10,shuffle=True)
kf.get_n_splits(X)
for train_index, test_index in kf.split(X) :
    X_train,X_test = X[train_index],X[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    nb = GaussianNB()
    nbModel = nb.fit(X_train, y_train)
    
    y_pred = nbModel.predict(X_test)
    print("confusion matrix :", confusion_matrix(y_test,y_pred).ravel())
    tn,fp,fn,tp = confusion_matrix(y_test,y_pred).ravel()
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    accList.append(accuracy)
    recList.append(recall)
    precList.append(precision)
    f1List.append(f1_score)
    
    print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
          "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [ 3299  7650   858 10007]
accuracy :  0.6100
recall : 0.9210
precision :  0.5667
f1-measure : 0.7017
confusion matrix : [ 3340  7577   839 10058]
accuracy :  0.6142
recall : 0.9230
precision :  0.5703
f1-measure : 0.7050
confusion matrix : [ 3227  7616   819 10152]
accuracy :  0.6133
recall : 0.9253
precision :  0.5714
f1-measure : 0.7065
confusion matrix : [3352 7679  875 9908]
accuracy :  0.6079
recall : 0.9189
precision :  0.5634
f1-measure : 0.6985
confusion matrix : [ 3265  7621   847 10080]
accuracy :  0.6118
recall : 0.9225
precision :  0.5695
f1-measure : 0.7042
confusion matrix : [ 3263  7601   866 10083]
accuracy :  0.6118
recall : 0.9209
precision :  0.5702
f1-measure : 0.7043
confusion matrix : [ 3354  7501   844 10114]
accuracy :  0.6174
recall : 0.9230
precision :  0.5742
f1-measure : 0.7079
confusion matrix : [ 3241  7683   814 10075]
accuracy :  0.6105
recall : 0.9252
precision :  0.5673
f1-measure : 0.7034
confusion matrix : [ 3329  7604   830 10050]

In [30]:
print('accuracy : ',(mean(accList)))
print('recall : ',(mean(recList)))
print('precision : ',(mean(precList)))
print('f1_score : ',(mean(f1List)))

cv_col.insert_one({
    'machine_learning': "Naive Bayes",
    'feature': "PCA",
    'label': "Binary-Label",
    'accuracy': (mean(accList)),
    'recall': (mean(recList)),
    'precision': (mean(precList)),
    'f1_score': (mean(f1List)),
    'created_at': datetime.fromtimestamp(time.time())
})

accuracy :  0.6124125734369721
recall :  0.9224010799322087
precision :  0.5693900972779253
f1_score :  0.7041246076739285


<pymongo.results.InsertOneResult at 0x7f5e7a9a0f10>