In [1]:
%pip install pymongo 

Note: you may need to restart the kernel to use updated packages.


# Crate Directory and Output File

In [2]:
#Use '/resource' to syncronize folder with host

!mkdir -p ~/output/sklearn-model/random-forest

In [3]:
input_path  = "/home/jovyan/output/renamed-data/binary/TrainDataUnderSampling.csv"
output_path = "/home/jovyan/output/sklearn-model/random-forest"

# Import Package

In [4]:
import time
import pickle
import numpy as np
import pandas as pd
import pymongo, gridfs
from gridfs import GridFS
from statistics import mean
from datetime import datetime
from bson.binary import Binary
import matplotlib.pyplot as plt
from pymongo import MongoClient

from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

# DB Connections

In [5]:
client = MongoClient("mongodb://mongodb:27017")
db = client['mataelanglab']

result_col = db['sklearn_result']
cv_col = db['sklearn_cv']
model_col = GridFS(db, 'sklearn_model')

# Read Data

In [6]:
df = pd.read_csv(input_path)

# Normalisasi

In [7]:
xNorm = df.drop(['Unnamed: 0','flow_id','src_ip','src_port','dst_ip','dst_port','protocol','timestamp','label'],axis=1)
yNorm = df[['label']]

In [8]:
stdScaler = StandardScaler()
normData = pd.DataFrame(stdScaler.fit_transform(xNorm), columns=xNorm.columns)
normData.head(5)

Unnamed: 0,flow_duration,total_fwd_packet,total_bwd_packets,total_length_of_fwd_packet,total_length_of_bwd_packet,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,bwd_packet_length_max,...,fwd_act_data_pkts,fwd_seg_size_min,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
0,-0.377311,-0.044437,-0.053995,-0.003726,-0.04656,3.075022,-0.474292,1.190075,3.450602,-0.380966,...,-0.036528,0.237673,-0.314924,-0.057004,-0.293148,-0.313532,0.63293,-0.659378,-0.463769,0.650771
1,1.621775,-0.062786,-0.065338,-0.018863,-0.046926,-0.103881,-0.474292,-0.154518,0.089382,-0.522705,...,-0.036528,0.237673,-0.314924,-0.057004,-0.293148,-0.313532,-1.465506,1.506739,-0.46353,-1.509516
2,-0.290783,0.295013,0.059432,1.548795,-0.046611,8.110218,-0.474292,17.184021,5.072583,-0.400652,...,1.264708,0.237673,-0.314924,-0.057004,-0.293148,-0.313532,0.633043,-0.659378,-0.462282,0.65083
3,-0.410054,0.038132,0.082118,-0.010714,-0.028506,0.638636,-0.474292,-0.030761,0.348651,0.394661,...,0.305903,0.237673,-0.314924,-0.057004,-0.293148,-0.313532,0.632809,-0.659378,-0.465364,0.650709
4,1.405972,0.010609,-0.008624,-0.013477,-0.045711,-0.115483,-0.474292,-0.062998,0.0104,-0.440024,...,0.168931,0.237673,0.448076,-0.057004,0.370849,0.484196,-1.604441,1.363325,-4.12398,-1.509516


# All Data without Feature Selection

## Processing

In [9]:
start_time = time.time()
xtrain, xtest, ytrain, ytest = train_test_split(normData, yNorm, test_size=0.3, random_state=42)

rf = RandomForestClassifier()

rfModel = rf.fit(xtrain, ytrain)

ypred = rfModel.predict(xtest)

duration = (time.time() - start_time)
print("--- %s seconds ---" % duration)

  rfModel = rf.fit(xtrain, ytrain)


--- 50.94302439689636 seconds ---


In [10]:
print("confusion matrix :",confusion_matrix(ytest, ypred).ravel())
tn, fp, fn, tp = confusion_matrix(ytest, ypred).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / ( tp + fn)
precision = tp / (tp + fp)
f1_score = 2 * (precision * recall) / (precision + recall)

print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
      "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [32266   403   512 32260]
accuracy :  0.9860
recall : 0.9844
precision :  0.9877
f1-measure : 0.9860


In [11]:
# Save Pickle
path = output_path+"/all-feature.pkl"
with open(path, "wb") as f:
    pickle_model = pickle.dump(rfModel, f)

In [12]:
# Store to MongoDB

with open(path, "rb") as f:
    model_col.put(f, filename="RandomForest-AllFeature.pkl")
    
result_col.insert_one({
    'machine_learning': "Random Forest",
    'feature': "All Feature",
    'label': "Binary-Label",
    'duration': duration,
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1_score': f1_score,
    'created_at': datetime.fromtimestamp(time.time())
})

<pymongo.results.InsertOneResult at 0x7f94c8393340>

## Cross Validation

In [13]:
X=np.array(normData)
y=np.array(yNorm.values.ravel())

accList = []
recList = []
precList=[]
f1List=[]

kf= KFold(n_splits=10,shuffle=True)
kf.get_n_splits(X)
for train_index, test_index in kf.split(X) :
    X_train,X_test = X[train_index],X[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    rf = RandomForestClassifier()
    rfModel = rf.fit(X_train,y_train)
    
    y_pred = rfModel.predict(X_test)
    print("confusion matrix :", confusion_matrix(y_test,y_pred).ravel())
    tn,fp,fn,tp = confusion_matrix(y_test,y_pred).ravel()
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    accList.append(accuracy)
    recList.append(recall)
    precList.append(precision)
    f1List.append(f1_score)
    
    print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
          "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [10799   133   166 10716]
accuracy :  0.9863
recall : 0.9847
precision :  0.9877
f1-measure : 0.9862
confusion matrix : [10798   134   167 10715]
accuracy :  0.9862
recall : 0.9847
precision :  0.9876
f1-measure : 0.9861
confusion matrix : [10888   140   178 10608]
accuracy :  0.9854
recall : 0.9835
precision :  0.9870
f1-measure : 0.9852
confusion matrix : [10810   107   166 10731]
accuracy :  0.9875
recall : 0.9848
precision :  0.9901
f1-measure : 0.9874
confusion matrix : [10871   140   167 10635]
accuracy :  0.9859
recall : 0.9845
precision :  0.9870
f1-measure : 0.9858
confusion matrix : [10703   118   152 10840]
accuracy :  0.9876
recall : 0.9862
precision :  0.9892
f1-measure : 0.9877
confusion matrix : [10780   145   194 10694]
accuracy :  0.9845
recall : 0.9822
precision :  0.9866
f1-measure : 0.9844
confusion matrix : [10700   135   148 10830]
accuracy :  0.9870
recall : 0.9865
precision :  0.9877
f1-measure : 0.9871
confusion matrix : [10710   135   176 10

In [14]:
print('accuracy : ',(mean(accList)))
print('recall : ',(mean(recList)))
print('precision : ',(mean(precList)))
print('f1_score : ',(mean(f1List)))

cv_col.insert_one({
    'machine_learning': "Random Forest",
    'feature': "All Feature",
    'label': "Binary-Label",
    'accuracy': (mean(accList)),
    'recall': (mean(recList)),
    'precision': (mean(precList)),
    'f1_score': (mean(f1List)),
    'created_at': datetime.fromtimestamp(time.time())
})

accuracy :  0.9861873864116105
recall :  0.9844106876070974
precision :  0.9879159626681969
f1_score :  0.9861598252250364


<pymongo.results.InsertOneResult at 0x7f94d88adbe0>

# Using Feature Importance

## Processing

In [15]:
importance = normData[[
    'idle_max',
    'fwd_init_win_bytes',
    'fwd_header_length',
    'bwd_packets_per_s',
    'flow_bytes_per_s',
    'bwd_init_win_bytes',
    'flow_iat_max'
]]

In [16]:
start_time = time.time()
xftrain, xftest, yftrain, yftest = train_test_split(importance, yNorm, test_size=0.3, random_state=42)

rf = RandomForestClassifier()

rfModel = rf.fit(xftrain, yftrain)

yfpred = rfModel.predict(xftest)

duration = (time.time() - start_time)
print("--- %s seconds ---" % duration)

  rfModel = rf.fit(xftrain, yftrain)


--- 28.576642751693726 seconds ---


In [17]:
print("confusion matrix :",confusion_matrix(yftest, yfpred).ravel())
tn, fp, fn, tp = confusion_matrix(yftest, yfpred).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / ( tp + fn)
precision = tp / (tp + fp)
f1_score = 2 * (precision * recall) / (precision + recall)

print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
      "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [32162   507   593 32179]
accuracy :  0.9832
recall : 0.9819
precision :  0.9845
f1-measure : 0.9832


In [18]:
# Save Pickle
path = output_path+"/feature-importance.pkl"
with open(path, "wb") as f:
    pickle_model = pickle.dump(rfModel, f)

In [19]:
# Store to MongoDB

with open(path, "rb") as f:
    model_col.put(f, filename="RandomForest-FeatureImportance.pkl")
    
result_col.insert_one({
    'machine_learning': "Random Forest",
    'feature': "Feature Importance",
    'label': "Binary-Label",
    'duration': duration,
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1_score': f1_score,
    'created_at': datetime.fromtimestamp(time.time())
})

<pymongo.results.InsertOneResult at 0x7f94d88b2e20>

## Cross Validation

In [20]:
X=np.array(importance)
y=np.array(yNorm.values.ravel())

accList = []
recList = []
precList=[]
f1List=[]

kf= KFold(n_splits=10,shuffle=True)
kf.get_n_splits(X)
for train_index, test_index in kf.split(X) :
    X_train,X_test = X[train_index],X[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    rf = RandomForestClassifier()
    rfModel = rf.fit(X_train,y_train)
    
    y_pred = rfModel.predict(X_test)
    print("confusion matrix :", confusion_matrix(y_test,y_pred).ravel())
    tn,fp,fn,tp = confusion_matrix(y_test,y_pred).ravel()
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    accList.append(accuracy)
    recList.append(recall)
    precList.append(precision)
    f1List.append(f1_score)
    
    print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
          "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [10722   197   175 10720]
accuracy :  0.9829
recall : 0.9839
precision :  0.9820
f1-measure : 0.9829
confusion matrix : [10659   174   193 10788]
accuracy :  0.9832
recall : 0.9824
precision :  0.9841
f1-measure : 0.9833
confusion matrix : [10710   164   180 10760]
accuracy :  0.9842
recall : 0.9835
precision :  0.9850
f1-measure : 0.9843
confusion matrix : [10663   170   214 10767]
accuracy :  0.9824
recall : 0.9805
precision :  0.9845
f1-measure : 0.9825
confusion matrix : [10801   166   195 10651]
accuracy :  0.9835
recall : 0.9820
precision :  0.9847
f1-measure : 0.9833
confusion matrix : [10643   169   178 10823]
accuracy :  0.9841
recall : 0.9838
precision :  0.9846
f1-measure : 0.9842
confusion matrix : [10778   158   201 10676]
accuracy :  0.9835
recall : 0.9815
precision :  0.9854
f1-measure : 0.9835
confusion matrix : [10778   187   200 10648]
accuracy :  0.9823
recall : 0.9816
precision :  0.9827
f1-measure : 0.9822
confusion matrix : [10711   187   192 10

In [21]:
print('accuracy : ',(mean(accList)))
print('recall : ',(mean(recList)))
print('precision : ',(mean(precList)))
print('f1_score : ',(mean(f1List)))

cv_col.insert_one({
    'machine_learning': "Random Forest",
    'feature': "Feature Importance",
    'label': "Binary-Label",
    'accuracy': (mean(accList)),
    'recall': (mean(recList)),
    'precision': (mean(precList)),
    'f1_score': (mean(f1List)),
    'created_at': datetime.fromtimestamp(time.time())
})

accuracy :  0.9832671675165711
recall :  0.9825061505916469
precision :  0.9840041287028097
f1_score :  0.983253877164383


<pymongo.results.InsertOneResult at 0x7f94d88b4490>

# Using PCA

## Processing

In [22]:
start_time = time.time()
pca = PCA(n_components=7)
pca_dataset = pd.DataFrame(pca.fit_transform(normData), columns=[
    "Feature 1",
    "Feature 2",
    "Feature 3",
    "Feature 4",
    "Feature 5",
    "Feature 6",
    "Feature 7"
])
#pca_dataset["Label"] = pca_label["Label"]
print("--- %s seconds ---" % (time.time() - start_time))

--- 3.5600757598876953 seconds ---


In [23]:
print(pca.explained_variance_)

[10.88449402  9.09396116  6.65470816  5.49491754  4.40487123  3.77902179
  3.05000117]


In [24]:
a = 0
for i in pca.explained_variance_:
  a = a+i
print(a)

43.36197506474955


In [25]:
start_time = time.time()
xptrain, xptest, yptrain, yptest = train_test_split(pca_dataset, yNorm, test_size=0.3, random_state=42)

rf = RandomForestClassifier()

rfModel = rf.fit(xptrain, yptrain)

yppred = rfModel.predict(xptest)

duration = (time.time() - start_time)
print("--- %s seconds ---" % duration)

  rfModel = rf.fit(xptrain, yptrain)


--- 48.01999115943909 seconds ---


In [26]:
print("confusion matrix :",confusion_matrix(yptest, yppred).ravel())
tn, fp, fn, tp = confusion_matrix(yptest, yppred).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / ( tp + fn)
precision = tp / (tp + fp)
f1_score = 2 * (precision * recall) / (precision + recall)

print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
      "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [32002   667   648 32124]
accuracy :  0.9799
recall : 0.9802
precision :  0.9797
f1-measure : 0.9799


In [27]:
# Save Pickle
path = output_path+"/pca.pkl"
with open(path, "wb") as f:
    pickle_model = pickle.dump(rfModel, f)

In [28]:
# Store to MongoDB

with open(path, "rb") as f:
    model_col.put(f, filename="RandomForest-PCA.pkl")
    
result_col.insert_one({
    'machine_learning': "Random Forest",
    'feature': "PCA",
    'label': "Binary-Label",
    'duration': duration,
    'accuracy': accuracy,
    'recall': recall,
    'precision': precision,
    'f1_score': f1_score,
    'created_at': datetime.fromtimestamp(time.time())
})

<pymongo.results.InsertOneResult at 0x7f94d88ad910>

## Cross Validation

In [29]:
X=np.array(pca_dataset)
y=np.array(yNorm.values.ravel())

accList = []
recList = []
precList=[]
f1List=[]

kf= KFold(n_splits=10,shuffle=True)
kf.get_n_splits(X)
for train_index, test_index in kf.split(X) :
    X_train,X_test = X[train_index],X[test_index]
    y_train,y_test = y[train_index],y[test_index]
    
    rf = RandomForestClassifier()
    rfModel = rf.fit(X_train,y_train)
    
    y_pred = rfModel.predict(X_test)
    print("confusion matrix :", confusion_matrix(y_test,y_pred).ravel())
    tn,fp,fn,tp = confusion_matrix(y_test,y_pred).ravel()
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    accList.append(accuracy)
    recList.append(recall)
    precList.append(precision)
    f1List.append(f1_score)
    
    print(str('accuracy :  %0.4f' % accuracy) +"\n" + str('recall : %0.4f' % recall) +
          "\n" + str('precision :  %0.4f' %precision) + "\n" + str('f1-measure : %0.4f' %f1_score))

confusion matrix : [10749   213   225 10627]
accuracy :  0.9799
recall : 0.9793
precision :  0.9804
f1-measure : 0.9798
confusion matrix : [10653   219   251 10691]
accuracy :  0.9785
recall : 0.9771
precision :  0.9799
f1-measure : 0.9785
confusion matrix : [10628   217   201 10768]
accuracy :  0.9808
recall : 0.9817
precision :  0.9802
f1-measure : 0.9810
confusion matrix : [10678   207   235 10694]
accuracy :  0.9797
recall : 0.9785
precision :  0.9810
f1-measure : 0.9798
confusion matrix : [10861   193   183 10576]
accuracy :  0.9828
recall : 0.9830
precision :  0.9821
f1-measure : 0.9825
confusion matrix : [10611   190   212 10800]
accuracy :  0.9816
recall : 0.9807
precision :  0.9827
f1-measure : 0.9817
confusion matrix : [10724   214   207 10668]
accuracy :  0.9807
recall : 0.9810
precision :  0.9803
f1-measure : 0.9806
confusion matrix : [10714   242   223 10634]
accuracy :  0.9787
recall : 0.9795
precision :  0.9777
f1-measure : 0.9786
confusion matrix : [10694   211   216 10

In [30]:
print('accuracy : ',(mean(accList)))
print('recall : ',(mean(recList)))
print('precision : ',(mean(precList)))
print('f1_score : ',(mean(f1List)))

cv_col.insert_one({
    'machine_learning': "Random Forest",
    'feature': "PCA",
    'label': "Binary-Label",
    'accuracy': (mean(accList)),
    'recall': (mean(recList)),
    'precision': (mean(precList)),
    'f1_score': (mean(f1List)),
    'created_at': datetime.fromtimestamp(time.time())
})

accuracy :  0.9805257463387325
recall :  0.9802439686754776
precision :  0.9807975688489882
f1_score :  0.9805200601748278


<pymongo.results.InsertOneResult at 0x7f94d885f4f0>