In [28]:
import numpy as np
import pandas as pd
import json,os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2OSupportVectorMachineEstimator
from h2o.estimators import H2ODeepLearningEstimator
from h2o.estimators import H2OSupportVectorMachineEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.estimators import H2ONaiveBayesEstimator
from h2o.estimators import H2ODeepLearningEstimator
from h2o.estimators import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ORandomForestEstimator

In [29]:
# Initialise the H2O client against the address this notebook uses (localhost:54321).
h2o.init(ip="localhost", port=54321)

# Estimators under comparison, keyed by the short name used in later cells.
# Insertion order is the order in which they are trained and reported.
# Every estimator except naive Bayes is configured with 4-fold cross-validation.
# An SVM entry (H2OSupportVectorMachineEstimator) is currently disabled.
m_models = dict(
    bayes=H2ONaiveBayesEstimator(),
    glm=H2OGeneralizedLinearEstimator(nfolds=4),
    rf=H2ORandomForestEstimator(nfolds=4),
    gbm=H2OGradientBoostingEstimator(nfolds=4),
    xgboost=H2OXGBoostEstimator(nfolds=4),
    ann=H2ODeepLearningEstimator(nfolds=4),
)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,39 days 17 hours 45 mins
H2O_cluster_timezone:,Asia/Shanghai
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.3
H2O_cluster_version_age:,7 months and 7 days !!!
H2O_cluster_name:,root
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,19.93 Gb
H2O_cluster_total_cores:,20
H2O_cluster_allowed_cores:,20


In [30]:
# Load the Zeek connection log and discard the columns that are not used as model inputs.
df_raw = pd.read_csv('./zeek.csv').drop(
    columns=[
        'ts',
        'uid',
        'id.orig_h',
        'id.resp_h',
        'tunnel_parents',
        'service',
        'local_orig',
        'label',
    ]
)

# Rows whose detailed label is exactly '-' are relabelled as 'Benign'.
df_raw['detailed-label'] = df_raw['detailed-label'].replace(to_replace='-', value='Benign')
# Protocol names become numeric codes; a name missing from the mapping becomes NaN.
df_raw['proto'] = df_raw['proto'].map({"tcp": 6, "udp": 17, "icmp": 1})

# Class distribution after relabelling.
df_raw['detailed-label'].value_counts()

PartOfAHorizontalPortScan    145719
Benign                        19525
C&C                           14936
DDoS                          14394
Attack                         5962
C&C-Torii                        30
C&C-FileDownload                  3
FileDownload                      2
Name: detailed-label, dtype: int64

In [31]:
# Keep only the four classes that are modelled; every other detailed label is dropped.
df_input = df_raw[df_raw['detailed-label'].isin(["Attack", "Benign", "C&C", "DDoS"])]
df_input['detailed-label'].value_counts()

Benign    19525
C&C       14936
DDoS      14394
Attack     5962
Name: detailed-label, dtype: int64

In [32]:
# Hold out 20% of the rows for evaluation.
# The label column deliberately stays inside X_train / X_test: Train() and Predict()
# read 'detailed-label' from the frame they are given, so Y_train / Y_test are
# redundant copies of that column.
# random_state pins the shuffle so the split, and every metric computed from it,
# is the same on each Restart & Run All (1234 matches the seed used in Train()).
X_train, X_test, Y_train, Y_test = train_test_split(
    df_input,
    df_input['detailed-label'],
    test_size=0.20,
    random_state=1234,
)
X_train

Unnamed: 0,id.orig_p,id.resp_p,proto,duration,orig_bytes,resp_bytes,conn_state,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,detailed-label
156116,123,123,17,0.001734,48,48,SF,-,0,Dd,1,76,1,76,Benign
192937,65279,80,6,-,-,-,OTH,-,0,C,0,0,0,0,DDoS
156234,43332,50,6,3.117974,0,0,S0,-,0,S,3,180,0,0,C&C
176928,123,123,17,-,-,-,S0,-,0,D,1,76,0,0,Benign
159558,44668,50,6,-,-,-,S0,-,0,S,1,60,0,0,C&C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93301,11,0,1,-,-,-,OTH,-,0,-,1,56,0,0,Benign
193134,65279,80,6,-,-,-,OTH,-,0,C,0,0,0,0,DDoS
68444,123,123,17,0.124131,48,48,SF,-,0,Dd,1,76,1,76,Benign
190957,65279,80,6,-,-,-,OTH,-,0,C,0,0,0,0,DDoS


In [33]:
def Train(df_all):
    """Fit every estimator registered in ``m_models`` on ``df_all``.

    ``df_all`` is converted to an H2OFrame and split 85/15 (seed 1234) into a
    training and a validation frame. The ``detailed-label`` column is the
    response and is converted to a factor in both frames; all remaining columns
    are used as predictors. Each estimator is trained in place, so the fitted
    models are available through ``m_models`` afterwards. Returns nothing.
    """
    target = "detailed-label"

    frame = h2o.H2OFrame(df_all)
    train_part, valid_part = frame.split_frame(ratios=[0.85], seed=1234)

    # Predictors are all columns except the response.
    predictors = list(train_part.columns)
    predictors.remove(target)

    # The response is converted to a factor in both frames before training.
    train_part[target] = train_part[target].asfactor()
    valid_part[target] = valid_part[target].asfactor()

    for name, estimator in m_models.items():
        print("begin train ", name)
        estimator.train(
            x=predictors,
            y=target,
            training_frame=train_part,
            validation_frame=valid_part,
        )
        print("end train ", name)

def Predict( df_all ):
    """Score ``df_all`` with every estimator in ``m_models``.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Rows to score. Must contain the ``detailed-label`` column, which
        supplies the ground-truth labels.

    Returns
    -------
    dict
        ``{model_name: {'y_pred': [...], 'y_true': [...]}}`` where ``y_pred`` is
        the ``predict`` column of that model's output and ``y_true`` is the
        ``detailed-label`` column of ``df_all``, both as plain lists.
    """
    y = "detailed-label"

    df_h2o = h2o.H2OFrame(df_all)
    df_h2o[y] = df_h2o[y].asfactor()

    # The ground truth does not depend on the model, so read it once rather
    # than once per estimator.
    y_true = df_all[y].to_list()

    ret = {}
    for key, model in m_models.items():
        pred = model.predict(df_h2o).as_data_frame()
        ret[key] = {
            'y_pred': pred['predict'].to_list(),
            # Each entry gets its own list so mutating one cannot affect another.
            'y_true': list(y_true),
        }
    return ret

In [34]:
# Fit every estimator in m_models on the training split, then collect each model's
# predictions together with the true labels for the held-out split.
# `result` maps model name -> {'y_pred': [...], 'y_true': [...]}.
Train(X_train)
result = Predict(X_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
begin train  bayes
naivebayes Model Build progress: |



███████████████████████████████████████████████| (done) 100%
end train  bayes
begin train  glm
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
end train  glm
begin train  rf
drf Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%
end train  rf
begin train  gbm
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
end train  gbm
begin train  xgboost
xgboost Model Build progress: |██████████████████████████████████████████████████| (done) 100%
end train  xgboost
begin train  ann
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%
end train  ann
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
naivebayes prediction progress: |████████████████████████████████████████████████| (done) 100%
glm prediction progress: |



███████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%


In [35]:
from sklearn.metrics import classification_report

# Print a per-class precision / recall / F1 report for every model.
# y_true / y_pred already hold the class names as strings, so the report labels its
# rows from the data (in sorted label order). Passing a hard-coded `target_names`
# list would only duplicate those names, and it would silently mislabel rows or
# raise if the set of classes kept by the filter cell ever changed.
for model, outcome in result.items():
    print(model)
    print(classification_report(outcome['y_true'], outcome['y_pred'], digits=6))

bayes
              precision    recall  f1-score   support

      Attack   0.983663  0.976109  0.979872      1172
      Benign   0.998570  0.703601  0.825528      3971
         C&C   0.582552  0.417199  0.486201      2977
        DDoS   0.579347  0.992264  0.731562      2844

    accuracy                       0.729843     10964
   macro avg   0.786033  0.772293  0.755791     10964
weighted avg   0.775273  0.729843  0.725517     10964

glm
              precision    recall  f1-score   support

      Attack   0.990702  1.000000  0.995329      1172
      Benign   0.990677  0.909846  0.948543      3971
         C&C   0.885414  0.988915  0.934307      2977
        DDoS   0.998932  0.986639  0.992747      2844

    accuracy                       0.960872     10964
   macro avg   0.966431  0.971350  0.967731     10964
weighted avg   0.964239  0.960872  0.961145     10964

rf
              precision    recall  f1-score   support

      Attack   0.998296  1.000000  0.999147      1172
      Be

In [37]:
# Show the variable-importance table of each trained model, preceded by its name.
# Models that expose no table are displayed as None.
for key, estimator in m_models.items():
    print(key)
    display(estimator.varimp(use_pandas=True))

bayes


None

glm


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,conn_state.SF,5.477256,1.000000,0.130086
1,history.C,5.117614,0.934339,0.121545
2,conn_state.OTH,4.108648,0.750129,0.097581
3,history.S,3.603069,0.657824,0.085574
4,history.-,3.249228,0.593222,0.077170
...,...,...,...,...
109,conn_state.S2,0.000000,0.000000,0.000000
110,conn_state.SH,0.000000,0.000000,0.000000
111,conn_state.SHR,0.000000,0.000000,0.000000
112,orig_bytes,0.000000,0.000000,0.000000


rf


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,history,420364.0625,1.0,0.390739
1,id.orig_p,201310.25,0.478895,0.187123
2,conn_state,154858.921875,0.368392,0.143945
3,id.resp_p,101601.226562,0.241698,0.094441
4,proto,80541.390625,0.191599,0.074865
5,resp_ip_bytes,66421.546875,0.15801,0.06174
6,resp_bytes,24911.839844,0.059263,0.023156
7,resp_pkts,22012.664062,0.052366,0.020461
8,orig_pkts,1162.718018,0.002766,0.001081
9,orig_ip_bytes,1040.612671,0.002476,0.000967


gbm


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,history,108883.335938,1.0,0.9028162
1,id.resp_p,10407.068359,0.09557999,0.08629117
2,id.orig_p,1277.064575,0.01172874,0.0105889
3,orig_pkts,31.567335,0.0002899189,0.0002617435
4,conn_state,3.144727,2.888162e-05,2.60748e-05
5,orig_ip_bytes,1.408436,1.293528e-05,1.167818e-05
6,resp_bytes,0.309832,2.845544e-06,2.569004e-06
7,duration,0.107574,9.879762e-07,8.919609e-07
8,resp_ip_bytes,0.046735,4.292206e-07,3.875073e-07
9,resp_pkts,0.036663,3.367182e-07,3.039946e-07


xgboost


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,orig_ip_bytes,51035.972656,1.0,0.2758443
1,history.S,38142.597656,0.7473669,0.2061569
2,proto,30682.765625,0.6011988,0.1658373
3,resp_bytes,27603.279297,0.5408593,0.1491929
4,id.resp_p,22043.451172,0.4319199,0.1191426
5,resp_ip_bytes,10209.879883,0.2000526,0.05518337
6,id.orig_p,2797.165771,0.05480773,0.0151184
7,conn_state.SF,1025.716797,0.02009792,0.005543896
8,conn_state.RSTO,507.952515,0.009952833,0.002745432
9,orig_bytes,252.908798,0.004955501,0.001366947


ann


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,history.C,1.000000,1.000000,0.019042
1,conn_state.SF,0.771778,0.771778,0.014696
2,proto,0.762397,0.762397,0.014518
3,id.resp_p,0.692013,0.692013,0.013178
4,history.S,0.671785,0.671785,0.012792
...,...,...,...,...
111,history.ShAdD,0.391556,0.391556,0.007456
112,history.ShAdfFa,0.391172,0.391172,0.007449
113,history.ShAdDtaFf,0.382218,0.382218,0.007278
114,history.missing(NA),0.000000,0.000000,0.000000


In [45]:
# Scaled importances of the random-forest model as a plain Python list,
# in the order the importance table returns them.
varimp = m_models['rf'].varimp(use_pandas=True)
varimp.loc[:, 'scaled_importance'].tolist()

[1.0,
 0.4788950054454286,
 0.3683923905245825,
 0.24169817457337947,
 0.19159913467864537,
 0.15800957503354607,
 0.0592625347076381,
 0.05236571349983087,
 0.0027659786392375658,
 0.0024755034117561786,
 0.0022880760678058083,
 0.001461226701278282,
 4.283496736605035e-05]

In [54]:
# F1 score used to weight each model's importances, keyed by model name so the
# pairing no longer depends on the iteration order of m_models or on a manual counter.
# NOTE(review): the glm value matches the weighted-avg f1-score printed by the
# classification report; the other values are presumably the same metric. TODO confirm.
f1_by_model = {
    "glm": 0.961145,
    "rf": 0.994994,
    "gbm": 0.998906,
    "xgboost": 0.999818,
    "ann": 0.996082,
}
# Kept as a plain list (same values, same order) for any cell that still reads `f1`.
f1 = list(f1_by_model.values())

# Rows of [variable, scaled_importance * model F1], concatenated over all weighted models.
p = []
for key in m_models:
    print(key)
    weight = f1_by_model.get(key)
    if weight is None:
        # No F1 weight recorded for this model (currently only 'bayes'): skip it.
        continue
    varimp = m_models[key].varimp(use_pandas=True)
    if varimp is None:
        # The model exposes no variable-importance table, so there is nothing to weight.
        continue
    weighted = varimp[['variable', 'scaled_importance']].copy()
    weighted['scaled_importance'] = weighted['scaled_importance'] * weight
    p += weighted.values.tolist()

p

bayes
glm
rf
gbm
xgboost
ann


[['conn_state.SF', 0.961145],
 ['history.C', 0.8980352696785605],
 ['conn_state.OTH', 0.7209826309103673],
 ['history.S', 0.6322639844256431],
 ['history.-', 0.5701722447529903],
 ['proto', 0.5483592890350075],
 ['history.ShAdDaFf', 0.5100801714065508],
 ['id.resp_p', 0.3513647523243964],
 ['id.orig_p', 0.321457574395354],
 ['conn_state.S0', 0.3213654900036895],
 ['conn_state.S3', 0.2903628592490466],
 ['conn_state.REJ', 0.2644587165933583],
 ['history.Sr', 0.2644587165933583],
 ['history.Dd', 0.20814011602274313],
 ['history.ShAdDaf', 0.1428929484065529],
 ['resp_bytes', 0.07770615756806408],
 ['missed_bytes', 0.05736467988748645],
 ['orig_pkts', 0.054011480735431466],
 ['resp_ip_bytes', 0.052359803249844517],
 ['resp_pkts', 0.03540644336529867],
 ['history.D', 0.03435520495867405],
 ['history.ShAdDaFRfR', 0.022958861983416623],
 ['conn_state.RSTO', 0.02071658377426368],
 ['duration', 0.015435317989673361],
 ['history.ShAdDaft', 0.012666885841346703],
 ['history.Aa', 0.0],
 ['history.

In [55]:
# Write the F1-weighted importance rows to disk.
# NOTE: `p` holds one row per (model, variable), so a variable reported by several
# models appears several times; the rows are not merged or aggregated here.
a = pd.DataFrame(p)
a.to_csv('a.csv', index=False)