In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2OSupportVectorMachineEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.estimators import H2ONaiveBayesEstimator
from h2o.estimators import H2ODeepLearningEstimator
from h2o.estimators import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ORandomForestEstimator

In [2]:
h2o.init(ip="localhost",port=54321)
m_models = {
    "bayes":H2ONaiveBayesEstimator(),
    "glm":H2OGeneralizedLinearEstimator(nfolds = 4),
    "rf":H2ORandomForestEstimator(nfolds = 4),
    "gbm":H2OGradientBoostingEstimator(nfolds=4),
    #"svm":H2OSupportVectorMachineEstimator(),
    "xgboost":H2OXGBoostEstimator(nfolds=4),
    "ann":H2ODeepLearningEstimator(nfolds=4)
}

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,44 days 16 hours 14 mins
H2O_cluster_timezone:,Asia/Shanghai
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.3
H2O_cluster_version_age:,7 months and 12 days !!!
H2O_cluster_name:,root
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,18.29 Gb
H2O_cluster_total_cores:,20
H2O_cluster_allowed_cores:,20


In [3]:
df_raw = pd.read_csv('/data/students/yang/Untitled Folder/all.csv')
del df_raw['Unnamed: 0']
# df_raw['detailed-label'] = df_raw['detailed-label'].replace(to_replace= '-',value= 'Benign')
df_raw['proto'] = df_raw['proto'].map({"tcp":6,"udp":17,"icmp":1})
df_raw['detailed_label'].value_counts()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


PartOfAHorizontalPortScan    145719
Benign                        19539
C&C                           14936
DDoS                          14394
Attack                         5962
C&C-Torii                        30
C&C-FileDownload                  4
FileDownload                      2
Name: detailed_label, dtype: int64

In [4]:
df_input = df_raw[(df_raw['detailed_label']=="Attack")|(df_raw['detailed_label']=="Benign")|(df_raw['detailed_label']=="C&C")|(df_raw['detailed_label']=="DDoS")]
df_input['detailed_label'].value_counts()

Benign    19539
C&C       14936
DDoS      14394
Attack     5962
Name: detailed_label, dtype: int64

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(df_input, df_input['detailed_label'], test_size=0.20)
X_train

Unnamed: 0,detailed_label,ip_tos,ip_len,ip_id,ip_flags,ip_ttl,ip_proto,tcp_sport,tcp_dport,tcp_seq,...,orig_bytes,resp_bytes,conn_state,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes
178800,Benign,0.0,67.0,33217.0,DF,64.0,17.0,,,,...,78,0,S0,-,0.0,D,2.0,134.0,0.0,0.0
200570,Benign,0.0,195.0,43417.0,DF,64.0,6.0,32806.0,80.0,3.880792e+09,...,,,,,,,,,,
190559,DDoS,0.0,1064.0,65279.0,,255.0,6.0,65279.0,80.0,4.278190e+09,...,-,-,OTH,-,0.0,C,0.0,0.0,0.0,0.0
199843,Benign,184.0,76.0,48892.0,DF,64.0,17.0,,,,...,,,,,,,,,,
90642,Attack,0.0,60.0,34832.0,DF,64.0,6.0,60497.0,22.0,1.212931e+09,...,589,2241,SF,-,0.0,ShAdDaFf,15.0,1377.0,15.0,3029.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169822,Benign,0.0,67.0,13040.0,DF,64.0,17.0,,,,...,78,0,S0,-,0.0,D,2.0,134.0,0.0,0.0
7222,Benign,0.0,60.0,2622.0,DF,64.0,6.0,47966.0,22.0,7.694421e+07,...,0,0,REJ,-,0.0,Sr,1.0,60.0,1.0,40.0
178620,C&C,0.0,60.0,20295.0,DF,64.0,6.0,53858.0,6667.0,1.598013e+09,...,62,269,S3,-,0.0,ShAdDaf,9.0,546.0,6.0,589.0
165512,C&C,0.0,60.0,21172.0,DF,64.0,6.0,37848.0,50.0,1.893652e+09,...,-,-,S0,-,0.0,S,1.0,60.0,0.0,0.0


In [6]:
def Train(df_all):
    df_h2o = h2o.H2OFrame(df_all)
    df_train, df_valid = df_h2o.split_frame(ratios=[0.85], seed=1234)
    x = df_train.columns
    y = "detailed_label"
    x.remove(y)
    df_train[y] = df_train[y].asfactor()
    df_valid[y] = df_valid[y].asfactor()
    for key in m_models:
        print("begin train ",key)
        m_models[key].train(x=x, y=y,training_frame=df_train,validation_frame=df_valid)
        print("end train ",key)

def Predict( df_all ):
    df_h2o = h2o.H2OFrame(df_all)
    x = df_h2o.columns
    y = "detailed_label"
    x.remove(y)
    df_h2o[y] = df_h2o[y].asfactor()
    ret = {}
    for key in m_models:
        pred = m_models[key].predict(df_h2o).as_data_frame()
        y_true = df_all[y].to_list()
        y_pred = pred['predict']
        ret[key] = {}
        ret[key]['y_pred'] = y_pred.to_list()
        ret[key]['y_true'] = y_true
    return ret

In [11]:
Train(X_train)
result = Predict(X_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
begin train  bayes
naivebayes Model Build progress: |



███████████████████████████████████████████████| (done) 100%
end train  bayes
begin train  glm
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
end train  glm
begin train  rf
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
end train  rf
begin train  gbm
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
end train  gbm
begin train  xgboost
xgboost Model Build progress: |██████████████████████████████████████████████████| (done) 100%
end train  xgboost
begin train  ann
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%
end train  ann
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
naivebayes prediction progress: |████████████████████████████████████████████████| (done) 100%
glm prediction progress: |



███████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%


In [12]:
from sklearn.metrics import classification_report
for model in result.keys():
    y_pred = result[model]['y_pred']
    y_true = result[model]['y_true']
    print(model)
    print(classification_report(y_true, y_pred, target_names=['Attack','Benign','C&C','DDoS'],digits=6))

bayes
              precision    recall  f1-score   support

      Attack   0.726886  0.988420  0.837715      1209
      Benign   1.000000  0.002345  0.004679      3838
         C&C   0.473403  1.000000  0.642599      3017
        DDoS   0.979599  0.992422  0.985969      2903

    accuracy                       0.647579     10967
   macro avg   0.794972  0.745797  0.617740     10967
weighted avg   0.819626  0.647579  0.531754     10967

glm
              precision    recall  f1-score   support

      Attack   0.965658  0.976840  0.971217      1209
      Benign   0.992610  0.979937  0.986233      3838
         C&C   0.986593  1.000000  0.993251      3017
        DDoS   0.998619  0.996555  0.997586      2903

    accuracy                       0.989514     10967
   macro avg   0.985870  0.988333  0.987072     10967
weighted avg   0.989574  0.989514  0.989514     10967

rf
              precision    recall  f1-score   support

      Attack   1.000000  0.999173  0.999586      1209
      Be

In [13]:
for key in m_models:
    print(key)
    varimp = m_models[key].varimp(use_pandas=True)
    display(varimp)

bayes


None

glm


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,orig_bytes,1.017612,1.000000,0.172794
1,ip_len,0.619573,0.608849,0.105206
2,history.S,0.597460,0.587120,0.101451
3,ip_ttl,0.512473,0.503604,0.087020
4,conn_state.S0,0.460848,0.452872,0.078254
...,...,...,...,...
3459,missed_bytes,0.000000,0.000000,0.000000
3460,orig_pkts,0.000000,0.000000,0.000000
3461,orig_ip_bytes,0.000000,0.000000,0.000000
3462,resp_pkts,0.000000,0.000000,0.000000


rf


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,conn_state,130419.367188,1.0,0.111268
1,history,118598.65625,0.909364,0.101183
2,tcp_window,100236.328125,0.768569,0.085517
3,proto,88913.875,0.681754,0.075857
4,tcp_dport,87786.304688,0.673108,0.074895
5,tcp_sport,81293.140625,0.623321,0.069356
6,tcp_dataofs,78476.796875,0.601727,0.066953
7,tcp_flags,76353.21875,0.585444,0.065141
8,ip_len,56368.257812,0.432208,0.048091
9,resp_bytes,48232.617188,0.369827,0.04115


gbm


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,history,87119.992188,1.0,0.722079
1,tcp_flags,22914.892578,0.263027,0.189926
2,tcp_window,9453.052734,0.108506,0.07835
3,ip_len,658.901489,0.007563,0.005461
4,tcp_dport,168.538925,0.001935,0.001397
5,id.resp_p,134.931229,0.001549,0.001118
6,tcp_sport,127.383995,0.001462,0.001056
7,id.orig_p,65.610863,0.000753,0.000544
8,orig_pkts,8.344274,9.6e-05,6.9e-05
9,ip_tos,0.0,0.0,0.0


xgboost


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,tcp_window,55282.105469,1.0,0.298645
1,history.S,41529.652344,0.751231,0.224352
2,proto,38192.109375,0.690858,0.206322
3,resp_bytes,17242.314453,0.311897,0.093146
4,resp_ip_bytes,10071.81543,0.182189,0.05441
5,service.ssh,9221.107422,0.166801,0.049814
6,service.irc,8403.003906,0.152002,0.045395
7,tcp_seq,897.364441,0.016232,0.004848
8,tcp_dataofs,766.029297,0.013857,0.004138
9,conn_state.SF,752.868835,0.013619,0.004067


ann


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,conn_state.SF,1.000000,1.000000,0.001827
1,tcp_flags.nan,0.668393,0.668393,0.001221
2,tcp_flags.SN,0.607624,0.607624,0.001110
3,ip_flags.DF,0.593314,0.593314,0.001084
4,resp_pkts,0.523179,0.523179,0.000956
...,...,...,...,...
3467,udp_len.missing(NA),0.000000,0.000000,0.000000
3468,conn_state.missing(NA),0.000000,0.000000,0.000000
3469,tcp_flags.missing(NA),0.000000,0.000000,0.000000
3470,service.missing(NA),0.000000,0.000000,0.000000
