In [1]:
import argparse
import itertools
from collections import defaultdict

import numpy as np
import pandas as pd
import pydotplus
from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text

In [2]:
# Labeled CSV inputs: the first is used for fitting, the second for evaluation.
train_file, test_file = "16-09-23-labeled.csv", "16-09-24-labeled.csv"

## Data Loading

In [3]:
# Column names applied to the CSV files when they are read (the files are
# loaded with ``names=feature_names``).  The last column, "class", is the label.
feature_names = (
    ["frame_len", "eth_type"]
    + ["ip_proto", "ip_flags"]
    + ["ipv6_nxt", "ipv6_opt"]
    + ["tcp_srcport", "tcp_dstport", "tcp_flags"]
    + ["udp_srcport", "udp_dstport"]
    + ["class"]
)

In [4]:
# Read both CSV files with the same explicit column names, then print a
# dtype / null-count summary of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, names=feature_names) for csv_path in (train_file, test_file)
)
for _frame in (train_df, test_df):
    _frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 947072 entries, 0 to 947071
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   frame_len    947072 non-null  int64 
 1   eth_type     947072 non-null  object
 2   ip_proto     947072 non-null  int64 
 3   ip_flags     947072 non-null  object
 4   ipv6_nxt     947072 non-null  int64 
 5   ipv6_opt     947072 non-null  int64 
 6   tcp_srcport  947072 non-null  int64 
 7   tcp_dstport  947072 non-null  int64 
 8   tcp_flags    947072 non-null  object
 9   udp_srcport  947072 non-null  int64 
 10  udp_dstport  947072 non-null  int64 
 11  class        947072 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 86.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799235 entries, 0 to 799234
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   frame_len    799235 non-null  int64 
 1   eth_type

## Data Preprocessing

In [5]:
def obj2int(df, columns=("eth_type", "ip_flags", "tcp_flags")):
    """Convert hexadecimal string columns of ``df`` to integers, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert; it is modified in place.
    columns : iterable of str, optional
        Names of the columns whose values are base-16 strings.  Defaults to
        the three columns the notebook converts.

    Returns
    -------
    None

    Notes
    -----
    A column that already has a numeric dtype is left untouched, so running
    the cell that calls this function a second time is a no-op instead of
    raising ``TypeError`` (``int(<int>, base=16)`` is not allowed).
    This assumes a numeric column has already been converted — confirm if a
    CSV could hold undecorated all-digit hex values that pandas parses as ints.
    """
    for col in columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already converted on a previous run.
            continue
        df[col] = df[col].apply(int, base=16)

In [6]:
# Convert the hex-string columns of both frames (each frame is modified in place).
for _frame in (train_df, test_df):
    obj2int(_frame)

### Feature Selection

In [7]:
from sklearn.feature_selection import SelectKBest, f_classif

In [8]:
# Score every column except the last one (the label) against the class with
# the ANOVA F-test and list the features from highest to lowest score.
X, Y = train_df.iloc[:, :-1], train_df["class"]
kbest = SelectKBest(f_classif, k=11).fit(X, Y)
sc = pd.DataFrame(
    {"feature": list(X.columns), "scores": kbest.scores_}
).sort_values("scores", ascending=False)
display(sc)

Unnamed: 0,feature,scores
7,tcp_dstport,65902.683596
6,tcp_srcport,40358.782288
9,udp_srcport,26119.902242
3,ip_flags,9489.374402
8,tcp_flags,8881.530829
4,ipv6_nxt,7728.277404
1,eth_type,7691.92221
0,frame_len,7352.275173
10,udp_dstport,4659.16698
2,ip_proto,1363.429444


In [9]:
# Keep the names of the selectN best-scoring features (sc is sorted by score, descending).
selectN = 5
selected_features = sc["feature"].iloc[:selectN].tolist()
print(selected_features)

['tcp_dstport', 'tcp_srcport', 'udp_srcport', 'ip_flags', 'tcp_flags']


In [10]:
# Feature matrices restricted to the selected columns, plus the label vectors.
train_X, train_Y = train_df[selected_features], train_df["class"]
test_X, test_Y = test_df[selected_features], test_df["class"]

## Model Training

In [11]:
from sklearn.model_selection import cross_val_score

### Decision Tree
#### Train

In [115]:
# Depth-limited decision tree; report the mean accuracy over 10 cross-validation folds.
dt = DecisionTreeClassifier(max_depth=5)
scores = cross_val_score(estimator=dt, X=train_X, y=train_Y, scoring="accuracy", cv=10)
print(scores.mean())

0.8597499068153167


In [116]:
# Fit on the full training set and print the accuracy on that same data.
dt.fit(X=train_X, y=train_Y)
train_accuracy = dt.score(train_X, train_Y)
print(train_accuracy)

0.8960807626030545


#### Predict

In [117]:
# Evaluate the decision tree on the held-out file.
# zero_division=0 keeps the per-class score at 0 when a class is never
# predicted (the default behaviour) but does not emit the
# UndefinedMetricWarning that previously interrupted the printed output.
y = dt.predict(test_X)
print("Accuracy: %1.3f" % accuracy_score(test_Y, y))
print("Precision: %1.3f" % precision_score(test_Y, y, average="weighted", zero_division=0))
print("Recall: %1.3f" % recall_score(test_Y, y, average="weighted", zero_division=0))
print("F1: %1.3f\n" % f1_score(test_Y, y, average="weighted", zero_division=0))

Accuracy: 0.720
Precision: 0.725


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.720
F1: 0.656



### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import mode
from sklearn.ensemble.forest import _partition_estimators, _accumulate_prediction
from sklearn.tree._tree import DTYPE
from sklearn.externals.joblib import Parallel, delayed
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
import threading
from sklearn.utils.fixes import _joblib_parallel_args



In [16]:
def _parallel_helper(obj, methodname, *args, **kwargs):
    return getattr(obj, methodname)(*args, **kwargs)

def predict_majvote(_forest, X):
    """Predict class for X by majority vote over the trees of ``_forest``.

    Uses majority voting, rather than the soft voting scheme
    used by RandomForestClassifier.predict.

    Parameters
    ----------
    _forest : fitted forest estimator
        Must expose ``n_outputs_``, ``n_estimators``, ``n_jobs``, ``verbose``,
        ``estimators_`` and ``classes_``.
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The input samples. Internally, it will be converted to the tree
        ``DTYPE`` and, if a sparse matrix is provided, to a sparse
        ``csr_matrix``.

    Returns
    -------
    y : array of shape = [n_samples] or [n_samples, n_outputs]
        The predicted classes.
    """
    check_is_fitted(_forest, 'n_outputs_')

    # Check data
    X = check_array(X, dtype=DTYPE, accept_sparse="csr")

    # Assign chunk of trees to jobs.  Everything is read from the ``_forest``
    # argument; the previous version silently used the global ``forest``.
    n_jobs, _, _ = _partition_estimators(_forest.n_estimators,
                                         _forest.n_jobs)

    # Parallel loop: one ``predict`` call per tree.
    all_preds = Parallel(n_jobs=n_jobs, verbose=_forest.verbose,
                         backend="threading")(
        delayed(_parallel_helper)(e, 'predict', X, check_input=False)
        for e in _forest.estimators_)
    all_preds = np.asarray(all_preds)

    # Reduce: most frequent prediction per sample across trees.
    modes, _ = mode(all_preds, axis=0)
    # Depending on the SciPy version the reduced axis is kept (length 1) or
    # dropped; normalise to the per-sample shape so indexing below is stable.
    modes = np.asarray(modes).astype(int).reshape(all_preds.shape[1:])

    if _forest.n_outputs_ == 1:
        # The voted values are used as indices into the forest's class labels.
        return _forest.classes_.take(modes, axis=0)

    n_samples = all_preds.shape[1]
    preds = np.zeros((n_samples, _forest.n_outputs_),
                     dtype=_forest.classes_.dtype)
    for k in range(_forest.n_outputs_):
        preds[:, k] = _forest.classes_[k].take(modes[:, k], axis=0)
    return preds

#### Train

In [227]:
# Switch for the training cell below: True fits a new forest, False loads a pickled one.
train_new = True

In [230]:
if not train_new:
    # The load_model helper is only defined in the "Store Model" section at
    # the end of the notebook, so calling it here fails on a top-to-bottom
    # run; read the pickle directly instead.
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    import pickle

    with open('8_5/forest_large.pickle', 'rb') as f:
        forest = pickle.load(f)
    Ntree = len(forest.estimators_)
else:
    Ntree = 8
    forest = RandomForestClassifier(max_depth=5, n_estimators=Ntree, max_features=None)
    # Mean accuracy over 10 cross-validation folds, then fit on all training data
    # and print the accuracy on that same data.
    scores = cross_val_score(forest, train_X, train_Y, cv=10, scoring="accuracy")
    print(scores.mean())
    forest.fit(train_X, train_Y)
    print(forest.score(train_X, train_Y))

0.8607698880108213
0.896097656777943


#### Predict by probability (soft voting)

In [219]:
# Evaluate the forest's built-in predict() on the held-out file.
y = forest.predict(test_X)
_weighted = dict(average="weighted")
_report = [
    ("Accuracy: %1.3f", accuracy_score(test_Y, y)),
    ("Precision: %1.3f", precision_score(test_Y, y, **_weighted)),
    ("Recall: %1.3f", recall_score(test_Y, y, **_weighted)),
    ("F1: %1.3f\n", f1_score(test_Y, y, **_weighted)),
]
for _template, _score in _report:
    print(_template % _score)

Accuracy: 0.807
Precision: 0.822
Recall: 0.807
F1: 0.783



In [232]:
if train_new:
    # The store_model helper is only defined in the "Store Model" section at
    # the end of the notebook, so calling it here raises NameError on a
    # top-to-bottom run; write the pickle directly instead.
    import pickle

    with open('8_5/forest_8_5', 'wb') as f:
        pickle.dump(forest, f)

#### Predict by majority vote (hard voting)

In [231]:
# Evaluate the majority-vote prediction on the held-out file.
# zero_division=0 keeps the per-class score at 0 when a class is never
# predicted (the default behaviour) but does not emit the
# UndefinedMetricWarning that previously interrupted the printed output.
y_ = predict_majvote(forest, test_X)
print("Accuracy: %1.3f" % accuracy_score(test_Y, y_))
print("Precision: %1.3f" % precision_score(test_Y, y_, average="weighted", zero_division=0))
print("Recall: %1.3f" % recall_score(test_Y, y_, average="weighted", zero_division=0))
print("F1: %1.3f\n" % f1_score(test_Y, y_, average="weighted", zero_division=0))

Accuracy: 0.723
Precision: 0.732


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.723
F1: 0.663



## Output

In [221]:
def tree_2_sw_configuration(isforest, model, thres_file, features=None):
    """Turn fitted tree(s) into a lookup table keyed by per-feature bin indices.

    For every feature the distinct split thresholds used by the trees are
    collected and sorted; ``k`` thresholds define bin indices ``0..k``.  A
    left branch at the threshold in sorted position ``p`` keeps bins
    ``0..p``; a right branch keeps bins ``p+1..k``.  Each leaf therefore maps
    to one contiguous bin range per feature, and every combination of bins
    inside those ranges is assigned the leaf's majority class.

    Parameters
    ----------
    isforest : bool
        True when ``model`` is a sequence of fitted trees and all of them are
        used.  False to use a single tree (``model`` itself if it has a
        ``tree_`` attribute, otherwise ``model[0]``).
    model : sequence of fitted trees, or a single fitted tree
        Each tree must expose ``tree_`` with ``children_left``,
        ``children_right``, ``threshold``, ``feature`` and ``value``.
    thres_file : str
        Path of the text file that receives one ``name=[thresholds]`` line
        per feature that is actually used in a split.
    features : sequence of str, optional
        Feature names in the column order the trees were trained on.
        Defaults to the notebook-level ``selected_features``.

    Returns
    -------
    collections.defaultdict
        Maps a tuple of bin indices (one per entry of ``features``) to a list
        with one class index per tree; entries are initialised to ``-1``.

    Notes
    -----
    The table holds the cartesian product of the bins of all features, so its
    size grows multiplicatively with the number of distinct thresholds.
    """
    if features is None:
        features = selected_features
    features = list(features)

    # Number of trees now follows the model itself rather than a global.
    if isforest:
        trees = list(model)
    elif hasattr(model, "tree_"):
        trees = [model]
    else:
        trees = [model[0]]
    n_trees = len(trees)

    # Pass 1: collect the thresholds of all internal nodes per feature.
    # Internal nodes are identified by having a left child (leaves store -1).
    feature_lists = defaultdict(list)
    for est in trees:
        tree = est.tree_
        for node_id, feat_idx in enumerate(tree.feature):
            if tree.children_left[node_id] != -1:
                # Plain floats keep the file format independent of numpy's repr.
                feature_lists[features[feat_idx]].append(float(tree.threshold[node_id]))

    for name in feature_lists:
        feature_lists[name] = sorted(set(feature_lists[name]))

    with open(thres_file, 'w') as f:
        for name in feature_lists:
            f.write(name + '=' + str(feature_lists[name]) + '\n')

    # Pass 2: for every leaf, intersect the bin ranges implied by its path.
    rslt = defaultdict(list)
    for n, est in enumerate(trees):
        tree = est.tree_
        left = tree.children_left
        right = tree.children_right

        # child id -> (parent id, side), built once per tree.
        parent_of = {}
        for node_id in range(len(left)):
            if left[node_id] != -1:
                parent_of[int(left[node_id])] = (node_id, "l")
                parent_of[int(right[node_id])] = (node_id, "r")

        for leaf in np.flatnonzero(left == -1):
            leaf = int(leaf)
            # Inclusive [low, high] bin range per feature; starts unconstrained.
            bounds = {
                name: [0, len(feature_lists.get(name, []))] for name in features
            }

            # Walk up to the root (the only node without a parent entry).
            node_id = leaf
            while node_id in parent_of:
                parent, side = parent_of[node_id]
                name = features[tree.feature[parent]]
                pos = feature_lists[name].index(float(tree.threshold[parent]))
                if side == "l":
                    bounds[name][1] = min(bounds[name][1], pos)
                else:
                    bounds[name][0] = max(bounds[name][0], pos + 1)
                node_id = parent

            rl = [list(range(bounds[name][0], bounds[name][1] + 1)) for name in features]

            # Majority class at the leaf (first maximum wins on ties).
            ind = int(np.argmax(tree.value[leaf][0]))

            for combination in itertools.product(*rl):
                if combination not in rslt:
                    rslt[combination] = [-1 for _ in range(n_trees)]
                rslt[combination][n] = ind

    return rslt

In [233]:
# Build the bin-combination -> per-tree class table from the fitted forest;
# the split thresholds are written to '8_5/thres' as a side effect.
cfg = tree_2_sw_configuration(True, forest.estimators_,'8_5/thres')

In [234]:
# One row per table entry: the bin index of every selected feature followed
# by the class chosen by each tree.
output = [tuple(list(bins) + votes) for bins, votes in cfg.items()]
hdrs = selected_features + ["action" + str(tree_no) for tree_no in range(Ntree)]

In [235]:
# Write the table in chunks of `num` rows, one CSV file per chunk, adding a
# "max_freq" column with the most frequent per-tree action of each row.
num = 100000
# Ceiling division; at least one iteration so output_df is always defined.
iters = max(1, -(-len(output) // num))
action_cols = [("action" + str(k)) for k in range(Ntree)]
for i in range(iters):
    left = i * num
    # Slice ends are exclusive, so no "- 1" here: the previous bound dropped
    # the last row of every full chunk.
    right = min((i + 1) * num, len(output))
    print(left, right)
    output_df = pd.DataFrame(output[left:right], columns=hdrs)
    # Series.mode() is sorted ascending, so .iloc[0] is the single mode or,
    # on ties, the smallest of the tied values.
    output_df["max_freq"] = output_df[action_cols].agg(
        lambda votes: votes.mode().iloc[0], axis=1
    )
    output_df.to_csv('8_5/result/' + str(i) + '.csv',index=False)

0 66240


In [236]:
# Print the set of distinct values found in every column of the last chunk.
for column in output_df.columns:
    distinct = set(output_df[column])
    print(column, distinct)

tcp_dstport {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
tcp_srcport {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}
udp_srcport {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
ip_flags {0, 1, 2}
tcp_flags {0, 1, 2, 3}
action0 {0, 2, 3, 4}
action1 {0, 2, 3, 4}
action2 {0, 2, 3, 4}
action3 {0, 2, 3, 4}
action4 {0, 2, 3, 4}
action5 {0, 2, 3, 4}
action6 {0, 2, 3, 4}
action7 {0, 2, 3, 4}
max_freq {0, 2, 3, 4}


In [237]:
# Sanity check: product of the number of distinct bin indices per feature
# printed above; it should equal the number of rows in the table.
10*23*24*3*4

66240

In [25]:
# Class distribution of the true labels in the test set.
print(test_Y.value_counts())

4    435365
3    163246
2    118184
0     76045
1      6395
Name: class, dtype: int64


In [26]:
# Class distribution of the majority-vote predictions (y_).
print(pd.DataFrame(y_)[0].value_counts())

4    602142
3    132193
0     57732
2      7168
Name: 0, dtype: int64


In [27]:
# Distribution of the voted action ("max_freq") over the rows of output_df.
print(output_df["max_freq"].value_counts())

4    29913
3    16233
2     8118
0     1986
Name: max_freq, dtype: int64


In [28]:
# NOTE(review): output_df is the frame left over from the chunk loop, so this
# file holds only the last chunk - confirm that is intended if the table ever
# exceeds one chunk.
output_df.to_csv('output.csv',index=False)

## Store Model

In [206]:
import pickle 

In [208]:
def store_model(path, model):
    """Pickle ``model`` into the file at ``path`` (opened in binary write mode)."""
    with open(path, 'wb') as sink:
        pickle.Pickler(sink).dump(model)
        
def load_model(path):
    """Return the object unpickled from the file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    with open(path, 'rb') as source:
        return pickle.load(source)

In [209]:
# Number of fitted trees in the forest.
print(len(forest.estimators_))

8
