In [1]:
import argparse
import itertools
from collections import defaultdict

import numpy as np
import pandas as pd
import pydotplus
from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text

from sklearn.ensemble import RandomForestClassifier

from scipy.stats import mode
from sklearn.ensemble.forest import _partition_estimators, _accumulate_prediction
from sklearn.tree._tree import DTYPE
from sklearn.externals.joblib import Parallel, delayed
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
import threading
from sklearn.utils.fixes import _joblib_parallel_args

import pickle



In [2]:
def store_model(path, model):
    with open(path, 'wb') as f:
        pickle.dump(model, f)
        
def load_model(path):
    with open(path, 'rb') as f:
        model = pickle.load(f)
    return model

In [3]:
def _parallel_helper(obj, methodname, *args, **kwargs):
    return getattr(obj, methodname)(*args, **kwargs)

def predict_majvote(_forest, X):
    """Predict class for X.

    Uses majority voting, rather than the soft voting scheme
    used by RandomForestClassifier.predict.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csr_matrix``.
    Returns
    -------
    y : array of shape = [n_samples] or [n_samples, n_outputs]
        The predicted classes.
    """
    check_is_fitted(_forest, 'n_outputs_')

    # Check data
    X = check_array(X, dtype=DTYPE, accept_sparse="csr")

    # Assign chunk of trees to jobs
    n_jobs, n_trees, starts = _partition_estimators(forest.n_estimators,
                                                    forest.n_jobs)

    # Parallel loop    
    all_preds = Parallel(n_jobs=n_jobs, verbose=forest.verbose,
                         backend="threading")(
        delayed(_parallel_helper)(e, 'predict', X, check_input=False)
        for e in forest.estimators_)

    
    # Reduce
    
    modes, counts = mode(all_preds, axis=0)
    modes = modes.astype(int)
    
    if _forest.n_outputs_ == 1:
        return _forest.classes_.take(modes[0], axis=0)
    else:
        n_samples = all_preds[0].shape[0]
        preds = np.zeros((n_samples, _forest.n_outputs_),
                         dtype=_forest.classes_.dtype)
        for k in range(_forest.n_outputs_):
            preds[:, k] = _forest.classes_[k].take(modes[:, k], axis=0)
        return preds

In [4]:
# modify to your input file
test_file = "16-09-24-labeled.csv"

In [5]:
# modify to your features
feature_names = [
    "frame_len",
    "eth_type",
    "ip_proto",
    "ip_flags",
    "ipv6_nxt",
    "ipv6_opt",
    "tcp_srcport",
    "tcp_dstport",
    "tcp_flags",
    "udp_srcport",
    "udp_dstport",
    "class",
]

In [6]:
test_df = pd.read_csv(test_file, names=feature_names)

In [7]:
def obj2int(df):
    df["eth_type"] = df["eth_type"].apply(int, base=16)
    df["ip_flags"] = df["ip_flags"].apply(int, base=16)
    df["tcp_flags"] = df["tcp_flags"].apply(int, base=16)

obj2int(test_df)

In [42]:
selected_feature = ['tcp_dstport', 'tcp_srcport', 'udp_srcport', 'ip_flags', 'tcp_flags']
df = test_df[selected_feature]

In [12]:
forest = load_model('8_5/forest_8_5.pickle')
Ntree = len(forest.estimators_)

In [13]:
test_X = df
y_ = predict_majvote(forest, test_X)

In [58]:
df['pred'] = y_
df['orgin_idx'] = list(df.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [55]:
df['pred'].value_counts()

4    602083
3    132260
0     57724
2      7168
Name: pred, dtype: int64

In [56]:
# modify to the class sw predict
sw_pred = 0
display(df[df['pred']!=sw_pred])

Unnamed: 0,tcp_dstport,tcp_srcport,udp_srcport,ip_flags,tcp_flags,pred,orgin_idx
0,-1,-1,-1,16384,-1,4,0
1,-1,-1,-1,16384,-1,4,1
2,-1,-1,-1,16384,-1,4,2
3,-1,-1,-1,-1,-1,4,3
4,-1,-1,-1,-1,-1,4,4
...,...,...,...,...,...,...,...
799230,5228,58685,-1,16384,24,4,799230
799231,-1,-1,-1,0,-1,4,799231
799232,60757,443,-1,16384,16,4,799232
799233,1935,47747,-1,16384,16,4,799233
