In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from idtree import compute_alphas, feature_ranking

import os

### Feature selection

In [2]:
# Columns excluded from the feature ranking. 'Attack' is the target column
# used as labels in the following cells; the remaining entries are simply
# never offered to the ranking as candidate features.
black_list = [
    'Attack',
    'DNS_QUERY_ID',
    'Dataset',
    'IPV4_DST_ADDR',
    'IPV4_DST_ADDR_INT',
    'IPV4_SRC_ADDR',
    'IPV4_SRC_ADDR_INT',
    'Label',
]

df = pd.read_parquet('../data/netflow_sample.parquet')

# Candidate features: every column of the sample that is not black-listed,
# kept in the original column order.
ranking_features = [column for column in df.columns if column not in black_list]

In [3]:
file_name = '../search/ranking.pkl'

if os.path.exists(file_name):
    # Cache hit: reuse the stored ranking instead of recomputing it.
    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open(file_name, 'rb') as file:
        ranking_result = pickle.load(file)
else:
    # Cache miss: rank all candidate features against the 'Attack' labels.
    df = pd.read_parquet('../data/netflow_sample.parquet')
    data = df[ranking_features]
    labels = df['Attack']
    ranking_result = feature_ranking(data, labels, train_size=0.8, shuffle=True, random_state=35)

    # Results of the ranking (including accuracy per feature set) are stored in the search folder.
    with open(file_name, 'wb') as file:
        pickle.dump(ranking_result, file)

# Entry 11 of the ranked feature sets; manually determined by inspection of
# the feature importances curve.
selected_features = ranking_result["Set"][11]
selected_features

Round 1 / 42
Round 2 / 42
Round 3 / 42
Round 4 / 42
Round 5 / 42
Round 6 / 42
Round 7 / 42
Round 8 / 42
Round 9 / 42
Round 10 / 42
Round 11 / 42
Round 12 / 42
Round 13 / 42
Round 14 / 42
Round 15 / 42
Round 16 / 42
Round 17 / 42
Round 18 / 42
Round 19 / 42
Round 20 / 42
Round 21 / 42
Round 22 / 42
Round 23 / 42
Round 24 / 42
Round 25 / 42
Round 26 / 42
Round 27 / 42
Round 28 / 42
Round 29 / 42
Round 30 / 42
Round 31 / 42
Round 32 / 42
Round 33 / 42
Round 34 / 42
Round 35 / 42
Round 36 / 42
Round 37 / 42
Round 38 / 42
Round 39 / 42
Round 40 / 42
Round 41 / 42
Round 42 / 42
Adding Sets.
Adding Scores


['L7_PROTO',
 'IPV4_SRC_ADDR_EX',
 'MIN_TTL',
 'MAX_TTL',
 'TCP_FLAGS',
 'CLIENT_TCP_FLAGS',
 'L4_DST_PORT',
 'IPV4_DST_ADDR_EX',
 'TCP_WIN_MAX_IN',
 'SRC_TO_DST_SECOND_BYTES',
 'IN_BYTES',
 'OUT_BYTES']

### Hyperparameter search

In [5]:
file_name = '../search/params_idtree.pkl'

if os.path.isfile(file_name):
    # Cache hit: reuse the fitted search object.
    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open(file_name, 'rb') as file:
        search = pickle.load(file)
else:
    df = pd.read_parquet('../data/netflow_sample.parquet')
    data = df[selected_features]
    labels = df['Attack']

    # 80/20 split with a fixed seed; only the training part is used by the search in this cell.
    X_train, X_test, y_train, y_test = train_test_split(data, labels, train_size=0.8, shuffle=True, random_state=35)

    # Candidate values shared by all three tuned parameters.
    ranges = [1e-06, 5e-06, 1e-05, 5e-05, 1e-04, 5e-04, 1e-03, 5e-03, 1e-02, 5e-02, 1e-01]

    # Every tuned parameter draws from the same list of candidate values.
    parameters = dict.fromkeys(
        ['min_samples_leaf', 'min_samples_split', 'min_impurity_decrease'],
        ranges,
    )

    model = DecisionTreeClassifier(random_state=35)

    # 10-fold stratified cross-validation with a fixed shuffle seed.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=35)

    # Randomized search over 20 sampled parameter combinations, scored by accuracy.
    search = RandomizedSearchCV(
        model,
        parameters,
        scoring='accuracy',
        n_iter=20,
        n_jobs=6,
        cv=skf,
        verbose=10,
        random_state=35,
    )
    search.fit(X_train, y_train)

    # Persist the whole fitted search object so later runs can skip the search.
    with open(file_name, 'wb') as file:
        pickle.dump(search, file)

print(f"Best parameters for idtree: {search.best_params_}")

Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best parameters for idtree: {'min_samples_split': 0.0005, 'min_samples_leaf': 5e-05, 'min_impurity_decrease': 1e-05}


In [6]:
file_name = '../search/alphas.pkl'

if os.path.isfile(file_name):
    # Cache hit: reuse the stored result.
    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open(file_name, 'rb') as file:
        alphas = pickle.load(file)
else:
    df = pd.read_parquet('../data/netflow_sample.parquet')
    data = df[selected_features]
    labels = df['Attack']

    # Same seeded 80/20 split as the hyperparameter search cell.
    X_train, X_test, y_train, y_test = train_test_split(data, labels, train_size=0.8, shuffle=True, random_state=35)

    # Use the previously computed hyperparameters from the randomized search.
    model = DecisionTreeClassifier(random_state=35, **search.best_params_)
    model = model.fit(X_train, y_train)

    # compute_alphas comes from the project-local idtree module; its result
    # exposes `best_params_` (presumably a fitted search over pruning alphas;
    # TODO confirm against idtree).
    alphas = compute_alphas(model, X_train, y_train)

    with open(file_name, 'wb') as file:
        pickle.dump(alphas, file)

print(f"Best alpha for idtree: {alphas.best_params_}")

Computing 345 alphas.
Fitting 10 folds for each of 345 candidates, totalling 3450 fits
[CV 1/10; 1/20] START min_impurity_decrease=0.0005, min_samples_leaf=1e-06, min_samples_split=1e-06
[CV 1/10; 1/20] END min_impurity_decrease=0.0005, min_samples_leaf=1e-06, min_samples_split=1e-06;, score=0.904 total time=   1.5s
[CV 9/10; 1/20] START min_impurity_decrease=0.0005, min_samples_leaf=1e-06, min_samples_split=1e-06
[CV 9/10; 1/20] END min_impurity_decrease=0.0005, min_samples_leaf=1e-06, min_samples_split=1e-06;, score=0.903 total time=   1.5s
[CV 8/10; 2/20] START min_impurity_decrease=0.01, min_samples_leaf=1e-06, min_samples_split=5e-05
[CV 8/10; 2/20] END min_impurity_decrease=0.01, min_samples_leaf=1e-06, min_samples_split=5e-05;, score=0.737 total time=   1.2s
[CV 4/10; 3/20] START min_impurity_decrease=0.005, min_samples_leaf=0.01, min_samples_split=5e-06
[CV 4/10; 3/20] END min_impurity_decrease=0.005, min_samples_leaf=0.01, min_samples_split=5e-06;, score=0.781 total time=   1.

### Training

In [16]:
file_name = '../model/idtree_default.pkl'

# Always rebuild the seeded 80/20 split, regardless of whether the model is
# cached: the evaluation at the bottom of this cell needs X_test / y_test, and
# relying on another cell having defined them breaks on a fresh kernel when
# the cached model already exists.
df = pd.read_parquet('../data/netflow_sample.parquet')
data = df[selected_features]
labels = df['Attack']

X_train, X_test, y_train, y_test = train_test_split(data, labels, random_state=35, shuffle=True, train_size=0.8)

if not os.path.isfile(file_name):
    # Baseline: decision tree with default hyperparameters (gini criterion).
    model = DecisionTreeClassifier(random_state=35, criterion="gini")
    model = model.fit(X_train, y_train)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)

else:
    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

print(f"Mean accuracy of default idtree: {model.score(X_test, y_test):.3f}.")

Mean accuracy of default idtree: 0.940.


In [14]:
file_name = '../model/idtree.pkl'

# Always rebuild the seeded 80/20 split, regardless of whether the model is
# cached: the evaluation at the bottom of this cell needs X_test / y_test, and
# relying on another cell having defined them breaks on a fresh kernel when
# the cached model already exists.
df = pd.read_parquet('../data/netflow_sample.parquet')
data = df[selected_features]
labels = df['Attack']

X_train, X_test, y_train, y_test = train_test_split(data, labels, random_state=35, shuffle=True, train_size=0.8)

if not os.path.isfile(file_name):
    # Copy the dict so that merging the alpha result does not mutate
    # search.best_params_ in place (other cells read it as the plain
    # randomized-search result).
    hyperparameters = dict(search.best_params_)
    for key, value in alphas.best_params_.items():
        print(key, value)
        hyperparameters[key] = value

    # Final model: randomized-search hyperparameters combined with the best alpha parameters.
    model = DecisionTreeClassifier(random_state=35, **hyperparameters)
    model = model.fit(X_train, y_train)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
else:
    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

print(f"Mean accuracy of optimized idtree: {model.score(X_test, y_test):.3f}.")

Mean accuracy of optimized idtree: 0.931.


### Validation

In [None]:
# NOTE(review): this cell has never been executed. It relies on names that are
# not defined or imported anywhere in the visible notebook: Identifier,
# LabelEncoder, rename_attack, rename_attack_fhswfcnl, data_set, load_model,
# confusion_matrix and classification_report. They must be imported/defined
# before this cell can run.
# NOTE(review): the paths here are relative to "data/", while the other cells
# use "../data/" -- confirm the intended working directory.
id_ = Identifier()
id_.title = "netflow_sample"
df = pd.read_parquet(f"data/{id_.title}.parquet")

# for xgboost
le = LabelEncoder().fit(df["Attack"])

# Full, sorted list of (renamed) attack classes; used as the confusion-matrix axes.
all_attacks = sorted([rename_attack(x) for x in df["Attack"].unique()])

X_true = y_true = None

# NOTE(review): the original lines had lost their indentation (a syntax error)
# and the external-CSV assignments unconditionally overwrote the test split.
# Reconstructed as if/else -- confirm that this is the intended branching.
if data_set == "test":
    # Held-out 20% of the sample, using the same seeded split as the training cells.
    _, X_true, _, y_true = train_test_split(df, df["Attack"], random_state=35, shuffle=True, train_size=0.8)
    y_true = [rename_attack(x) for x in y_true]
else:
    # Separately labeled data set read from CSV.
    X_true = pd.read_csv("data/transformation/fhswfcnl_labeled.csv")
    y_true = X_true["Label"].apply(rename_attack_fhswfcnl)


#print_info(f"Validating {id_} on {data_set}")
model = load_model(id_)
labels = sorted(set(y_true))
# Restrict the input to the columns the model was fitted on.
y_pred = model.predict(X_true[model.feature_names_in_])

# NOTE(review): only id_.title is set above; id_.name is presumably provided
# by Identifier -- confirm.
if id_.name == "xgboost":
    y_pred = le.inverse_transform(y_pred)
y_pred = [rename_attack(x) for x in y_pred]

# Raw (non-normalized) counts over all attack classes of the sample.
matrix = confusion_matrix(y_true, y_pred, labels=all_attacks, normalize=None)
#print(matrix)
#save_matrix((all_attacks, matrix))

# The report is restricted to the classes that actually occur in y_true.
report = classification_report(y_true, y_pred, labels=labels, output_dict=True, zero_division=0)
#save_report(report)
#print(report)