In [5]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from idtree import compute_alphas, feature_ranking

import os
import pandas as pd
import pickle

### Feature selection

In [6]:
# Columns that are never offered to the feature ranking.
# 'Attack' and 'Label' are the target columns used further down in this
# notebook; the remaining entries look like identifiers / metadata
# (assumption based on the column names only -- confirm against the dataset).
black_list = [
    'Attack',
    'DNS_QUERY_ID',
    'Dataset',
    'IPV4_DST_ADDR',
    'IPV4_DST_ADDR_INT',
    'IPV4_SRC_ADDR',
    'IPV4_SRC_ADDR_INT',
    'Label',
]

df = pd.read_parquet('../data/netflow_sample.parquet')

# Candidate features: every column of the frame, in its original order,
# that is not on the black list.
ranking_features = [column for column in df.columns if column not in black_list]

In [7]:
file_name = '../search/ranking.pkl'

# The feature ranking is cached on disk: it is computed only when the cache
# file is missing, otherwise the pickled result is loaded.
# (os.path.isfile is used for consistency with the other cache cells.)
if not os.path.isfile(file_name):
    # `df` was already loaded from the same parquet file in the cell above,
    # so it is reused here instead of reading the file a second time.
    data = df[ranking_features]
    # NOTE(review): the ranking is computed against 'Attack', while the models
    # further down are trained on 'Label' -- confirm that this is intended.
    labels = df['Attack']
    ranking_result = feature_ranking(data, labels, random_state=35, shuffle=True, train_size=0.8)
    with open(file_name, 'wb') as file:
        # The ranking results (including the accuracy per feature set) are
        # stored in the `search` folder.
        pickle.dump(ranking_result, file)
else:
    # NOTE: unpickling can execute arbitrary code -- only load cache files
    # that were produced by this notebook.
    with open(file_name, 'rb') as file:
        ranking_result = pickle.load(file)

selected_features = ranking_result["Set"][11] # manually determined by inspection of feature importances curve
selected_features

['L7_PROTO',
 'IPV4_SRC_ADDR_EX',
 'MIN_TTL',
 'MAX_TTL',
 'TCP_FLAGS',
 'CLIENT_TCP_FLAGS',
 'L4_DST_PORT',
 'IPV4_DST_ADDR_EX',
 'TCP_WIN_MAX_IN',
 'SRC_TO_DST_SECOND_BYTES',
 'IN_BYTES',
 'OUT_BYTES']

### Data Preparation

In [8]:
# Reload the sample and keep only the features chosen by the ranking step.
df = pd.read_parquet('../data/netflow_sample.parquet')
data = df.loc[:, selected_features]
labels = df['Label']

# Sorted list of the distinct target values; used later as the fixed label
# order for the confusion matrix and the classification report.
unique_labels = list(labels.unique())
unique_labels.sort()

# Seeded, shuffled 80/20 split so that every later cell sees the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    train_size=0.8,
    shuffle=True,
    random_state=35,
)

### Hyperparameter search

In [9]:
file_name = '../search/params_idtree.pkl'

# Load the fitted search from the cache when it exists; otherwise run the
# randomized search once and pickle the fitted search object.
if os.path.isfile(file_name):
    with open(file_name, 'rb') as file:
        search = pickle.load(file)
else:
    # One shared list of candidate values (1e-6 ... 1e-1) for all three
    # parameters. scikit-learn interprets float values of min_samples_leaf /
    # min_samples_split as fractions of the number of samples.
    ranges = [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1]

    parameters = dict.fromkeys(
        ('min_samples_leaf', 'min_samples_split', 'min_impurity_decrease'),
        ranges,
    )

    model = DecisionTreeClassifier(random_state=35)

    # Stratified, shuffled 10-fold cross-validation with a fixed seed.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=35)

    # 20 random parameter combinations, scored by accuracy.
    search = RandomizedSearchCV(
        model,
        parameters,
        scoring='accuracy',
        n_iter=20,
        n_jobs=6,
        cv=skf,
        verbose=10,
        random_state=35,
    )
    search.fit(X_train, y_train)

    with open(file_name, 'wb') as file:
        pickle.dump(search, file)

print(f"Best parameters for idtree: {search.best_params_}")

Best parameters for idtree: {'min_samples_split': 0.0005, 'min_samples_leaf': 5e-05, 'min_impurity_decrease': 1e-05}


In [10]:
file_name = '../search/alphas.pkl'

# Cached result is preferred; the computation only runs on a cache miss.
if os.path.isfile(file_name):
    with open(file_name, 'rb') as file:
        alphas = pickle.load(file)
else:
    # Fit a tree with the hyperparameters found by the randomized search and
    # hand the fitted tree to compute_alphas.
    # (compute_alphas is a project helper; judging by the printed result it
    # yields an object with best_params_ containing 'ccp_alpha' -- its exact
    # procedure is not visible here.)
    model = DecisionTreeClassifier(random_state=35, **search.best_params_).fit(X_train, y_train)
    alphas = compute_alphas(model, X_train, y_train)

    with open(file_name, 'wb') as file:
        pickle.dump(alphas, file)

print(f"Best alpha for idtree: {alphas.best_params_}")

Best alpha for idtree: {'ccp_alpha': np.float64(1.2257673346055943e-05)}


See scikit-learn's notes on the security and maintainability limitations of persisted (pickled) models:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### Training

In [11]:
file_name = '../model/idtree_default.pkl'

# Baseline: a decision tree with scikit-learn's default settings (gini
# criterion spelled out explicitly), cached on disk after the first fit.
if os.path.isfile(file_name):
    with open(file_name, 'rb') as file:
        model = pickle.load(file)
else:
    model = DecisionTreeClassifier(random_state=35, criterion="gini").fit(X_train, y_train)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)

# Accuracy on the held-out 20 % split.
print(f"Mean accuracy of default idtree: {model.score(X_test, y_test):.3f}.")

Mean accuracy of default idtree: 0.940.


In [12]:
file_name = '../model/idtree.pkl'

# Final model: hyperparameters from the randomized search combined with the
# pruning parameter(s) from the alpha computation. Cached on disk after the
# first fit.
if not os.path.isfile(file_name):
    # Build a NEW dict. Binding search.best_params_ to a name and writing into
    # it would modify the fitted search object itself, because both names
    # would refer to the same dict.
    hyperparameters = {**search.best_params_, **alphas.best_params_}

    # X_train / y_train come from the data preparation cell. That split is
    # seeded, so splitting again here would only reproduce the same partition.
    model = DecisionTreeClassifier(random_state=35, **hyperparameters)
    model = model.fit(X_train, y_train)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
else:
    # NOTE: unpickling can execute arbitrary code -- only load model files
    # that were produced by this notebook.
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

print(f"Mean accuracy of optimized idtree: {model.score(X_test, y_test):.3f}.")

Mean accuracy of optimized idtree: 0.931.


### Validation

In [13]:
file_name = '../scores/idtree_test.pkl'

# Score the optimized tree on the held-out test split; the scores are cached.
if not os.path.isfile(file_name):
    X_true, y_true = X_test, y_test
    # Select the columns by the names the model was fitted with.
    y_pred = model.predict(X_true[model.feature_names_in_])

    # Compute the metrics BEFORE opening the cache file. If the file were
    # opened first and a metric call failed, an empty file would be left
    # behind and the next run would try to unpickle it.
    score_dict = {
        'matrix': confusion_matrix(y_true, y_pred, labels=unique_labels, normalize=None),
        'report': classification_report(y_true, y_pred, labels=unique_labels, output_dict=True, zero_division=0),
    }
    with open(file_name, 'wb') as file:
        pickle.dump(score_dict, file)
else:
    with open(file_name, 'rb') as file:
        score_dict = pickle.load(file)

# The scored model is the optimized idtree decision tree.
print(f"Accuracy of optimized idtree on test data: {score_dict['report']['accuracy']:.3f}")
# Per-class F1 table, displayed as the cell's output.
pd.DataFrame({
    'Attack':unique_labels,
    'F1-Score':[score_dict['report'][label]['f1-score'] for label in unique_labels],
})

Accuracy of optimized xgboost on test data: 0.931


Unnamed: 0,Attack,F1-Score
0,Benign,0.881922
1,Bot,0.999975
2,DDoS,0.953391
3,DoS,0.949906
4,Infiltration,0.927414
5,Injection,0.86608
6,Password,0.93852
7,Scanning,0.943304
8,XSS,0.917903


In [14]:
file_name = '../scores/idtree_fhswfcnl.pkl'

# Score the optimized tree on a separately stored, labeled CSV file; the
# scores are cached.
if not os.path.isfile(file_name):
    X_true = pd.read_csv('../data/fhswfcnl_labeled.csv')
    y_true = X_true['Label']

    # Select the columns by the names the model was fitted with; this also
    # leaves out 'Label' and any other column the model was not trained on.
    y_pred = model.predict(X_true[model.feature_names_in_])

    # Compute the metrics BEFORE opening the cache file. If the file were
    # opened first and a metric call failed, an empty file would be left
    # behind and the next run would try to unpickle it.
    # labels=unique_labels keeps the class order of the training data;
    # zero_division=0 makes undefined metrics come out as 0.
    score_dict = {
        'matrix': confusion_matrix(y_true, y_pred, labels=unique_labels, normalize=None),
        'report': classification_report(y_true, y_pred, labels=unique_labels, output_dict=True, zero_division=0),
    }
    with open(file_name, 'wb') as file:
        pickle.dump(score_dict, file)
else:
    with open(file_name, 'rb') as file:
        score_dict = pickle.load(file)

# The scored model is the optimized idtree decision tree.
print(f"Accuracy of optimized idtree on validation data: {score_dict['report']['accuracy']:.3f}")
# Per-class F1 table, displayed as the cell's output.
pd.DataFrame({
    'Attack':unique_labels,
    'F1-Score':[score_dict['report'][label]['f1-score'] for label in unique_labels],
})

Accuracy of optimized xgboost on validation data: 0.975


Unnamed: 0,Attack,F1-Score
0,Benign,0.720828
1,Bot,0.0
2,DDoS,0.0
3,DoS,0.0
4,Infiltration,0.391586
5,Injection,0.0
6,Password,0.999905
7,Scanning,0.972481
8,XSS,0.0
