# Feature Extraction

Flow aggregation by a time window

Generated features:
* NumSrcPorts
* NumDestAddr
* NumDestPorts
* NumFlows
* NumBytesSum
* NumBytesMean
* NumBytesVar
* NumPacketsSum
* NumPacketsMean
* NumPacketsVar

## Imports

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import glob
import time

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

import warnings

import matplotlib.pyplot as plt
import itertools

import pickle

warnings.filterwarnings("ignore")

## Functions

In [2]:
# Column names of the aggregated flow features used as model inputs.
# Order matters: the data-loading cell casts the first four entries to int
# and the remaining six to float.
features = ["NumSrcPorts", "NumDestAddr", "NumDestPorts", "NumFlows",
                   "NumBytesSum", "NumBytesMean", "NumBytesVar",
                   "NumPacketsSum", "NumPacketsMean", "NumPacketsVar"]

def calc_confusion_matrix(y_t, y_p, encoding, n_classes=13):
    """Build a confusion matrix and count correct and incorrect predictions.

    Parameters
    ----------
    y_t : sequence
        True labels. With ``encoding == 'ohe'`` every element must expose
        ``argmax()`` (e.g. a row of a 2-D numpy array) and the class index is
        that argmax. With any other encoding every element is an integer
        label and the class index is ``label - 1``.
    y_p : sequence
        Predicted labels in the same encoding as ``y_t``. Only the first
        ``len(y_t)`` entries are read.
    encoding : str
        ``'ohe'`` selects argmax decoding; any other value selects the
        ``label - 1`` decoding.
    n_classes : int, optional
        Side length of the square matrix. Defaults to 13, the size that used
        to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(confusion_matrix, hit, miss)``. ``confusion_matrix`` is an integer
        array indexed ``[true_class, predicted_class]``; ``hit`` and ``miss``
        are the numbers of matching and non-matching predictions.
    """
    confusion_matrix = np.zeros((n_classes, n_classes), dtype=int)

    hit = 0
    miss = 0
    one_hot = encoding == 'ohe'

    for i in range(len(y_t)):
        if one_hot:
            pred = y_p[i].argmax()
            truth = y_t[i].argmax()
        else:
            pred = y_p[i] - 1
            truth = y_t[i] - 1

        # Row = true class, column = predicted class; a correct prediction
        # therefore lands on the diagonal without needing a separate branch.
        confusion_matrix[truth, pred] += 1
        if pred == truth:
            hit += 1
        else:
            miss += 1

    return confusion_matrix, hit, miss

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Print a short banner and plot the confusion matrix as an annotated heatmap.

    Normalization can be applied by setting `normalize=True`; each row is then
    divided by its row sum. Rows whose sum is zero are left as zeros instead
    of producing NaN from a division by zero.

    Adapted from:
    http://scikit-learn.org/stable/auto_examples/
    model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix indexed ``[true, predicted]``.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, optional
        Plot row-normalized fractions instead of raw counts.
    title : str, optional
        Figure title.
    cmap : matplotlib colormap, optional
        Colormap used for the heatmap.
    """

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row sum; dividing by it
        # would turn the whole row (and the colour threshold below) into NaN.
        row_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so they are accounted for.
    plt.tight_layout()
    plt.show()

def load_pickle_data(scenario = -1):
    """Load a pickled feature object from the ``anomaly_cache2`` directory.

    Parameters
    ----------
    scenario : int, optional
        ``-1`` (the default) loads ``anomaly_cache2/anomaly_feat_list.p``.
        Any other value loads
        ``anomaly_cache2/anomaly_scenario_features_<scenario>.p``.

    Returns
    -------
    object
        Whatever object was stored in the selected pickle file.

    Raises
    ------
    OSError
        If the selected file cannot be opened (for example, it does not exist).
    """
    if scenario == -1:
        p_filename = "anomaly_cache2/anomaly_feat_list.p"
    else:
        p_filename = "anomaly_cache2/anomaly_scenario_features_" + str(scenario) + ".p"

    # NOTE(security): unpickling can execute arbitrary code, so this must only
    # be pointed at cache files produced by a trusted run of this project.
    # The context manager closes the handle even if unpickling fails.
    with open(p_filename, "rb") as cache_file:
        return pickle.load(cache_file)

## Load Data

In [3]:
feat_list = load_pickle_data()

# Stack the cached frames into one table with a fresh 0..n-1 index.
gen_feat_s_df = pd.concat(feat_list).reset_index(drop=True)

# IsBackground is the complement of the IsBotnet flag.
gen_feat_s_df['IsBackground'] = 1 - gen_feat_s_df['IsBotnet']

# The first four features are stored as int, the remaining six as float.
gen_feat_s_df[features[:4]] = gen_feat_s_df[features[:4]].astype(int)
gen_feat_s_df[features[4:]] = gen_feat_s_df[features[4:]].astype(float)

gen_feat_s_df.head()

Unnamed: 0,NumSrcPorts,NumDestAddr,NumDestPorts,NumFlows,NumBytesSum,NumBytesMean,NumBytesVar,NumPacketsSum,NumPacketsMean,NumPacketsVar,IsBotnet,Scenario,IsBackground
0,1,1,1,1,594.0,594.0,0.0,16.0,16.0,0.0,0,1,1
1,1,1,1,1,75.0,75.0,0.0,2.0,2.0,0.0,0,1,1
2,1,1,1,1,1567.0,1567.0,0.0,10.0,10.0,0.0,0,1,1
3,1,1,1,1,560.0,560.0,0.0,4.0,4.0,0.0,0,1,1
4,1,1,1,1,76.0,76.0,0.0,2.0,2.0,0.0,0,1,1


# Machine Learning Models

We split the data into 70% train and 30% test.

We have two different methods because scikit-learn's learning algorithms accept different kinds of inputs.

We want to feed the algorithms one-hot encoded outputs, where each scenario is represented by a separate feature that is either one or zero. Unfortunately, the Logistic Regression and SVM classifiers only accept integer-valued encoding, which creates a bias between classes.

## Generate test and train sets

In [49]:
# Integer Value Encoding for LR and SVM
def generate_train_test_ive():
    """Return a standardized 70/30 train/test split with integer labels.

    Reads the module-level ``gen_feat_s_df`` frame and ``features`` list.
    The target is the ``IsBackground`` column as a 1-D int array. A table with
    the number of samples per label value (0 and 1) in the train split, the
    test split and the full data is printed. The scaler is fitted on the
    training split only and then applied to both splits.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    samples = gen_feat_s_df[features].values
    labels = gen_feat_s_df['IsBackground'].values.astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        samples, labels, test_size=.3, random_state=25)

    # Feature Counts
    rule = "-" * 30
    print("Feature Counts\n")
    print("num\ttrain\ttest\ttotal")
    print(rule)
    for label in range(2):
        per_split = [np.count_nonzero(part == label) for part in (y_train, y_test, labels)]
        print("{0}\t{1}\t{2}\t{3}".format(label, *per_split))
    print(rule)
    print("total:\t{0}\t{1}\t{2}".format(len(y_train), len(y_test), len(labels)))

    # Standardize using statistics of the training split only.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
    print()

    return X_train, X_test, y_train, y_test

In [9]:
# Train all, test on all
# One-Hot Encoding for KNN and RF
def generate_train_test_ohe():
    """Return a standardized 70/30 train/test split with two-column labels.

    Reads the module-level ``gen_feat_s_df`` frame and ``features`` list.
    The target has two int columns, ``IsBackground`` (column 0) and
    ``IsBotnet`` (column 1). The scaler is fitted on the training split only.
    A table with the number of non-zero entries per label column in the train
    split, the test split and the full data is printed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    samples = gen_feat_s_df[features].values
    targets = gen_feat_s_df[['IsBackground', 'IsBotnet']].values.astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        samples, targets, test_size=.3, random_state=25)

    # Scale Data
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    # Feature Counts
    rule = "-" * 30
    print("Feature Counts\n")
    print("num\ttrain\ttest\ttotal")
    print(rule)
    for col in range(2):
        per_split = [np.count_nonzero(part[:, col]) for part in (y_train, y_test, targets)]
        print("{0}\t{1}\t{2}\t{3}".format(col, *per_split))
    print(rule)
    print("total:\t{0}\t{1}\t{2}".format(len(y_train), len(y_test), len(targets)))
    print()

    return X_train, X_test, y_train, y_test

In [10]:
#* Train: 3 4 5 7 10 11 12 13
#* Test:  1 2 6 8 9
def generate_train_test_ohe2(train_scenarios=(3, 4, 5, 7, 10, 11, 12, 13),
                             test_scenarios=(1, 2, 6, 8, 9)):
    """Build a scenario-based split: train on some scenarios, test on others.

    Reads the module-level ``gen_feat_s_df`` frame and ``features`` list.
    Rows whose ``Scenario`` is in ``train_scenarios`` form one training set.
    Each scenario in ``test_scenarios`` yields its own test set. The scaler is
    fitted on the training rows only and reused for every test set.

    Parameters
    ----------
    train_scenarios : iterable of int, optional
        Scenario ids used for training. Defaults to the previously hard-coded
        set (3, 4, 5, 7, 10, 11, 12, 13).
    test_scenarios : iterable of int, optional
        Scenario ids used for testing, in output order. Defaults to the
        previously hard-coded (1, 2, 6, 8, 9).

    Returns
    -------
    tuple
        ``(X_train, X_ts, y_train, y_ts)`` where ``X_ts`` and ``y_ts`` are
        lists holding one array per test scenario, in ``test_scenarios``
        order. Labels are int arrays with columns
        ``IsBackground, IsBotnet``.
    """
    label_cols = ['IsBackground', 'IsBotnet']

    train_df = gen_feat_s_df[gen_feat_s_df['Scenario'].isin(list(train_scenarios))]
    X_train = train_df[features].values
    y_train = train_df[label_cols].values.astype(int)

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    X_ts = []
    y_ts = []
    for scenario in test_scenarios:
        # Filter once per scenario and reuse the subset for features and labels.
        scenario_df = gen_feat_s_df[gen_feat_s_df['Scenario'] == scenario]
        X_ts.append(scaler.transform(scenario_df[features].values))
        y_ts.append(scenario_df[label_cols].values.astype(int))

    return X_train, X_ts, y_train, y_ts

In [11]:
# Train - Test on each Scenario Separately
def generate_train_test_ohe3():
    """Build an independent standardized 70/30 split for each scenario 1..13.

    Reads the module-level ``gen_feat_s_df`` frame and ``features`` list.
    For every scenario id a separate scaler is fitted on that scenario's
    training split and applied to its train and test splits.

    Returns
    -------
    tuple
        ``(X_tr, X_ts, y_tr, y_ts)``; each is a list with one array per
        scenario, ordered by scenario id. Labels are int arrays with columns
        ``IsBackground, IsBotnet``.
    """
    label_cols = ['IsBackground', 'IsBotnet']
    X_tr, y_tr = [], []
    X_ts, y_ts = [], []

    for scenario in range(1, 14):
        scenario_df = gen_feat_s_df[gen_feat_s_df['Scenario'] == scenario]
        samples = scenario_df[features].values
        targets = scenario_df[label_cols].values.astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            samples, targets, test_size=.3, random_state=25)

        scaler = preprocessing.StandardScaler().fit(X_train)

        X_tr.append(scaler.transform(X_train))
        y_tr.append(y_train)

        X_ts.append(scaler.transform(X_test))
        y_ts.append(y_test)

    return X_tr, X_ts, y_tr, y_ts

## Evaluation Functions - Prints

In [47]:
def print_all_results(y_test, y_pred, encoding):
    """Print a classification report and confusion matrix for binary labels.

    Parameters
    ----------
    y_test : numpy.ndarray
        True labels. With ``encoding == 'ohe'`` only column 0 of the 2-D array
        is evaluated (the callers in this notebook put ``IsBackground``
        there); otherwise the array is used as is.
    y_pred : numpy.ndarray
        Predicted labels in the same layout as ``y_test``.
    encoding : str
        ``'ohe'`` for two-column labels, anything else for 1-D labels.

    Notes
    -----
    Label 0 is reported as "Botnet" and label 1 as "Background". The label set
    is passed explicitly so that a test set containing only one of the two
    classes still produces a two-class report and a 2x2 confusion matrix,
    instead of a mismatch with the two ``target_names``.
    """
    if encoding == 'ohe':
        y_t1 = y_test[:,0]
        y_p1 = y_pred[:,0]
    else:
        y_t1 = y_test.copy()
        y_p1 = y_pred.copy()

    class_labels = [0, 1]

    print("Anomaly Detection:")
    print("\nClassification report:")
    print(metrics.classification_report(y_t1, y_p1, labels=class_labels,
                                        target_names=["Botnet", "Background"]))

    print("\nConfusion Matrix:")
    print(metrics.confusion_matrix(y_t1, y_p1, labels=class_labels))

    print()


## Models - (Cross Validation)

In [44]:
def logistic_regression_cv(C=1e5, cv=5):
    """Cross-validate logistic regression on the integer-encoded training split.

    Prints the mean fold score and twice the standard deviation of the scores.

    Parameters
    ----------
    C : float, optional
        Passed to ``LogisticRegression`` as ``C``.
    cv : int, optional
        Passed to ``cross_val_score`` as ``cv``.
    """
    model = LogisticRegression(C=C)

    # Only the training part of the split is used for cross-validation.
    X_train, _, y_train, _ = generate_train_test_ive()

    fold_scores = cross_val_score(model, X_train, y_train, cv=cv)
    print("CV Score: %0.2f (+/- %0.2f)" % (fold_scores.mean(), fold_scores.std() * 2))
    
def svm_classifier_cv(kernel='rbf', C=1e5, cv=5):
    """Cross-validate an SVM classifier on the integer-encoded training split.

    Prints the mean fold score and twice the standard deviation of the scores.

    Parameters
    ----------
    kernel : str, optional
        Passed to ``svm.SVC`` as ``kernel``.
    C : float, optional
        Passed to ``svm.SVC`` as ``C``.
    cv : int, optional
        Passed to ``cross_val_score`` as ``cv``.
    """
    model = svm.SVC(kernel=kernel, C=C)

    # Only the training part of the split is used for cross-validation.
    X_train, _, y_train, _ = generate_train_test_ive()

    fold_scores = cross_val_score(model, X_train, y_train, cv=cv)
    print("CV Score: %0.2f (+/- %0.2f)" % (fold_scores.mean(), fold_scores.std() * 2))
    
def knn_classifier_cv(cv=5):
    """Cross-validate k-NN for the odd k values 1..19 and plot score against k.

    Uses the training part of the two-column-label split. For every k the mean
    fold score and twice its standard deviation are printed; afterwards the
    mean scores are plotted against k.

    Parameters
    ----------
    cv : int, optional
        Passed to ``cross_val_score`` as ``cv``.
    """
    X_train, _, y_train, _ = generate_train_test_ohe()

    k_vals = []
    cv_scores = []
    for k in range(1, 20, 2):
        fold_scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=cv)
        mean_score = fold_scores.mean()
        print("CV Score (k = %d): %0.2f (+/- %0.2f)" % (k, mean_score, fold_scores.std() * 2))

        k_vals.append(k)
        cv_scores.append(mean_score)

    plt.plot(k_vals, cv_scores)
    plt.title("CV Scores vs. k values")
    plt.xlabel("k")
    plt.ylabel("CV Score")
    plt.grid(True)
    plt.show()
    
def random_forest_cv(n_estimators=25, cv=5):
    """Cross-validate a random forest on the two-column-label training split.

    Prints the mean fold score and twice the standard deviation of the scores.

    Parameters
    ----------
    n_estimators : int, optional
        Passed to ``RandomForestClassifier`` as ``n_estimators``.
    cv : int, optional
        Passed to ``cross_val_score`` as ``cv``.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators)

    # Only the training part of the split is used for cross-validation.
    X_train, _, y_train, _ = generate_train_test_ohe()

    fold_scores = cross_val_score(forest, X_train, y_train, cv=cv)
    print("CV Score: %0.2f (+/- %0.2f)" % (fold_scores.mean(), fold_scores.std() * 2))

In [45]:
# Cross-validate a single-tree forest with 5 folds.
random_forest_cv(n_estimators=1, cv=5)

Feature Counts

num	train	test	total
------------------------------
0	417107	178832	595939
1	3931	1613	5544
------------------------------
total:	421038	180445	601483

CV Score: 1.00 (+/- 0.00)


## Models - (Train -Test)

In [45]:
def logistic_regression(C=1e5):
    """Train logistic regression on the integer-encoded split and print test results.

    Parameters
    ----------
    C : float, optional
        Passed to ``LogisticRegression`` as ``C``.
    """
    classifier = LogisticRegression(C=C)

    X_train, X_test, y_train, y_test = generate_train_test_ive()

    # Fit on the training split, then predict the held-out split.
    y_pred = classifier.fit(X_train, y_train).predict(X_test)

    print()
    print('*' * 35)
    print_all_results(y_test, y_pred, 'ive')
    
def svm_classifier(kernel='rbf', C=5):
    """Train an SVM classifier on the integer-encoded split and print test results.

    Parameters
    ----------
    kernel : str, optional
        Passed to ``svm.SVC`` as ``kernel``.
    C : float, optional
        Passed to ``svm.SVC`` as ``C``.
    """
    classifier = svm.SVC(kernel=kernel, C=C)

    X_train, X_test, y_train, y_test = generate_train_test_ive()

    # Fit on the training split, then predict the held-out split.
    y_pred = classifier.fit(X_train, y_train).predict(X_test)

    print()
    print('*' * 35)
    print_all_results(y_test, y_pred, 'ive')
    
def knn_classifier(n_neighbors, tt_set = 1):
    """Train and evaluate a k-NN classifier on one of three split schemes.

    Parameters
    ----------
    n_neighbors : int
        Passed to ``KNeighborsClassifier`` as ``n_neighbors``.
    tt_set : int, optional
        ``1``: random 70/30 split over all data.
        ``2``: train on one group of scenarios, report on scenarios
        1, 2, 6, 8 and 9 individually.
        ``3``: fit and report on every scenario separately.
        Any other value does nothing.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    banner = '*' * 35

    if tt_set == 1:
        X_train, X_test, y_train, y_test = generate_train_test_ohe()
        y_pred = knn.fit(X_train, y_train).predict(X_test)

        print()
        print(banner)
        print_all_results(y_test, y_pred, 'ohe')
        return

    if tt_set == 2:
        X_train, X_test, y_train, y_test = generate_train_test_ohe2()
        knn.fit(X_train, y_train)

        # Test sets come back in this scenario order.
        test_order = [1, 2, 6, 8, 9]
        for idx, X_scenario in enumerate(X_test):
            y_pred = knn.predict(X_scenario)

            print("Scenario " + str(test_order[idx]))
            print_all_results(y_test[idx], y_pred, 'ohe')
            print()
            print(banner)
            print()
        return

    if tt_set == 3:
        X_train, X_test, y_train, y_test = generate_train_test_ohe3()

        # The classifier is refitted from scratch for every scenario.
        for idx, X_scenario_train in enumerate(X_train):
            knn.fit(X_scenario_train, y_train[idx])
            y_pred = knn.predict(X_test[idx])

            print("Scenario " + str(idx + 1))
            print_all_results(y_test[idx], y_pred, 'ohe')
            print()
            print(banner)
            print()
    
def random_forest(n_estimators=5, graph = False, tt_set = 1):
    """Train and evaluate a random forest on one of three split schemes.

    Parameters
    ----------
    n_estimators : int, optional
        Passed to ``RandomForestClassifier`` as ``n_estimators``.
    graph : bool, optional
        Only used with ``tt_set == 1``. When true, the first tree of the
        forest is exported to ``tree.dot`` and rendered to ``tree.png`` by
        invoking the Graphviz ``dot`` command (which must be installed).
    tt_set : int, optional
        ``1``: random 70/30 split over all data; also prints the feature
        importances.
        ``2``: train on one group of scenarios, report on scenarios
        1, 2, 6, 8 and 9 individually.
        ``3``: fit and report on every scenario separately.
        Any other value does nothing.
    """
    rand_forest_clf = RandomForestClassifier(n_estimators=n_estimators)

    if tt_set == 1:
        X_train, X_test, y_train, y_test= generate_train_test_ohe()

        rand_forest_clf.fit(X_train, y_train)

        y_pred = rand_forest_clf.predict(X_test)

        print()
        print('*'*35)
        print_all_results(y_test, y_pred, 'ohe')

        # Build the table column-wise so the importances stay numeric; packing
        # names and floats into one numpy array would turn them into strings.
        importances = np.round(rand_forest_clf.feature_importances_, 3)
        print(pd.DataFrame({'Features': features, 'Importance': importances}))

        if graph:
            #https://stats.stackexchange.com/questions/130206/sklearn-tree-export-graphviz-values-do-not-add-up-to-samples
            from sklearn.tree import export_graphviz

            class_names = [str(i) for i in rand_forest_clf.classes_]

            # Name the output file explicitly: without out_file, current
            # scikit-learn returns the dot source as a string and writes no
            # file, so the dot command below would have nothing to render.
            export_graphviz(rand_forest_clf.estimators_[0],
                            out_file='tree.dot',
                            feature_names=features,
                            class_names=class_names,
                            filled=True,
                            rounded=True)

            os.system('dot -Tpng tree.dot -o tree.png')

    elif tt_set == 2:
        X_train, X_test, y_train, y_test= generate_train_test_ohe2()

        rand_forest_clf.fit(X_train, y_train)

        # Test sets come back in this scenario order.
        test_order = [1,2,6,8,9]
        for i in range(len(X_test)):
            y_pred = rand_forest_clf.predict(X_test[i])

            print("Scenario " + str(test_order[i]))
            print_all_results(y_test[i], y_pred, 'ohe')
            print()
            print('*'*35)
            print()

    elif tt_set == 3:
        X_train, X_test, y_train, y_test= generate_train_test_ohe3()

        # The forest is refitted from scratch for every scenario.
        for i in range(len(X_train)):
            rand_forest_clf.fit(X_train[i], y_train[i])
            y_pred = rand_forest_clf.predict(X_test[i])

            print("Scenario " + str(i+1))
            print_all_results(y_test[i], y_pred, 'ohe')
            print()
            print('*'*35)
            print()
            
    
            

In [51]:
# Single-tree forest on the default split (tt_set=1): prints the evaluation
# report followed by the feature importances.
random_forest(n_estimators=1)

Feature Counts

num	train	test	total
------------------------------
0	417107	178832	595939
1	3931	1613	5544
------------------------------
total:	421038	180445	601483


***********************************
Anomaly Detection:

Classification report:
             precision    recall  f1-score   support

     Botnet       0.92      0.91      0.91      1613
 Background       1.00      1.00      1.00    178832

avg / total       1.00      1.00      1.00    180445


Confusion Matrix:
[[  1464    149]
 [   124 178708]]

         Features Importance
0     NumSrcPorts      0.312
1     NumDestAddr      0.074
2    NumDestPorts      0.104
3        NumFlows      0.019
4     NumBytesSum      0.066
5    NumBytesMean      0.137
6     NumBytesVar      0.002
7   NumPacketsSum      0.191
8  NumPacketsMean       0.09
9   NumPacketsVar      0.004
