# Feature Extraction

Flow aggregation by a time window

Generated features:
* NumSrcPorts
* NumDestAddr
* NumDestPorts
* NumFlows
* NumBytes
* NumPackets

## Imports

In [6]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import glob
import time
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

## Functions

In [2]:
def extract_features(input_df):
    # Take only the botnet part
    df_botnet = input_df[input_df['Label'].str.contains("flow=From-Botnet")]
    df_botnet.loc[:, "StartTime"] = pd.to_datetime(df_botnet.StartTime, format='%Y/%m/%d %H:%M:%S.%f')
    df_botnet.sort_values(by=['StartTime'], inplace=True)
    df_botnet.reset_index(drop=True, inplace=True)
#     df_botnet.head()
    
    
    # Determine the time windows
    time_windows = [0]
    for i in range(1, len(df_botnet)):
        # Find the optimal time window duration
        if (df_botnet["StartTime"][i] - df_botnet["StartTime"][time_windows[-1]]).seconds > 30:
            time_windows.append(i)

    time_windows.append(len(df_botnet)) # Added it for easier indexing             
    
    
    # Determined column keys
    column_keys = ["NumSrcPorts", "NumDestAddr", "NumDestPorts", "NumFlows", "NumBytes", "NumPackets"]
    gen_df = pd.DataFrame(None, columns=column_keys)

    # Generate the features for each time window
    for i in range(len(time_windows)-1):
        current_df = df_botnet.iloc[time_windows[i]:time_windows[i+1],:]

        group = current_df.groupby("SrcAddr")

        for address, addr_df in group:
            # TODO: Optionally source ip address can be added
#             print(address)

            num_src_ports = len(addr_df.Sport.unique())

            num_dest_addr = len(addr_df.DstAddr.unique())

            num_dest_ports = len(addr_df.Dport.unique())

            num_flows = time_windows[1] - time_windows[0]

            # TODO: SrcBytes or TotBytes?
            num_bytes = np.sum(addr_df.SrcBytes)

            num_packets = np.sum(addr_df.TotPkts)

            curr_gen_df = pd.DataFrame([[num_src_ports, num_dest_addr, num_dest_ports, num_flows, num_bytes, num_packets]], columns=column_keys)
            gen_df = gen_df.append(curr_gen_df, ignore_index=True)
            
    return gen_df


def extract_features_all(data_path):
    scenario_list = os.listdir(data_path)
#     print(scenario_list)

    feat_list = []
    
    for scenario in scenario_list:
        scenario_path = os.path.join(data_path, scenario)
        
        if os.path.isdir(scenario_path):
            flow_file_path = glob.glob(os.path.join(scenario_path, "*.binetflow"))[0]
            
            # Extract features for the current scenario
            input_df = pd.read_csv(flow_file_path)
            scenario_features = extract_features(input_df)
            
            # Append scenario label
            scenario_features.loc[:, "Scenario"] = int(scenario)
            
            feat_list.append(scenario_features)
            
    return pd.concat(feat_list)

## Load and process data

In [3]:
DATA_PATH = os.path.join("..", "ctu-13")

# start_time = time.time()
gen_feat_df = extract_features_all(DATA_PATH)
# print("--- %s seconds ---" % (time.time() - start_time))
gen_feat_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,NumSrcPorts,NumDestAddr,NumDestPorts,NumFlows,NumBytes,NumPackets,Scenario
0,2,2,2,3,780,11,1
1,10,7,6,3,15072,510,1
2,22,18,6,3,20881,659,1
3,13,13,4,3,2771,47,1
4,16,15,4,3,7261,183,1


# Machine Learning Models

## Generate test and train sets

In [4]:
# Generate test and train sets
X = gen_feat_df.iloc[:, range(0, 6)].values
y = gen_feat_df.iloc[:, 6].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state=25)

# Feature Counts
print("Feature Counts\n")
print("num\ttrain\ttest\ttotal")
print("-" * 30)
for i in range(1, 14):
    print("{0}\t{1}\t{2}\t{3}".format(i, np.count_nonzero(y_train==i), np.count_nonzero(y_test==i), np.count_nonzero(y==i)))
print("-" * 30)
print("total:\t{0}\t{1}\t{2}".format(len(y_train), len(y_test), len(y)))

Feature Counts

num	train	test	total
------------------------------
1	355	176	531
2	262	98	360
3	1188	489	1677
4	93	42	135
5	25	10	35
6	139	60	199
7	2	2	4
8	828	362	1190
9	1858	798	2656
10	345	132	477
11	10	4	14
12	81	44	125
13	1263	547	1810
------------------------------
total:	6449	2764	9213


## Evaluation Functions

In [5]:
def print_results():
    print("\nAccuracy:")
    print(metrics.accuracy_score(y_test, y_pred))
    
    print("\nClassification report:")
    print(metrics.classification_report(y_test, y_pred))
    
    print("\nConfusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_pred))

## Logistic Regression

In [None]:
log_regression = LogisticRegression()

# Train the classifier
log_regression.fit(X_train, y_train)

# Predict
y_pred = log_regression.predict(X_test)

print_results()

## KNN
Tries for different k values and prints the metrics for the k value with the best accuracy

In [None]:
acc_vals = []
k_vals = []
pred_vals = []

for i in range(20):
    k = 2*i+1
    knn = KNeighborsClassifier(n_neighbors=k)

    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    
    acc_vals.append(metrics.accuracy_score(y_test, y_pred))
    k_vals.append(k)
    pred_vals.append(y_pred)

# print_results()
plt.plot(k_vals, acc_vals)
plt.title("Accuracy vs. k values")
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.grid(True)

# Plot the results for best k value
i_max_acc = acc_vals.index(max(acc_vals))
print("Results for k={0}".format(2*i_max_acc+1))

y_pred = pred_vals[i_max_acc]
print_results()

## 10-fold KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=k)
scores = cross_val_score(knn, X, y, cv=10)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## 10-fold Logistic Regression

In [None]:
log_reg = LogisticRegression()
scores = cross_val_score(log_reg, X, y, cv=10)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## SVM

In [None]:
svm_clf = svm.SVC(kernel='rbf', C=1)

svm_clf.fit(X_train, y_train)

y_pred = svm_clf.predict(X_test)

print_results()

## Random Forest

In [7]:
rand_forest_clf = RandomForestClassifier(n_estimators=10)

rand_forest_clf.fit(X_train, y_train)

y_pred = rand_forest_clf.predict(X_test)

print_results()


Accuracy:
0.951519536903039

Classification report:
             precision    recall  f1-score   support

          1       0.78      0.79      0.78       176
          2       0.66      0.62      0.64        98
          3       0.99      1.00      0.99       489
          4       0.95      0.88      0.91        42
          5       0.50      0.40      0.44        10
          6       0.96      0.92      0.94        60
          7       0.00      0.00      0.00         2
          8       0.99      0.99      0.99       362
          9       0.99      1.00      0.99       798
         10       0.95      0.95      0.95       132
         11       1.00      0.75      0.86         4
         12       1.00      1.00      1.00        44
         13       0.94      0.95      0.94       547

avg / total       0.95      0.95      0.95      2764


Confusion Matrix:
[[139  20   0   0   2   0   0   0   0   0   0   0  15]
 [ 25  61   0   0   0   0   0   0   0   0   0   0  12]
 [  0   0 487   2   

  'precision', 'predicted', average, warn_for)


## 10-fold Random Forest

In [11]:
rand_forest_clf = RandomForestClassifier(n_estimators=10)

scores = cross_val_score(rand_forest_clf, X, y, cv=10)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



Accuracy: 0.93 (+/- 0.10)
