In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sb
import glob

In [2]:
# Load the DNS feature table; the first CSV column becomes the row index.
df = pd.read_csv('DNS_datastore.csv', index_col=0)

# Every column from position 11 onwards is used as a model feature.
features = df.columns[11:].tolist()

# Feature matrix (missing values replaced by 0) and the target vector.
x = df[features].fillna(0).to_numpy()
y = df['Label'].to_numpy()

# Per-feature noise scale: 3% of each feature's mean over all rows.
sigma = x.mean(axis=0) * 0.03

#### Hyperparameter tuning of the RF model

In [3]:
%%time
# Parameter grid for the exhaustive search.
# Max depth and the number of estimators are kept small on purpose: the ranges
# are based on how much can fit in the interface while still providing a
# clear overview.
param_grid = dict(
    bootstrap=[True],
    n_estimators=list(range(2, 15)),
    max_features=['sqrt', 'log2'] + [k / 10 for k in range(1, 11)],
    max_depth=list(range(2, 6)),
    min_samples_leaf=[k / 10 for k in range(1, 5)],
    min_samples_split=[k / 10 for k in range(1, 11)],
)

# Exhaustive search over the grid with 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the full data set and show the winning combination.
grid_search.fit(x, y)
pprint(grid_search.best_params_)

Fitting 3 folds for each of 24960 candidates, totalling 74880 fits
{'bootstrap': True,
 'max_depth': 5,
 'max_features': 0.1,
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.1,
 'n_estimators': 2}
Wall time: 1h 5min 58s


#### Comparing scores

In [4]:
def avg_scores(model, x=x, y=y, sigma=sigma):
    """Score `model` with repeated 10-fold cross-validation on noisy data.

    The evaluation is repeated 10 times. In every repetition fresh Gaussian
    noise (mean 0, standard deviation `sigma`) is added to the original
    feature matrix before `cross_validate` is run with accuracy and F1
    scoring.

    Parameters
    ----------
    model : estimator
        Estimator handed to `cross_validate`.
    x : array
        Feature matrix; defaults to the notebook-level `x` at definition time.
    y : array
        Target labels; defaults to the notebook-level `y` at definition time.
    sigma : array or float
        Standard deviation of the noise, broadcast against `x`.

    Returns
    -------
    (acc, F1) : tuple of lists
        Two lists with one entry per repetition; each entry is the
        `test_accuracy` / `test_f1` result of one `cross_validate` call.
    """
    acc = []
    F1 = []

    for _ in range(10):
        # Perturb the ORIGINAL data in every repetition. Rebinding `x` here
        # would stack the noise of all previous repetitions on top of each
        # other, so later repetitions would be evaluated on noisier data.
        x_noisy = x + np.random.normal(0, sigma, size=x.shape)

        cv_results = cross_validate(model, x_noisy, y, cv=10, scoring=('accuracy', 'f1'))
        acc.append(cv_results['test_accuracy'])
        F1.append(cv_results['test_f1'])

    return acc, F1

In [5]:
# Empty result tables; later cells add one column per evaluated model.
acc_scores, F1_scores = pd.DataFrame(), pd.DataFrame()

In [6]:
%%time

# Baseline: a random forest with default hyperparameters.
rf_base = RandomForestClassifier()

# Store its repeated cross-validation scores under the 'RF base' column.
acc_scores['RF base'], F1_scores['RF base'] = avg_scores(model=rf_base)

Wall time: 56min 45s


In [None]:
# Hyperparameters of the tuned forest plus two runtime options
# (n_jobs, verbose).
# NOTE(review): these values differ from the best_params_ printed by the
# grid-search cell above - confirm which set is the intended one.
best_grid = dict(
    bootstrap=True,
    max_depth=3,
    max_features=5,
    min_samples_leaf=0.1,
    min_samples_split=0.2,
    n_estimators=11,
    n_jobs=-1,
    verbose=1,
)

# Random Forest, tuned model
rf_tuned = RandomForestClassifier(**best_grid)
#rf_tuned = grid_search.best_estimator_

# Store its repeated cross-validation scores under the 'RF tuned' column.
acc_scores['RF tuned'], F1_scores['RF tuned'] = avg_scores(model=rf_tuned)

Compare against other algorithms:

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters.
dt = DecisionTreeClassifier()

# Store its repeated cross-validation scores under the 'DT' column.
acc_scores['DT'], F1_scores['DT'] = avg_scores(model=dt)

In [None]:
from sklearn.svm import SVC

# Import the class directly so the name `svm` is only ever the classifier
# instance and never also the sklearn `svm` module being overwritten.
# Support vector classifier with default hyperparameters.
svm = SVC()

# Store its repeated cross-validation scores under the 'SVM' column.
acc_scores['SVM'], F1_scores['SVM'] = avg_scores(svm)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default hyperparameters.
neigh = KNeighborsClassifier()

# Store its repeated cross-validation scores under the 'KNN' column.
acc_scores['KNN'], F1_scores['KNN'] = avg_scores(model=neigh)

Plot accuracies and F1-scores in a bar chart:

In [None]:
def _per_model_stats(scores):
    """Return (means, stds) with one entry per column (model) of `scores`.

    A cell of a column may hold a single score or an array of per-fold
    scores, so every column is flattened before it is aggregated. The
    standard deviation uses ddof=1, like pandas' `DataFrame.std`.
    """
    flat = [np.concatenate([np.ravel(cell) for cell in scores[col]]).astype(float)
            for col in scores.columns]
    return [values.mean() for values in flat], [values.std(ddof=1) for values in flat]


# One group of bars per model (= column of the score tables).
ind = np.arange(acc_scores.shape[1])
width = 0.05

# Aggregate per model, not per row: the bar positions in `ind` are per model,
# so the bar heights must have one value per column as well.
acc_mean, acc_std = _per_model_stats(acc_scores)
F1_mean, F1_std = _per_model_stats(F1_scores)

fig, ax = plt.subplots(figsize=(10, 6))
acc_bars = ax.bar(ind-width/2, acc_mean, width, yerr=acc_std, label='Accuracy',
                  error_kw=dict(capsize=5, lw=0.5, capthick=0.5), color='lightskyblue', ecolor='navy')
F1_bars = ax.bar(ind+width/2, F1_mean, width, yerr=F1_std, label='F1-score')

# Zoom in on the top of the score range so model differences stay visible.
ax.set_ylim(0.9, 1)
ax.set_xticks(ind)
ax.set_xticklabels(acc_scores.columns)
ax.legend()

plt.tight_layout()
fig.savefig('Model_Metrics/Compared_Performances.png', dpi=300)

#### Verification of parameters with one change

In [None]:
def plot_results(grid, name, est=rf_tuned, x=x, y=y, sigma=sigma):
    """Plot accuracy and fit time of `est` against one hyperparameter.

    A 10-fold `GridSearchCV` over `grid` is repeated 10 times, each time on
    the original features plus fresh Gaussian noise (mean 0, standard
    deviation `sigma`). The mean train/test accuracy and mean fit time per
    parameter value are averaged over the repetitions and drawn in two
    side-by-side panels.

    Parameters
    ----------
    grid : dict
        Parameter grid; the first key is the parameter shown on the x-axis.
    name : str
        Human-readable parameter name used for axis labels and titles.
    est : estimator
        Estimator to tune; defaults to the notebook-level `rf_tuned`.
    x, y : array
        Features and labels; default to the notebook-level `x` and `y`.
    sigma : array or float
        Standard deviation of the added noise, broadcast against `x`.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend.
    """
    param_name = 'param_%s' % list(grid.keys())[0]

    # One entry per repetition; turned into one DataFrame column each below.
    train_runs = {}
    test_runs = {}
    time_runs = {}

    for i in range(10):
        # Create a model with 10-fold cross validation. The estimator must be
        # passed as `estimator=`; use the `est` argument, not the global.
        model = GridSearchCV(estimator=est, param_grid=grid, cv=10,
                             scoring='accuracy', return_train_score=True)

        # Add random Gaussian noise with every repetition. The noise is added
        # to the ORIGINAL data so it does not accumulate across repetitions.
        x_noisy = x + np.random.normal(0, sigma, size=x.shape)
        model.fit(x_noisy, y)

        # Extract information from the cross validation model
        train_runs[i] = model.cv_results_['mean_train_score']
        test_runs[i] = model.cv_results_['mean_test_score']
        time_runs[i] = model.cv_results_['mean_fit_time']

    trainscores = pd.DataFrame(train_runs)
    testscores = pd.DataFrame(test_runs)
    fittime = pd.DataFrame(time_runs)

    # Average over the repetitions (columns) for every parameter value (row).
    train_scores = trainscores.mean(axis=1)
    train_std    = trainscores.std(axis=1)

    test_scores  = testscores.mean(axis=1)
    test_std     = testscores.std(axis=1)

    time_mean    = fittime.mean(axis=1)
    time_std     = fittime.std(axis=1)

    # The parameter values are the same in every repetition; take the last one.
    param_values = list(model.cv_results_[param_name])
    print(param_values)

    fig, (ax_acc, ax_time) = plt.subplots(1, 2, figsize=(10, 6))

    # Left panel: train/test accuracy over the parameter
    ax_acc.plot(param_values, train_scores, 'b-', label='train')
    ax_acc.fill_between(param_values, train_scores-train_std, train_scores+train_std, alpha=0.5, color='b')
    ax_acc.plot(param_values, test_scores, 'g-', label='test')
    ax_acc.set_ylim(0.94, 1)
    ax_acc.legend()
    ax_acc.set_xlabel(name)
    ax_acc.set_ylabel('Accuracy')
    ax_acc.set_title('accuracy vs %s' % name)

    # Right panel: fit time over the parameter
    ax_time.plot(param_values, time_mean, 'r-')
    ax_time.fill_between(param_values, time_mean-time_std, time_mean+time_std, alpha=0.5, color='r')
    ax_time.set_ylim(0.0, 1.5)
    ax_time.set_xlabel(name)
    ax_time.set_ylabel('Train time (sec)')
    ax_time.set_title('Training time vs %s' % name)

    fig.tight_layout(pad=4)

#### Feature selection

In [None]:
# file_list = {'benign plain.pcap':0,
#              'dns2tcp tunneling.pcap':1,
#              'dnscapy tunneling.pcap':1,
#              'iodine tunneling.pcap':1,
#              'tuns_c_00000_20180330104021.pcap':1}

# for file, label in file_list.items():
#     print(label, '\t' , 'data/pcaps/'+file)



In [None]:
# feature_importances_ and estimators_ only exist on a fitted forest.
# cross_validate and GridSearchCV fit clones of the estimator, so rf_tuned
# itself may still be unfitted at this point; fit it on the full data if so.
if not hasattr(rf_tuned, 'estimators_'):
    rf_tuned.fit(x, y)

# Importance of every feature according to the whole forest ...
importances = pd.DataFrame(rf_tuned.feature_importances_, columns=['fi'], index=features)
# ... and its spread over the individual trees (sample standard deviation).
importances['std'] = np.std([tree.feature_importances_ for tree in rf_tuned.estimators_], axis=0, ddof=1)

# Most important features first.
importances = importances.sort_values('fi', ascending=False)
display(importances)

In [None]:
# Select features whose importance in the forest exceeds SelectFromModel's
# default threshold.
sel = SelectFromModel(rf_tuned)

# No train/test split is defined in this notebook (x_train / y_train do not
# exist), so the selector is fitted on the full data set.
sel.fit(x, y)

# Names of the features kept by the selector.
selected_feat = df.loc[:, features].columns[sel.get_support()]
print(list(selected_feat))

In [None]:
# Histogram of the feature importances of the forest fitted by the selector.
(
    pd.Series(sel.estimator_.feature_importances_.ravel())
    .hist()
)