In [5]:
import pandas as pd
import numpy as np
from pcap_feature_parser import *
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sb
import glob

In [6]:
# Read the DNS feature table; the first CSV column becomes the row index.
df = pd.read_csv('DNS_datastore.csv', index_col=0)

# Every column from position 8 onward is treated as a model input.
features = list(df.columns[8:])

# Feature matrix (missing values replaced by 0) and the label vector.
x = np.array(df[features].fillna(0))
y = np.array(df['Label'])

# One value per feature: 3% of that feature's column mean.
sigma = x.mean(axis=0) * 0.03

#### Grid Search

In [None]:
 %%time

# Create the parameter grid to be searched:
#   * The choices for max depth and number of estimators are based on
#     how much can fit in the interface while still providing clear overview.
#   * Float entries for max_features, min_samples_leaf and min_samples_split
#     are read by scikit-learn as fractions (of the features / of the samples).
#   * Grid size: 11 * 12 * 4 * 5 * 5 = 13,200 candidates, each fitted on
#     10 folds, so this cell is expensive to re-run.

# Fixed seed so that fold assignment and forest construction (and therefore
# the selected parameters) are reproducible across kernel restarts.
GRID_SEARCH_SEED = 42

# Metric used to rank candidates and to refit the final model. It is also
# interpolated into the printed summary so the message cannot drift away
# from the criterion that is actually optimised.
REFIT_METRIC = 'precision'

param_grid = {
    'bootstrap': [True],
    'n_estimators': list(range(4, 15)),
    # The comprehension variable is deliberately not called `x`, to avoid
    # confusion with the feature matrix `x` used in the fit call below.
    'max_features': ['sqrt', 'log2'] + [tenths / 10 for tenths in range(1, 11)],
    'max_depth': list(range(2, 6)),
    'min_samples_leaf': [tenths / 10 for tenths in range(1, 6)],  # fraction
    'min_samples_split': [tenths / 10 for tenths in range(1, 6)],  # fraction
}

# Instantiate the grid search model:
# NOTE(review): the 'precision' and 'f1' scorers use scikit-learn's binary
# defaults -- this assumes `y` is a binary label; confirm against the data.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=GRID_SEARCH_SEED),
    param_grid=param_grid,
    scoring=['precision', 'accuracy', 'f1'],
    refit=REFIT_METRIC,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=GRID_SEARCH_SEED),
    n_jobs=-1,
    verbose=0,
)

# Fit the grid search to the data:
grid_search.fit(x, y)

# Candidates are ranked by REFIT_METRIC (precision), not by false-negative
# rate, so the summary line names the metric that was really used.
print(f'\nThe parameters with the highest mean cross-validated {REFIT_METRIC} are\n')
pprint(grid_search.best_params_)
print('')

In [None]:
# Keep a handle on the winning model: because the search was created with a
# `refit` metric, `best_estimator_` is the forest retrained on the full data
# passed to `fit` using `best_params_`.
rf_best = grid_search.best_estimator_

In [8]:
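# ROC curve for the selected model (currently disabled).
# NOTE: sklearn.metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use
# sklearn.metrics.RocCurveDisplay.from_estimator(rf_best, x, y) instead.
#from sklearn.metrics import plot_roc_curve
#plot_roc_curve(rf_best, x, y)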