In [1]:
#!jupyter nbextension enable --py widgetsnbextension --sys-prefix
#!jupyter serverextension enable voila --sys-prefix

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
import ipywidgets as widgets
from IPython.display import display, Image, HTML, clear_output 
from ipyfilechooser import FileChooser
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import glob
import os
import time
from contextlib import contextmanager
from rf_visualiser import *
from pcap_feature_parser import *
from joblib import load
from pprint import pprint
import csv

In [3]:
%%html
<style>
.jupyter-widgets.widget-tab > .p-TabBar .p-TabBar-tab {
    flex: 0 1 100%
}
</style>

In [4]:
%matplotlib inline

# Names of the feature columns used by the model. train_build() reassigns this
# module-level list and test_data()/singletree_clicked() read it.
# The first 8 datastore columns are skipped -- presumably non-feature metadata
# (source path, label, session, ...); TODO confirm against pcap_feature_parser.
# The datastore file is only created further down in the notebook, so on a
# fresh run it may not exist yet: start with an empty list instead of crashing.
features = (
    list(pd.read_csv('DNS_datastore.csv', index_col=0).columns)[8:]
    if os.path.exists('DNS_datastore.csv')
    else []
)

# button functions

@contextmanager
def _loading_message(label):
    """Show 'Loading...' on `label` for the duration of the with-block.

    The label is always reset to an empty string on exit, including when the
    wrapped code raises, so the indicator cannot get stuck on 'Loading...'.
    """
    label.value = 'Loading...'
    try:
        yield
    finally:
        label.value = ''


def show_loading():
    """Context manager driving the `loading` label (looked up at call time)."""
    return _loading_message(loading)


def show_loading2():
    """Context manager driving the `loading2` label (looked up at call time)."""
    return _loading_message(loading2)


def show_loading3():
    """Context manager driving the `loading3` label (looked up at call time)."""
    return _loading_message(loading3)

    
def refresh_treetabs():
    """Redraw the outputs of the 'View Trees and Parameters' tab.

    When a ``DecisionTrees`` path exists, every ``DecisionTrees/*.svg`` is shown
    as one tab inside ``out_treetabs`` and the saved feature-importance image,
    confusion-matrix image and evaluation-metrics HTML are shown in ``out_fi``,
    ``out_cm`` and ``out_em`` (or a hint is printed when a file is missing).

    Otherwise the datastore decides what happens: with more than one distinct
    label the model is trained through ``train_build``; with a single label, or
    without a ``Label`` column, an explanatory message is printed instead.
    """
    # Local import: the notebook does `import glob` (the module), so calling
    # `glob(...)` would otherwise depend on a star-import rebinding the name.
    from glob import glob

    if os.path.exists('DecisionTrees'):
        with out_treetabs:
            clear_output()
            with show_loading():
                images = glob('DecisionTrees/*.svg')
                treewidgets = [widgets.Image.from_file(img) for img in images]
                treetabs = widgets.Tab(layout={'justify_items':'center'})
                treetabs.children = treewidgets
                for i in range(len(treewidgets)):
                    treetabs.set_title(i, 'Tree {}'.format(i+1))
                display(treetabs)
        with out_fi:
            clear_output()
            if os.path.exists('Model_Metrics/Feature_Importances.png'):
                display(Image('Model_Metrics/Feature_Importances.png'))
            else:
                print('No image of feature importances was found. Try retraining the model.')
        with out_cm:
            clear_output()
            if os.path.exists('Model_Metrics/Confusion_Matrix.png'):
                display(Image('Model_Metrics/Confusion_Matrix.png', width=600))
            else:
                print('No image of the confusion matrix was found. Try retraining the model')
        with out_em:
            clear_output()
            if os.path.exists('Model_Metrics/Evaluation_Metrics.html'):
                # IPython's HTML() loads the file when the string is an existing path.
                display(HTML('Model_Metrics/Evaluation_Metrics.html'))
            else:
                print('No table of evaluation metrics was found.')
        return

    # Read only the header row, and close the file handle afterwards.
    with open('DNS_datastore.csv', newline='') as datastore:
        columns = csv.DictReader(datastore).fieldnames or []

    if 'Label' in columns:
        labels = pd.read_csv('DNS_datastore.csv', usecols=['Label'])['Label'].unique()
        if len(labels) == 1:
            # Output widgets have no `value` trait; the message must be printed
            # inside the widget's context to become visible.
            with out_treetabs:
                clear_output()
                print('The Random Forest algorithm needs both benign and malicious data to be trained.')
        else:
            train_build(pd.read_csv('DNS_datastore.csv', index_col=0))
            # Training is expected to write the tree images; if it did, draw
            # them now. The branch above returns, so this recurses at most once.
            if os.path.exists('DecisionTrees'):
                refresh_treetabs()
    else:
        with out_treetabs:
            clear_output()
            print('The datastores are empty. Upload training data to train the model and display Decision Trees.')
        with out_fi:
            clear_output()
            print('The datastores are empty. Upload training data to train the model and display Feature Importances.')

    

%matplotlib inline            
def train_build(df, test_size=0.2, random_state=None):
    """Train the Random Forest on `df` and regenerate its visualisations.

    Parameters
    ----------
    df : pandas.DataFrame
        Datastore contents. Must contain a 'Label' column; every column from
        position 8 onwards is used as a feature (NaNs are replaced by 0).
    test_size : float, optional
        Fraction of the rows held out for the confusion matrix (default 0.2,
        the previously hard-coded value).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``; pass an int for a reproducible
        split. ``None`` (default) keeps the previous unseeded behaviour.

    Side effects
    ------------
    Reassigns the module-level ``features`` list, deletes every file matching
    ``DecisionTrees/*`` and then calls ``train_rf``, ``vizdtrees``, ``plot_fi``
    and ``plot_cm`` from the project helpers. What those helpers write to disk
    is not visible here -- presumably the tree images, the metric files and
    ``trained_rf.joblib``; TODO confirm in rf_visualiser.
    """
    global features
    # Local import: the notebook does `import glob` (the module), so calling
    # `glob(...)` would otherwise depend on a star-import rebinding the name.
    from glob import glob

    features = list(df.columns)[8:]

    x = np.array(df.loc[:, features].fillna(0))
    y = np.array(df['Label'])

    # Stratify so both classes are represented in the held-out part.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, stratify=y, random_state=random_state)
    rf = train_rf(x_train, y_train, features)

    # Drop the previous model's tree images before drawing the new ones.
    for stale_file in glob('DecisionTrees/*'):
        os.remove(stale_file)
    # The trees are visualised with the full data set, not only the training part.
    vizdtrees(rf, x, y, features)

    plot_fi(rf, features)
    plot_cm(rf, x_test, y_test)
    

def retrain():
    """Retrain the model from the current datastore, if it can be trained.

    Training needs a datastore with a 'Label' column holding at least two
    distinct labels. When that is not the case, the reason is written to the
    ``loading2`` status label next to the retrain button. Messages go there
    because ``Output`` widgets have no ``value`` trait, plain ``print`` output
    is not shown from a button callback, and ``out_treetabs`` is cleared by
    the ``refresh_treetabs`` call that follows a retrain click.
    """
    empty_message = 'The datastores are empty. Upload data first to be able to train the model.'

    if not os.path.exists('DNS_datastore.csv'):
        loading2.value = empty_message
        return

    # Read only the header row, and close the file handle afterwards.
    with open('DNS_datastore.csv', newline='') as datastore:
        columns = csv.DictReader(datastore).fieldnames or []
    if 'Label' not in columns:
        # The file exists but was created empty: nothing to train on.
        loading2.value = empty_message
        return

    df = pd.read_csv('DNS_datastore.csv', index_col=0)
    labels = df['Label'].unique()
    if len(labels) == 0:
        loading2.value = empty_message
    elif len(labels) == 1:
        loading2.value = (
            'The model cannot be trained yet, because the datastore contains only %s samples so far. \n'
            % ('benign' if labels[0] == 0 else 'malicious')).strip()
    else:
        with show_loading2():
            train_build(df)

        
def refresh_sources():
    """Sync the view/erase dropdowns and their buttons with the datastore.

    The dropdowns list 'ALL' followed by every distinct 'Source Path' found in
    ``DNS_datastore.csv``; when there is none, a placeholder entry is shown and
    the erase, view and backup buttons are disabled.
    """
    # Read only the header row, and close the file handle afterwards.
    with open('DNS_datastore.csv', newline='') as datastore:
        columns = csv.DictReader(datastore).fieldnames or []

    if 'Source Path' in columns:
        # `squeeze=True` was removed from read_csv in pandas 2.0; selecting the
        # column yields the same Series on every pandas version.
        source_column = pd.read_csv('DNS_datastore.csv', usecols=['Source Path'])['Source Path']
        sources = list(source_column.unique())
    else:
        sources = []

    options = ['ALL'] + sources if sources else ['(the datastore is empty)']
    dropdown_erase.options = options
    dropdown_view.options = options

    for button in (button_erase, button_view, button_backup):
        button.disabled = not sources
    
    
def preview():
    """Show the first 10 rows of the capture selected in the training file chooser.

    Output goes to the ``out`` widget. A missing selection and a failing
    ``read_pcap`` are reported separately, so a parse error is no longer
    mislabelled as "No file is selected."
    """
    out.layout.visibility = 'visible'
    with out:
        clear_output()
        if fc.selected is None:
            print('No file is selected.')
            return
        with show_loading():
            try:
                df = read_pcap(fc.selected)
            except Exception as err:
                print('The selected file could not be read: %s' % err)
                return
            # `df != None` compares element-wise on a DataFrame and raises when
            # used as a condition; an identity check is what is meant here.
            if df is not None:
                print('\nYour selected '+m.value.lower()+' file looks like:\n')
                display(df.head(10))

                
def upload():
    """Add the selected capture to the datastore and retrain the model.

    The file chosen in ``fc`` is labelled according to the radio button ``m``
    (1 for 'Malicious', 0 otherwise) and handed to ``update_datastore``. A path
    that is already present in the datastore's 'Source Path' column is
    rejected. The model is retrained only when the resulting data holds more
    than one distinct label. All feedback is printed into the ``out`` widget.
    """
    out.layout.visibility = 'visible'
    with out:
        clear_output()
        if fc.selected is None:
            print('\nNo file selected.\n')
            return

        # Read only the header row, and close the file handle afterwards.
        with open('DNS_datastore.csv', newline='') as datastore:
            columns = csv.DictReader(datastore).fieldnames or []
        if 'Source Path' in columns:
            # `squeeze=True` was removed from read_csv in pandas 2.0.
            known_sources = pd.read_csv('DNS_datastore.csv', usecols=['Source Path'])['Source Path'].unique()
            if fc.selected in list(known_sources):
                print('\nA file with this path is already included in the datastore. \n')
                return

        label = 1 if m.value=='Malicious' else 0
        start_time = time.time()
        with show_loading():
            df = update_datastore(fc.selected, label)
        print('\nYour '+ m.value.lower() +' file is successfully uploaded to the datastore. \n'+
                'It took %.2f seconds to parse the file into useful features. \n' % (time.time()-start_time))
        refresh_sources()
        print('Now trying to train the model...')

        # only train the model if both Benign and Malicious samples are uploaded:
        labels = df['Label'].unique()
        if len(labels)==1:
            print('The model cannot be trained yet, because the data only contains %s samples. \n' \
                  % ('benign' if labels[0]==0 else 'malicious'))
        else:
            start_time = time.time()
            with show_loading():
                train_build(df)
                print('The Random Forest model has succesfully been updated with your data.')
                print('This took %.2f seconds. \n' % (time.time()-start_time))

                    
def view(selected):
    """Display the descriptive columns of the datastore in ``out_view``.

    Parameters
    ----------
    selected : str
        'ALL' to show every row, otherwise a 'Source Path' value; only the
        rows originating from that file are shown.
    """
    out_view.clear_output()
    out_view.layout.visibility = 'visible'
    cols = ['Source Path','Label','Session','Protocol','RR type','Query Name','Payload']
    with out_view:
        datastore = pd.read_csv('DNS_datastore.csv', index_col=0)
        if selected != 'ALL':
            # Filter with a boolean mask on the same frame. The previous code
            # took row *positions* from a second read and used them as index
            # *labels*, which only matches when the saved index is 0..n-1.
            datastore = datastore.loc[datastore['Source Path'] == selected]
        display(datastore.loc[:, cols])


def test_data():
    """Classify the capture selected in ``fc_test`` with the stored model.

    The file is parsed into features and previewed in ``out_test``. When
    ``trained_rf.joblib`` exists, the packets are classified and the result is
    shown in ``out_results``; the packet/tree dropdowns and the single-tree
    button are enabled only when at least one packet is predicted malicious.

    Side effects: sets the module-level ``x_test`` (feature frame of the tested
    file) and ``y_predict`` (model predictions), which ``singletree_clicked``
    reads afterwards.
    """
    global x_test, y_predict

    out_test.layout.visibility = 'visible'
    if fc_test.selected is None:
        for output in (out_test, out_results, out_singletree):
            with output:
                clear_output()
                print('\nNo file selected.\n')
        return

    # An Output context shows exceptions in the widget instead of propagating
    # them, so remember whether parsing actually produced a frame.
    testdata = None
    with out_test:
        clear_output()
        with show_loading3():
            start_time = time.time()
            pcap = read_pcap(fc_test.selected)
            testdata = parse_features(pcap, 0).drop('Label', axis=1)
            print('The file is parsed into features and is now being tested. \
                      \nIt took %.2f seconds to parse. Here is a preview of your uploaded file.' % (time.time()-start_time))
            display(testdata[['Session','RR type', 'Query Name', 'Payload']].head())
    if testdata is None:
        return

    if not os.path.exists('trained_rf.joblib'):
        # Previously nothing at all was reported in this case.
        with out_results:
            clear_output()
            print('No trained model was found. Upload training data to train the model first.')
        return

    x_test = testdata.loc[:,features].fillna(0)
    # NOTE: joblib.load unpickles the file; only load model files this
    # application wrote itself, never files from an untrusted source.
    rf = load('trained_rf.joblib')
    y_predict = rf.predict(x_test)

    with out_results:
        clear_output()
        i_malicious = [i for i in range(len(y_predict)) if y_predict[i] == 1]
        if not i_malicious:
            print('No anomalies were found!')
            dropdown_tree.options = []
            dropdown_packet.options = []
            button_singletree.disabled = True
        else:
            sessions = list(pd.unique(testdata.iloc[i_malicious]['Session']))
            # Sessions look like '<src> > <host>:<port>'; keep the host after ' > '.
            IPs = [str(session.split(' > ')[1].split(':')[0]) for session in sessions]
            # Sorted so the list is shown in a stable order between runs.
            sus = '\n'.join(sorted(set(IPs)))
            print_sus = 'The following sessions were marked as suspicious:\n'+sus
            label_res = widgets.Label(value=print_sus)
            out_res = widgets.Output()
            with out_res: display(testdata.iloc[i_malicious])
            acc_res = widgets.Accordion(children=[out_res], selected_index=None)
            acc_res.set_title(0, 'View detailed list of malicious packets')

            display(label_res)
            display(acc_res)

            # Trees are offered 1-based to the user.
            dropdown_tree.options= [tree+1 for tree in range(rf.n_estimators)]
            dropdown_packet.options=list(testdata.index)
            button_singletree.disabled=False
    

In [5]:
# Folder holding the rendered decision-tree images.
gv_file_path = 'DecisionTrees/'
title = widgets.HTML(value = '<h2> DNS Tunnelling Detection with Random Forest Algorithm</h2>')

# Read the logo bytes through a context manager so the file handles are closed.
with open('styling/logo_Babcock.jpg', 'rb') as logo_file:
    logo_babcock = widgets.Image(value=logo_file.read(), width=60, height=60)
with open('styling/logo_Cranfield.jpg', 'rb') as logo_file:
    logo_cranfield = widgets.Image(value=logo_file.read(), width=60, height=60)

tab = widgets.Tab()

# Make sure a (possibly empty) datastore file exists, because the widgets and
# callbacks below read it unconditionally. os.path.exists is used because the
# notebook's `import glob` binds the module, not the glob() function.
if not os.path.exists('DNS_datastore.csv'):
    new = pd.DataFrame()
    new.to_csv('DNS_datastore.csv')


#----------in the first tab----------------------------------------------

# Introductory text shown at the top of the first tab ('Update Model').
# The backslash continuations are inside the string literal, so the leading
# whitespace of the continued lines is part of the HTML value.
descr_tab1 = widgets.HTML(value='<h3>Train the machine learning algorithm with new data</h3>'+
    '<p>In this tab, labelled training data can be added to the datastores that build the Random Forest model. \
    The more samples are added, both of benign and malicious data, the better the model becomes at detecting anomalies. \
    In this tab, the data in the datastores can also be viewed or removed.</p><br>\
    <p>Every time new data is added, the model will rebuild itself completely by sampling all the data from the updated \
    datastores. An ensemble of uncorrelated Decision Trees makes up the Random Forest through the process of bootstrap \
    aggregation: each tree randomly samples from the updated datastore with replacement, and each tree uses a random \
    subset of features, in order to maximise variation among trees and minimise correlation.</p><br>')

# Help text for the 'File Selection' accordion section.
descr1_1 = widgets.HTML(value=\
    'Select a <code>.pcap</code> file below and specify whether it is known as malicious or benign data.<br><br>')
# Placeholder (a single space) for the 'View Datastores' accordion section.
descr1_2 = widgets.HTML(value=\
    ' ')
# Help text for the 'Erase Datastores' accordion section.
descr1_3 = widgets.HTML(value=\
    'After deleting data from the datastores, unless everything is deleted, the model will be retrained. It might take a minute.<br><br>')

# File picker for training captures: starts in ./data and lists *.pcap files.
fc = FileChooser('data')
fc.filter_pattern = '*.pcap'
fc.use_dir_icons = True

# Ground-truth label the user assigns to the selected capture.
m = widgets.RadioButtons(options=['Malicious', 'Benign'], description='This data is')


def preview_clicked(b):
    """Click handler of the preview button; the button argument is unused."""
    preview()


button_preview = widgets.Button(
    icon='search',
    description=' Preview',
    tooltip='Click to Preview',
    button_style='primary',
    layout=widgets.Layout(width='200px'))
button_preview.on_click(preview_clicked)


def upload_clicked(b):
    """Click handler of the upload button; the button argument is unused."""
    upload()


button_upload = widgets.Button(
    icon='check',
    description=' Upload & Train Model',
    tooltip='Click to Upload',
    button_style='warning',
    layout=widgets.Layout(width='200px'))
button_upload.on_click(upload_clicked)

# Feedback area of the 'File Selection' section; hidden until first used.
out = widgets.Output(
    layout=widgets.Layout(visibility='hidden', border='1px solid black', padding='7px'))

# Distinct source files currently in the datastore (empty list when the
# datastore has no 'Source Path' column). The header is read through a context
# manager so the file handle is closed, and the column is selected explicitly
# because read_csv's `squeeze=True` was removed in pandas 2.0.
with open('DNS_datastore.csv', newline='') as _datastore:
    _columns = csv.DictReader(_datastore).fieldnames or []
sources = (
    list(pd.read_csv('DNS_datastore.csv', usecols=['Source Path'])['Source Path'].unique())
    if 'Source Path' in _columns else []
)

# Choice of which uploaded file to display; a placeholder entry is offered
# while the datastore holds no sources.
if sources:
    _view_options = ['ALL'] + sources
else:
    _view_options = ['(the datastore is empty)']

dropdown_view = widgets.Dropdown(
    description='Choose which file to view:',
    options=_view_options,
    layout=widgets.Layout(width='initial'),
    style={'description_width':'200px'})


def view_clicked(b):
    """Click handler: show the rows of the file picked in `dropdown_view`."""
    view(dropdown_view.value)


# Disabled as long as there is nothing to view.
button_view = widgets.Button(
    icon='search',
    description=' View',
    tooltip='Click to View',
    button_style='primary',
    disabled=not sources,
    layout=widgets.Layout(width='200px'))
button_view.on_click(view_clicked)

# Saves a copy of the datastore; disabled while the datastore has no sources.
button_backup = widgets.Button(
    description=' Backup',
    button_style='',
    tooltip='Click to save current datastore to backup file',
    icon='download',
    layout={'width':'200px'},
    disabled = not sources)

def backup_clicked(b):
    """Copy the datastore to DNS_datastore_backup.csv and confirm in `out_view`."""
    df = pd.read_csv('DNS_datastore.csv', index_col=0)
    df.to_csv('DNS_datastore_backup.csv')
    # A backup now exists, so restoring it becomes possible. The reset button
    # was only enabled at start-up when a backup file was already present.
    button_reset.disabled = False
    # `out_view` starts hidden; reveal it or the confirmation is never seen.
    out_view.layout.visibility = 'visible'
    with out_view: print('Backup successful.')
button_backup.on_click(backup_clicked)

# Output area of the 'View Datastores' section; hidden until first used.
out_view = widgets.Output()
out_view.layout.visibility = 'hidden'

# Choice of which uploaded file to remove; a placeholder entry is offered
# while the datastore holds no sources.
if sources:
    _erase_options = ['ALL'] + sources
else:
    _erase_options = ['(the datastore is empty)']

dropdown_erase = widgets.Dropdown(
    description='Choose which file to remove:',
    options=_erase_options,
    layout=widgets.Layout(width='initial'),
    style={'description_width':'200px'})

# Disabled as long as there is nothing to remove.
button_erase = widgets.Button(
    icon='warning',
    description=' Remove Data',
    tooltip='Click to Erase',
    button_style='danger',
    disabled=not sources,
    layout=widgets.Layout(width='200px'))

def erase_clicked(b):
    """Remove the file picked in `dropdown_erase` (or 'ALL') from the datastore.

    After erasing, the stale tree images are deleted and the source dropdowns
    are refreshed. If data with more than one label remains, the model is
    retrained; the tree tab is redrawn last so that it shows the result of the
    retraining rather than the emptied folder. Progress and the outcome are
    reported through `label_erase`.
    """
    # Local import: the notebook does `import glob` (the module), so calling
    # `glob(...)` would otherwise depend on a star-import rebinding the name.
    from glob import glob

    label_erase.value = 'Loading...'
    erasefile = str(dropdown_erase.value)
    erase_datastores(erasefile)
    for file in glob('DecisionTrees/*'):
        os.remove(file)
    refresh_sources()

    if erasefile == 'ALL' or dropdown_erase.value=='(the datastore is empty)':
        refresh_treetabs()
        label_erase.value = 'All files have successfully been removed from the datastore.'
        return

    # `squeeze=True` was removed from read_csv in pandas 2.0.
    labels = pd.read_csv('DNS_datastore.csv', usecols=['Label'])['Label'].unique()
    if len(labels)==1:
        refresh_treetabs()
        # The original format string ('% data.'%s (...)) raised a NameError.
        label_erase.value = (
            erasefile+' is successfully erased from the datastore. '
            'The algorithm could not be retrained with only %s data.'
            % ('benign' if labels[0]==0 else 'malicious'))
    else:
        train_build(pd.read_csv('DNS_datastore.csv', index_col=0))
        refresh_treetabs()
        label_erase.value = erasefile+' is successfully erased from the datastore and the algorithm is retrained.'

button_erase.on_click(erase_clicked)

# Restores the datastore from the backup file; disabled while no backup exists.
button_reset = widgets.Button(
    description=' Reset to Backup',
    button_style='warning',
    tooltip='Click to Reset',
    icon='undo',
    layout={'width':'200px'},
    disabled = not os.path.exists('DNS_datastore_backup.csv'))

def reset_clicked(b):
    """Overwrite the datastore with the backup and retrain when possible.

    The stale tree images are removed, the backup is copied over the datastore
    and the source dropdowns are refreshed. The model is retrained only when
    the restored data has a 'Label' column with more than one distinct label;
    otherwise the reason is reported instead of letting the training call fail
    halfway through the reset. Status is reported through `label_erase`.
    """
    # Local import: the notebook does `import glob` (the module), so calling
    # `glob(...)` would otherwise depend on a star-import rebinding the name.
    from glob import glob

    if not os.path.exists('DNS_datastore_backup.csv'):
        label_erase.value = 'No Backup file was found.'
        return

    label_erase.value = 'Loading...'
    for file in glob('DecisionTrees/*'):
        os.remove(file)
    df = pd.read_csv('DNS_datastore_backup.csv', index_col=0)
    df.to_csv('DNS_datastore.csv')
    # Refresh the dropdowns right away so they match the restored data even
    # when retraining is skipped or fails.
    refresh_sources()
    label_erase.value = 'The datastore is successfully restored. Now trying to retrain the model...'

    if 'Label' in df.columns and df['Label'].nunique() > 1:
        train_build(df)
        refresh_treetabs()
        label_erase.value = 'Done.'
    else:
        refresh_treetabs()
        label_erase.value = ('The datastore is successfully restored, but the model needs '
                             'both benign and malicious data to be retrained.')
button_reset.on_click(reset_clicked)

# Status line of the 'Erase Datastores' section.
label_erase = widgets.Label(value='')
# 'Loading...' indicator driven by show_loading().
loading = widgets.Label()

# One VBox per accordion section of the first tab.
acc1_1 = widgets.VBox(children=[
    descr1_1,
    fc,
    m,
    widgets.HBox(children=[button_preview, button_upload]),
    loading,
    out,
])
acc1_2 = widgets.VBox(children=[
    descr1_2,
    widgets.HBox(children=[dropdown_view, button_view, button_backup]),
    out_view,
])
acc1_3 = widgets.VBox(children=[
    descr1_3,
    widgets.HBox(children=[dropdown_erase, button_erase, button_reset]),
    label_erase,
])

accordion1 = widgets.Accordion(children=[acc1_1, acc1_2, acc1_3])
for _idx, _heading in enumerate(['File Selection', 'View Datastores', 'Erase Datastores']):
    accordion1.set_title(_idx, _heading)

tab1 = widgets.VBox(children=[descr_tab1, accordion1])


#----------in the second tab-------------------------------------------

# Introductory text shown at the top of the second tab
# ('View Trees and Parameters'). The backslash continuations are inside the
# string literals, so the leading whitespace of the continued lines is part of
# the HTML values.
descr_tab2 = widgets.HTML(value = '<h3>View the constituent Decision Trees of the Random Forest model</h3>\
            <p>Random Forests work by bootstrap aggregation: the combination of multiple randomly created \
            Decision Trees is useful to limit overfitting as well as errors due to bias as compared to an \
            individual Decision Tree. However, individual trees are easier to understand; therefore all the \
            trees that make up the forest of the current model are displayed on this page. </p><br>\
            <p>Decision Trees are also useful to determine the importance of features for a machine learning algorithm; \
            these are displayed in a bar chart below. Lastly, the performance of the model is visualised in a \
            confusion matrix, and the evaluation metrics that are derived from it can be found in the last dropdown. </p><br>')

# Help text for the 'View Decision Trees' accordion section.
descr2_1 = widgets.HTML(value=\
            'Browse through these tabs to see all the individual trees that make up the Random Forest model. \
            together they vote on how each packet should be classified.<br><br>')
# Help text for the 'View Feature Importances' accordion section.
descr2_2 = widgets.HTML(value=\
            'Impurity is a measure for the probability of classifying a datapoint incorrectly after splitting on that feature. \
            Here, the feature importances are computed as the mean and standard deviation of accumulation of the \
            impurity decrease within each tree.<br><br>')
# Help text for the 'View Confusion Matrix' accordion section.
descr2_3 = widgets.HTML(value=\
            'The confusion matrix shows how accurate the current model is, as verified with a part of data from the datastore. \
            A high number of true positives (TP) and true negatives (TN) as opposed to false positives (FP) and false negatives (FN) indicates a \
            good accuracy. All evaluation metrics are calculated from this matrix, as displayed in the table.<br><br>')

def retrain_clicked(b):
    """Click handler: retrain from the datastore, then redraw the tab-2 outputs."""
    retrain()
    refresh_treetabs()


button_retrain = widgets.Button(
    icon='sitemap',
    description=' Retrain Model',
    tooltip='Click to Retrain',
    button_style='primary',
    layout=widgets.Layout(width='200px'))
button_retrain.on_click(retrain_clicked)

# 'Loading...' indicator driven by show_loading2().
loading2 = widgets.Label()

# Output areas filled by refresh_treetabs(): tree tabs, feature importances,
# confusion matrix and evaluation metrics.
out_treetabs, out_fi, out_cm, out_em = (widgets.Output() for _ in range(4))

# One VBox per accordion section of the second tab.
acc2_1 = widgets.VBox(children=[descr2_1, out_treetabs])
acc2_2 = widgets.VBox(children=[descr2_2, out_fi])
acc2_3 = widgets.VBox(children=[descr2_3, widgets.HBox(children=[out_cm, out_em])])

accordion2 = widgets.Accordion(children=[acc2_1, acc2_2, acc2_3])
for _idx, _heading in enumerate(
        ['View Decision Trees', 'View Feature Importances', 'View Confusion Matrix']):
    accordion2.set_title(_idx, _heading)

tab2 = widgets.VBox(children=[descr_tab2, button_retrain, loading2, accordion2])


#----------in the third tab----------------------------------------------

# Introductory text shown at the top of the third tab ('Analyse Traffic').
# The backslash continuation is inside the string literal, so the leading
# whitespace of the continued line is part of the HTML value.
descr_tab3 = widgets.HTML(value = '<h3>Analyse unlabelled data with the current Random Forest model</h3>\
            <p>On this page, network traffic can be tested on the presence of anomalies according to the current model.</p><br>')

# Help text for the 'File Selection' accordion section.
descr3_1 = widgets.HTML(value=\
    'Select a <code>.pcap</code> file below to test whether it contains suspicious DNS traffic.<br><br>')
# Empty placeholder for the 'View Results' accordion section.
descr3_2 = widgets.HTML(value=\
    '')
# Help text for the 'View Single Decision Tree Path' accordion section.
descr3_3 = widgets.HTML(value=\
    'Select a single packet from your file and a single tree from the forest to see the logic behind how the packet is classified. <br><br>')

# File picker for the capture to analyse: starts in ./data, lists *.pcap files.
fc_test = FileChooser('data')
fc_test.filter_pattern = '*.pcap'
fc_test.use_dir_icons = True


def test_clicked(b):
    """Click handler: classify the capture selected in `fc_test`."""
    test_data()


button_test = widgets.Button(
    icon='upload',
    description=' Upload & Test',
    tooltip='Click to Upload',
    button_style='primary',
    layout=widgets.Layout(width='200px'))
button_test.on_click(test_clicked)

# 'Loading...' indicator driven by show_loading3().
loading3 = widgets.Label()

# Parse feedback / preview of the tested file; hidden until first used.
out_test = widgets.Output(layout={'border': '1px solid black', 'padding': '7px', 'visibility':'hidden'})
# Classification results written by test_data().
out_results = widgets.Output()

# Both dropdowns start empty and are populated by test_data().
# NOTE(review): a width of '50' has no CSS unit and is probably ignored by the
# browser; left unchanged because the intended unit is not visible here.
dropdown_packet = widgets.Dropdown(
    options = ([]),
    description='Choose which packet to analyse:',
    style={'description_width':'200px'}, layout={'width':'50'})

dropdown_tree = widgets.Dropdown(
    options = ([]),
    description='Choose which tree to follow:',
    style={'description_width':'200px'}, layout={'width':'50'})

# Enabled by test_data() once malicious packets were found. The tooltip used to
# read 'Click to Upload', copied from the upload button.
button_singletree = widgets.Button(
    description=' Display',
    button_style='primary',
    tooltip='Click to Display',
    icon='sitemap',
    layout={'width':'200px'},
    disabled=True)
# Extra left margin, supplied by the CSS class injected on the next line.
button_singletree.add_class("left-space")
display(HTML("<style>.left-space {margin-left: 100px;}</style>"))

def singletree_clicked(b):
    """Show how one packet of the tested file travels through one tree.

    Uses the packet and tree picked in `dropdown_packet` / `dropdown_tree`,
    the feature frame `x_test` stored by test_data() and the model loaded from
    trained_rf.joblib. The result replaces the previous content of
    `out_singletree`.
    """
    with out_singletree:
        # Start from a clean output so repeated clicks do not pile up.
        clear_output()
        if dropdown_packet.options:
            # NOTE: joblib.load unpickles the file; only load model files this
            # application wrote itself, never files from an untrusted source.
            rf = load('trained_rf.joblib')
            print('\nThe path of the single observation with index {} through tree {} is:'.format(dropdown_packet.value, dropdown_tree.value))
            # NOTE(review): the dropdown holds index labels of the tested frame
            # while .iloc is positional; the two agree only for a 0..n-1 index.
            # The nested display(display(...)) also rendered a stray `None`.
            display(singletree(rf, features, x_test.iloc[dropdown_packet.value,:].values, dropdown_tree.value))
# The handler belongs to the 'Display' button. It was registered on
# `button_test`, which left the Display button without any effect.
button_singletree.on_click(singletree_clicked)

# Output area of the 'View Single Decision Tree Path' section.
out_singletree = widgets.Output()

# One VBox per accordion section of the third tab.
acc3_1 = widgets.VBox(children=[descr3_1, fc_test, button_test, loading3, out_test])
acc3_2 = widgets.VBox(children=[descr3_2, out_results])
acc3_3 = widgets.VBox(children=[
    descr3_3,
    widgets.HBox(children=[dropdown_packet, dropdown_tree, button_singletree]),
    out_singletree,
])

accordion3 = widgets.Accordion(children=[acc3_1, acc3_2, acc3_3])
for _idx, _heading in enumerate(
        ['File Selection', 'View Results', 'View Single Decision Tree Path']):
    accordion3.set_title(_idx, _heading)

tab3 = widgets.VBox(children=[descr_tab3, accordion3])
    
#----------assembling the three tabs of the interface--------------------

# Put the three pages into the top-level tab widget and name them.
tab.children = [tab1, tab2, tab3]
for _idx, _heading in enumerate(
        ['Update Model', 'View Trees and Parameters', 'Analyse Traffic']):
    tab.set_title(_idx, _heading)

# Render the header row (logos + title) followed by the interface itself,
# then fill the second tab from whatever is currently on disk.
display(widgets.HBox(children=[logo_babcock, logo_cranfield, title]), tab)
refresh_treetabs()

HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff\xee\x00\x0eAdo…

Tab(children=(VBox(children=(HTML(value='<h3>Train the machine learning algorithm with new data</h3><p>In this…