In [1]:
from pcap_feature_parser import read_pcap
import pandas as pd
import numpy as np
import math
import wordninja

In [None]:
%%time
# Capture files to parse, mapped to the class label stored in the 'Label'
# column (0 for the benign capture, 1 for every tunneling capture).
file_list = {'benign plain.pcap':0,
             'dns2tcp tunneling.pcap':1,
             'dnscapy tunneling.pcap':1,
             'iodine tunneling.pcap':1,
             'dnscat2_dns_tunneling_1hr.pcap':1,
             'tuns_c_00000_20180330104021.pcap':1}

# Parse each capture, tag it with its label, and concatenate ONCE at the end.
# Growing `df` with pd.concat inside the loop re-copies all accumulated rows
# on every pass and starts from an empty frame, which pandas deprecates.
frames = []
for file, label in file_list.items():
    df_append = read_pcap('data/pcaps/'+file)
    df_append['Label'] = int(label)  # scalar is broadcast to every row
    frames.append(df_append)

# ignore_index=True yields a fresh 0..n-1 index, replacing the former
# reset_index() + drop(columns='index') pair.
df = pd.concat(frames, axis=0, ignore_index=True)

In [None]:
#df.insert(1, 'Label', [int(label) for i in df.index])

def splitquery(x):
    """Split a query-name string into ``(subdomain, domain)`` parts.

    The string is split on '.' from the right into at most four pieces. The
    piece after the final '.' is always discarded, the two pieces before it
    are re-joined as the second return value, and anything left of those is
    the subdomain ('' when there are fewer than four pieces).

    NOTE(review): presumably ``x`` is the ``str()`` of a bytes value with a
    trailing dot, e.g. ``"b'www.example.com.'"`` -- confirm against read_pcap.

    Parameters
    ----------
    x : str
        Query name to split.

    Returns
    -------
    tuple of (str, str)
        ``(SD, TLD)`` with a leading ``b'`` prefix removed from each.
    """
    def _strip_bytes_prefix(text):
        # str.lstrip("b'") removes ANY leading run of the characters 'b' and
        # "'", so "b'bbs" became "s" and "bbc.co" became "c.co". Remove the
        # literal two-character prefix once instead.
        return text[2:] if text.startswith("b'") else text

    groups = x.rsplit('.',3)
    SD = _strip_bytes_prefix(groups[0]) if len(groups)==4 else ''
    TLD = _strip_bytes_prefix('.'.join(groups[-3:-1]))
    return SD, TLD

# One (subdomain, domain) pair per row, written into two new columns.
split_names = list(map(splitquery, df['Query Name']))
df[['Subdomain','Top Level Domain']] = split_names

display(df)

In [None]:
# Columns to keep, in the order they should appear in the frame.
cols = [
    'Label',
    'Source Path',
    'Session',
    'Protocol',
    'Query Name',
    'Subdomain',
    'Top Level Domain',
    'Payload',
    'RR type',
]
df = df[cols]
display(df)

In [None]:
%%time

def shannon(x):
    """Return the Shannon entropy, in bits per character, of ``str(x)``.

    Each distinct character contributes ``p * log2(p)``, where ``p`` is its
    relative frequency in the string; the negated sum is returned. An empty
    string has no characters to sum over and yields 0.
    """
    text = str(x)
    size = len(text)
    terms = []
    for symbol in set(text):
        prob = float(text.count(symbol)) / size
        terms.append(prob * math.log(prob, 2))
    return -sum(terms)

# Add one '<column> Entropy' feature for each of the three text columns.
for source in ('Query Name', 'Subdomain', 'Payload'):
    df[source + ' Entropy'] = list(map(shannon, df[source]))


In [None]:
%%time
# Length of the longest piece wordninja splits each subdomain into
# (0 when the split yields no pieces).
df['longest word Subdomain'] = [
    max(map(len, wordninja.split(name)), default=0)
    for name in df['Subdomain']
]

In [None]:
%%time
# Length of the longest piece wordninja splits each payload into
# (0 when the split yields no pieces).
df['longest word Payload'] = [
    max(map(len, wordninja.split(name)), default=0)
    for name in df['Payload']
]

In [None]:

    def count(x):
        """Count the characters of ``str(x)`` by class.

        Every character falls into exactly one class, tested in this order:
        uppercase, lowercase, digit, and otherwise "special".

        Returns ``[total, uppercase, lowercase, numeric, special]``.
        """
        text = str(x)
        tally = {'upper': 0, 'lower': 0, 'digit': 0, 'other': 0}
        for ch in text:
            if ch.isupper():
                kind = 'upper'
            elif ch.islower():
                kind = 'lower'
            elif ch.isdigit():
                kind = 'digit'
            else:
                kind = 'other'
            tally[kind] += 1
        return [len(text), tally['upper'], tally['lower'], tally['digit'], tally['other']]

    # Character-class counts for each text column. The 'Query Name' columns
    # carry no prefix; the other two are prefixed with their source column.
    count_labels = ['Character Count',
                    'Uppercase Count',
                    'Lowercase Count',
                    'Numeric Count',
                    'Special Char Count']
    for source, prefix in (('Query Name', ''),
                           ('Subdomain', 'Subdomain '),
                           ('Payload', 'Payload ')):
        df[[prefix + label for label in count_labels]] = [count(value) for value in df[source]]

    # Occurrences of individual punctuation marks in the query name.
    for column, mark in (('Dashes Count', '-'),
                         ('Slashes Count', '/'),
                         ('Periods Count', '.'),
                         ('Equal Signs Count', '=')):
        df[column] = [str(name).count(mark) for name in df['Query Name']]

In [None]:
%%time

def ratio(x):
    """Return the share of each character class in ``str(x)``.

    Characters are classified in this order: uppercase, lowercase, digit,
    otherwise "special". The result is
    ``[uppercase, lowercase, numeric, special]`` as fractions of the length.

    For an empty string the function returns the tuple ``(0, 0, 0, 0)``
    instead of a list, so nothing is divided by zero.
    """
    text = str(x)
    size = len(text)
    if size == 0:
        return 0, 0, 0, 0

    upper = lower = digit = other = 0
    for ch in text:
        if ch.isupper():
            upper += 1
        elif ch.islower():
            lower += 1
        elif ch.isdigit():
            digit += 1
        else:
            other += 1
    return [upper / size, lower / size, digit / size, other / size]

#df[['Uppercase Ratio', 'Lowercase Ratio', 'Numeric Ratio', 'Special Char Ratio']] = [ratio(name) for name in df['Query Name']]


In [None]:
%%time
# Character-class ratios of the payload, one new column per class.
payload_ratio_columns = ['Payload Uppercase Ratio',
                         'Payload Lowercase Ratio',
                         'Payload Numeric Ratio',
                         'Payload Special Char Ratio']
df[payload_ratio_columns] = list(map(ratio, df['Payload']))

In [None]:
%%time
# Character-class ratios of the subdomain, one new column per class.
subdomain_ratio_columns = ['Subdomain Uppercase Ratio',
                           'Subdomain Lowercase Ratio',
                           'Subdomain Numeric Ratio',
                           'Subdomain Special Char Ratio']
df[subdomain_ratio_columns] = list(map(ratio, df['Subdomain']))

In [None]:
%%time
# Number of rows sharing each row's 'Session' value, broadcast back per row.
session_sizes = df.groupby('Session')['Session'].transform('count')
df['Packets in Session'] = session_sizes

In [None]:
%%time

# Mean string length within each session, broadcast back to every row.
for source in ('Query Name', 'Subdomain'):
    df['Avg ' + source + ' Length (Session)'] = (
        df.groupby('Session')[source]
          .transform(lambda values: np.mean(values.str.len()))
    )


In [None]:
%%time
# Mean payload string length within each session, broadcast back to every row.
payload_by_session = df.groupby('Session')['Payload']
df['Avg Payload Length (Session)'] = payload_by_session.transform(
    lambda values: np.mean(values.str.len())
)

In [None]:
from pprint import pprint
# List every column now present in the frame.
pprint(df.columns.tolist())

In [None]:
# Columns from position 8 onward -- the same slice later used as the feature list.
print(df.columns[8:].tolist())

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters for the forest used to rank feature importances.
# scikit-learn interprets float values of min_samples_leaf / min_samples_split
# as fractions of the number of samples.
test_grid = {'bootstrap': True,
             'n_estimators': 1000,
             'max_features': 10,
             'max_depth': 10,
             'min_samples_leaf': 0.1,
             'min_samples_split': 0.2,
             # Fixed seed: bootstrap sampling and per-split feature sub-sampling
             # are random, so without it the importances (and the features
             # selected from them) change on every run.
             'random_state': 42,
             }

rf = RandomForestClassifier()
rf.set_params(**test_grid)

In [None]:
import matplotlib.pyplot as plt

def plot_fi(rf, df, features):
    """Fit ``rf`` on the given feature columns and plot their importances.

    Parameters
    ----------
    rf : estimator
        Ensemble exposing ``fit``, ``feature_importances_`` and
        ``estimators_``. It is (re)fitted in place by this call.
    df : pandas.DataFrame
        Frame holding the feature columns and a 'Label' column used as target.
    features : list-like of column labels
        Columns of ``df`` to train on; missing values are replaced by 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature, with columns 'fi' (the ensemble's importance) and
        'std' (sample standard deviation of the per-tree importances), sorted
        by 'fi' in descending order.
    """
    inputs = np.array(df.loc[:, features].fillna(0))
    targets = np.array(df['Label'])
    rf.fit(inputs, targets)

    per_tree = [tree.feature_importances_ for tree in rf.estimators_]
    importances = pd.DataFrame(
        {'fi': rf.feature_importances_,
         'std': np.std(per_tree, axis=0, ddof=1)},
        index=features,
    )
    importances = importances.sort_values('fi', ascending=False)

    # Bar chart of the importances with the per-tree spread as error bars.
    fig, ax = plt.subplots(figsize=(15,10))
    error_style = {'capsize': 5, 'lw': 0.5, 'capthick': 0.5}
    importances.plot.bar(yerr='std', ax=ax, error_kw=error_style,
                         color='#0099C4', ecolor='navy')

    ax.set_title("Feature Importances")
    ax.set_ylabel("Mean decrease in impurity")
    ax.get_legend().remove()
    fig.tight_layout()

    return importances

In [None]:
# Every column from position 8 onward is treated as a model feature.
features = df.columns[8:].tolist()
allfeatures = plot_fi(rf=rf, df=df, features=features)

In [None]:
from sklearn.feature_selection import SelectFromModel
# Build the training arrays here: `x` and `y` previously existed only as
# locals inside plot_fi, so this cell raised NameError on a fresh kernel.
# The preparation mirrors plot_fi (missing values -> 0, 'Label' as target).
x = np.array(df.loc[:, features].fillna(0))
y = np.array(df['Label'])

sel = SelectFromModel(rf)
sel.fit(x, y)

# Names of the feature columns the selector kept.
sel_feat = df.loc[:,features].columns[(sel.get_support())]
print(list(sel_feat))

In [None]:

# Refit and plot again, restricted to the features the selector kept.
selectedfeatures = plot_fi(rf=rf, df=df, features=sel_feat)