In [3]:
import pandas as pd
import numpy as np
from database.db_api import db_api
db = db_api()

Setting database to ip: 127.0.0.1, port 3306


In [4]:
def show_msus(msus):
    for msu in msus:
        print "Id: {msu.msu}, Type: {msu.msu_type.name}".format(msu = msu)
msus = db.get_items('msus')
show_msus(msus)

Connected to database
Id: 10, Type: socket
Id: 11, Type: socket
Id: 12, Type: socket
Id: 13, Type: http
Id: 14, Type: http
Id: 15, Type: http
Id: 16, Type: read
Id: 17, Type: read
Id: 18, Type: read
Id: 19, Type: read
Id: 20, Type: read
Id: 21, Type: read
Id: 22, Type: read
Id: 23, Type: read
Id: 24, Type: read
Id: 25, Type: regex
Id: 26, Type: regex
Id: 27, Type: regex
Id: 28, Type: regex
Id: 29, Type: regex
Id: 30, Type: regex
Id: 31, Type: regex_route
Id: 32, Type: regex_route
Id: 33, Type: regex_route
Id: 34, Type: write
Id: 35, Type: write
Id: 36, Type: write


In [5]:
import json
EVENT_TYPES = 'client', 'tls_reneg', 'slowloris', 'redos'
def read_event_log(event_log_file,
                   columns=('time_', 'status_', 'name_')):
    event_log = json.load(open(event_log_file))
    events = pd.DataFrame(event_log, columns=columns)
    events = events.rename(columns={c: c.strip("_") for c in columns})
    events = events[events.name.isin(EVENT_TYPES)]
    events.time *= 1e9
    return events

def get_traffic_times(events):
    start_events = events[events.status == 'start'].reset_index()
    end_events = events[events.status == 'end'].reset_index()
    
    timeframes = pd.concat((start_events.name, 
                            start_events.time.rename("start"), 
                            end_events.time.rename('end')),
                           axis=1)
    return timeframes
    
events = read_event_log('/home/iped/datasets/all_attacks_noramp/event_log.json')
traffic = get_traffic_times(events)
print traffic

        name         start           end
0     client  1.513708e+18  1.513709e+18
1  slowloris  1.513708e+18  1.513709e+18
2  tls_reneg  1.513709e+18  1.513709e+18
3      redos  1.513709e+18  1.513709e+18


In [6]:
def get_msu(msu_id):
    return [msu for msu in msus if msu.msu == msu_id][0]

def get_msu_type(msu_id):
    return get_msu(msu_id).msu_type.name

def sample_msu_ids_by_type():
    types = set([msu.msu_type.name for msu in msus])
    ids = [[m.msu for m in msus if m.msu_type.name == t][0] for t in types]
    return ids

def res_round(x, resolution):
    return ((x / resolution).round() * resolution).astype(x.dtype)

def get_msu_df(msu):
    print "Getting dataframe for msu {msu.msu} ({msu.msu_type.name})".format(msu=msu)
    df = db.get_msu_full_df(msu)
    trange = (max(df.TIME) - min(df.TIME)) * 1e-9
    spp = round(trange / len(df), 2)
    print "\n # Points: {}\n Time range: {} seconds\n Points / second: ~{}".format(
        len(df), trange, 1/spp)

    rounded_time = res_round(df.TIME, spp * 1e9)

    epoch = ((rounded_time - min(rounded_time)) / (spp * 1e9)).astype(int)
    df = df.assign(msu_id = msu.msu)
    df = df.assign(epoch = epoch)
    df = df.set_index('TIME')
    df = df.assign(TIME = df.index)
    return df    

AGGREGATE_STAT_TYPES = (
 "ERROR_COUNT",
 "MSU_USER_TIME", "MSU_KERNEL_TIME",
 "MSU_MINOR_FAULTS", "MSU_MAJOR_FAULTS",
 "MSU_VOL_CTX_SW","MSU_INVOL_CTX_SW",
)

def make_numeric(df):
    for x in df.columns:
        df[x] = pd.to_numeric(df[x])

def convert_to_rates(msu, types=AGGREGATE_STAT_TYPES):
    for t in types:
        df[t] = df[t].diff() / df.TIME.diff()

MSU_IDS = sample_msu_ids_by_type()
        
df = pd.concat([get_msu_df(get_msu(i)) for i in MSU_IDS])
make_numeric(df)
convert_to_rates(df)

Getting dataframe for msu 25 (regex)
Connected to database
Connected to database

 # Points: 2728
 Time range: 1363.83775323 seconds
 Points / second: ~2.0
Getting dataframe for msu 13 (http)
Connected to database
Connected to database

 # Points: 2728
 Time range: 1363.83775323 seconds
 Points / second: ~2.0
Getting dataframe for msu 10 (socket)
Connected to database
Connected to database

 # Points: 2728
 Time range: 1363.83775323 seconds
 Points / second: ~2.0
Getting dataframe for msu 16 (read)
Connected to database
Connected to database

 # Points: 2727
 Time range: 1363.83775323 seconds
 Points / second: ~2.0
Getting dataframe for msu 34 (write)
Connected to database
Connected to database

 # Points: 2728
 Time range: 1363.83775323 seconds
 Points / second: ~2.0
Getting dataframe for msu 31 (regex_route)
Connected to database
Connected to database

 # Points: 2728
 Time range: 1363.83775323 seconds
 Points / second: ~2.0


In [7]:
def label_with_traffic(df, traffic):
    mask = (df.TIME > min(traffic.start)) & (df.TIME < max(traffic.end))
    
    print "Filtering out {} pre/post-traffic points".format(np.sum(~mask))
    print "{} points remaining".format(np.sum(mask))
    df = df[mask]
    print "Range of time: {} seconds ".format((max(df.TIME) - min(df.TIME))*1e-9)
    
    ordered = traffic.sort_values(by='start')
    df = df.assign(traffic='')
    for i, attack in ordered.iterrows():
        df.loc[(df.TIME > attack.start) & (df.TIME < attack.end), 'traffic'] = attack['name']
    return df

df = label_with_traffic(df, traffic)



Filtering out 270 pre/post-traffic points
16097 points remaining
Range of time: 1341.32959249 seconds 


In [8]:
%matplotlib notebook
import matplotlib.pyplot as plt 
def plot_metrics(df, metrics=AGGREGATE_STAT_TYPES, edgecolor='k', markersize=5):
    fig, ax = plt.subplots(len(metrics), len(metrics), figsize=(12,12))
    traffics = pd.unique(df.traffic.sort_values(ascending=False))
    for j, m1 in enumerate(metrics):
        for i, m2 in enumerate(metrics):
            for traffic in traffics:
                group = df[df.traffic == traffic]
                if i != j:
                    ax[i][j].plot(group[m1], group[m2], 'o', markeredgecolor=edgecolor, 
                                  markersize=markersize,label=traffic)
                else:
                    ax[i][j].hist(group[m1], 25, label=traffic)
            ax[i][j].ticklabel_format(style='sci', axis='both', scilimits=(2,2))
            if i == len(metrics)-1:
                ax[i][j].set_xlabel(m1, fontsize=8)
            else:
                ax[i][j].set_xticks([])
            if j == 0:
                ax[i][j].set_ylabel(m2, fontsize=8)
            elif j != 1 or i != 0:
                ax[i][j].set_yticks([])
                    
            if i == 0 and j == 0:
                ax[i][j].legend()
                
    plt.savefig("metrics.png")
PLOTTED_STAT_TYPES = (
 "QUEUE_LEN", "ERROR_COUNT", "NUM_STATES",
 "MSU_USER_TIME", "MSU_MINOR_FAULTS", 
 "MSU_INVOL_CTX_SW",
)        
            
plot_metrics(df, PLOTTED_STAT_TYPES)

<IPython.core.display.Javascript object>

In [10]:
%matplotlib notebook
from mpl_toolkits.mplot3d import Axes3D

def plot_metrics_3d(df, m1, m2, m3, title=None):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    traffics = pd.unique(df.traffic.sort_values(ascending=False))
    for traffic in traffics:
        group = df[df.traffic == traffic]
        ax.scatter(group[m1], group[m2], group[m3], label=traffic)
    ax.legend()
    ax.set_xlabel(m1)
    ax.set_ylabel(m2)
    ax.set_zlabel(m3)
    if title is not None:
        fig.title(title)
        
metrics = ['MSU_USER_TIME', 'NUM_STATES', 'MSU_INVOL_CTX_SW']
        
#for msu_id in df.msu_id.unique():
#    plot_metrics_3d(df[df.msu_id == msu_id], *metrics)
    
plot_metrics_3d(df, *metrics)

<IPython.core.display.Javascript object>

In [11]:
%matplotlib notebook
import matplotlib.pyplot as plt

def combine_msus(df, metrics=None):
    msu_ids = np.unique(df.msu_id)
    cols = df.columns
    if metrics is None:
        stats = cols[(cols != 'TIME') & (cols != 'msu_id') & (cols != 'epoch')]
    else:
        stats = list(metrics) + ['traffic']
            
    tuples = [(mid, st) for mid in msu_ids for st in stats]
    new_cols = ['%s_%d' % (st, mid) for mid, st in tuples]
    new_df = pd.DataFrame(index=df.epoch.unique(), columns=new_cols)
    
    for (mid, st), col in zip(tuples, new_cols):
        msu_vals = df[df.msu_id == mid]
        vals = msu_vals.set_index('epoch')[[st]]
        new_df[[col]] = vals
    
    return new_df[~new_df.isnull().any(1)]

def get_msu_columns(df, msu):
    cols = []
    for c in df.columns:
        if c.split('_')[-1] == str(msu):
            cols.append(c)
    return cols
    
def iter_msu_columns(df):
    cols = df.columns
    msus = set(c.split('_')[-1] for c in cols)
    for msu in msus:
        yield get_msu_columns(df, msu)
    
    
def separate_msus(df):
    cols = df.columns
    msu_ids = set(c.split('_')[-1] for c in cols)
    stats = set('_'.join(c.split('_')[:-1]) for c in cols)
    new_cols = list(stats)
    new_cols.append('msu_id')
    new_df = pd.DataFrame(columns = new_cols)
    
    for mid in msu_ids:
        msu_df = pd.DataFrame(columns = new_cols)
        for stat in stats:
            orig_name = '%s_%s' % (stat, mid)
            stat_msu_df = df[[orig_name]]
            stat_msu_df = stat_msu_df.rename(columns={orig_name: stat})
            stat_msu_df = stat_msu_df.assign(msu_id = mid)
            msu_df[[stat, 'msu_id']] = stat_msu_df
        
        new_df = pd.concat([new_df, msu_df])
    return new_df
    
keepers = ('NUM_STATES', "MSU_USER_TIME")
    
combo_df = combine_msus(df)


In [41]:
%matplotlib notebook
from sklearn.cluster import DBSCAN
from sklearn import preprocessing

def prep_for_dbscan(X, to_keep=None):
    cols = X.columns
    x1 = X[[c for c in cols if not c.startswith('traffic')]]
    if to_keep is not None:
        x1 = X[[c for c in cols if any(c.startswith(k) for k in to_keep)]]
        
    min_max_scaler = preprocessing.StandardScaler()

    for field in x1.columns:
        x1 = x1.assign(**{field: min_max_scaler.fit_transform(x1[field].values.reshape(-1,1))})
    return x1

def datacols(X):
    return [c for c in X.columns if c != 'msu_id' and c != 'traffic']

def do_dbscan_cluster(X_in, to_consider=None, to_plot=('MSU_USER_TIME', 'NUM_STATES'),
                      min_samples=200, eps=.2, do_plot=True, separate_attacks=False):
    X = prep_for_dbscan(X_in, to_consider)
    db = DBSCAN(min_samples=min_samples, eps=eps).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    
    # Black removed and is used for noise instead.
    unique_labels = list(set(labels))
    colors = ['mediumblue'] + [tuple(plt.cm.Spectral(each))
              for each in np.linspace(0, 1, len(unique_labels))][1:]
    
    if do_plot:
        if separate_attacks:
            fig, ax = plt.subplots(2, 2, figsize=(9, 9))
            ax = [ax[0, 0], ax[0,1], ax[1,0], ax[1,1]]
        else:
            fig, ax = plt.subplots(1, 2, figsize=(9, 5))
    
    separated = separate_msus(X_in)
    traffic_types = np.unique(separated.traffic)
    
    client_as_class = []
    client_as_noise = 0
    attack_as_class = []
    attack_as_noise = 0
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'maroon'

        class_member_mask = (labels == k)
        
        if separate_attacks:
            client_types = range(len(traffic_types))
        else:
            client_types = (0, 1)
        
        for is_client in client_types:
            xy = separate_msus(X_in[class_member_mask & core_samples_mask])
            
            if separate_attacks:
                xy = xy[xy.traffic == traffic_types[is_client]]
            else:
                xy = xy[(xy.traffic == 'client') ^ (not is_client)]
           
            if do_plot:
                ax[is_client].plot(xy[to_plot[0]], xy[to_plot[1]], 'o', markerfacecolor=col,
                                   markeredgecolor='k', markersize=14)
            
            if is_client and k != -1:
                client_as_class.append(len(xy))
            elif is_client and k == -1:
                client_as_noise += len(xy)
            elif not is_client and k != -1:
                attack_as_class.append(len(xy))
            elif not is_client and k == -1:
                attack_as_noise += len(xy)

            xy = separate_msus(X_in[class_member_mask & ~core_samples_mask])
            
            if separate_attacks:
                xy = xy[xy.traffic == traffic_types[is_client]]
            else:
                xy = xy[(xy.traffic == 'client') ^ (not is_client)]
           
            cols = datacols(xy)
            if do_plot:
                ax[is_client].plot(xy[to_plot[0]], xy[to_plot[1]], 'o', markerfacecolor=col,
                                   markeredgecolor='k', markersize=6)

            if is_client and k != -1:
                client_as_class[-1] += len(xy)
            elif is_client and k == -1:
                client_as_noise += len(xy)
            elif not is_client and k != -1:
                attack_as_class[-1] += len(xy)
            elif not is_client and k == -1:
                attack_as_noise += len(xy)
                
    if do_plot:
        if separate_attacks:
            for i, t in enumerate(traffic_types):
                ax[i].set_title(t)
        else:
            ax[0].set_title('Attack traffic')
            ax[1].set_title('Normal traffic')
            for i in range(2):
                ax[i].set_xlabel(to_plot[0])
                ax[i].set_ylabel(to_plot[1])
    x_sep = separate_msus(X_in)
    total_client = (x_sep.traffic == 'client').sum()
    total_attack = (x_sep.traffic != 'client').sum()
    
    if do_plot:
        for c in client_as_class:
            print "Client class: {}/{}: {:.1f}%".format(c, total_client, 100.0 * c / total_client)
        for c in attack_as_class:
            print "Attack class: {}/{}: {:.1f}%".format(c, total_attack, 100.0 * c / total_attack)
        print "Client noise: {}/{}: {:.1f}%".format(client_as_noise, total_client, 100.0 * client_as_noise / total_client)
        print "Attack noise: {}/{}: {:.1f}%".format(attack_as_noise, total_attack, 100.0 * attack_as_noise / total_attack)
        
    client_cats = client_as_class + [client_as_noise]
    attack_cats = attack_as_class + [attack_as_noise]
    
    true_client = 0
    true_attack = 0
    
    for c, a in zip(client_cats, attack_cats):
        if float(c)/total_client > float(a)/total_attack:
            true_client += c
        else:
            true_attack += a
   
    print float(true_client) / total_client, 1 - (float(true_attack) / total_attack)
    return float(true_client) / total_client, 1 - (float(true_attack) / total_attack)

In [44]:
import itertools
def search_dbscan_space(df, stat_types, eps, samples, min_stat_types=1):
    st_combos = []
    for i in range(min_stat_types, len(stat_types)+1):
        st_combos.extend(itertools.combinations(stat_types, i))
    params = [(st, ep, sa) for st in st_combos for ep in eps for sa in samples]
    output = []
    print "Searching space with %d coordinates" % len(params)
    for st,  ep, sa in params:
        res = do_dbscan_cluster(df, st, eps=ep, min_samples = sa, do_plot = False)
        output.append(((st, ep, sa), res[0] - res[1]))
        print st, ep, sa, res[0] - res[1]
    return output
    
outputs = search_dbscan_space(combo_df, (
     "QUEUE_LEN", "ERROR_COUNT", "NUM_STATES",
     "MSU_USER_TIME", "MSU_MINOR_FAULTS", 
     "MSU_INVOL_CTX_SW",
    ), (1, 2), (20,), 4)    


Searching space with 44 coordinates
0.85141903172 0.0105616898704
('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_USER_TIME') 1 20 0.840857341849
0.923205342237 0.895823331733
('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_USER_TIME') 2 20 0.027382010504
0.921535893155 0.0124819971195
('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_MINOR_FAULTS') 1 20 0.909053896036
0.954924874791 0.934709553529
('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_MINOR_FAULTS') 2 20 0.0202153212628
0.69449081803 0.011521843495
('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_INVOL_CTX_SW') 1 20 0.682968974535
0.926544240401 0.884301488238
('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_INVOL_CTX_SW') 2 20 0.0422427521625
0.771285475793 0.452232357177
('QUEUE_LEN', 'ERROR_COUNT', 'MSU_USER_TIME', 'MSU_MINOR_FAULTS') 1 20 0.319053118616
0.948247078464 0.866058569371
('QUEUE_LEN', 'ERROR_COUNT', 'MSU_USER_TIME', 'MSU_MINOR_FAULTS') 2 20 0.082188509093
0.410684474124 0.284205472876
('QUEUE_LEN', 'ERROR_COUNT',

In [45]:
from pprint import pprint
for x in sorted(outputs, key=lambda o: o[1], reverse=True):
    print x[0], x[1]

(('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_MINOR_FAULTS'), 1, 20) 0.909053896036
(('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_USER_TIME'), 1, 20) 0.840857341849
(('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_INVOL_CTX_SW'), 1, 20) 0.682968974535
(('ERROR_COUNT', 'NUM_STATES', 'MSU_USER_TIME', 'MSU_MINOR_FAULTS'), 1, 20) 0.646972029715
(('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_USER_TIME', 'MSU_MINOR_FAULTS'), 1, 20) 0.645302580633
(('QUEUE_LEN', 'NUM_STATES', 'MSU_USER_TIME', 'MSU_MINOR_FAULTS'), 1, 20) 0.643382273384
(('ERROR_COUNT', 'NUM_STATES', 'MSU_MINOR_FAULTS', 'MSU_INVOL_CTX_SW'), 1, 20) 0.406320503768
(('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_MINOR_FAULTS', 'MSU_INVOL_CTX_SW'), 1, 20) 0.404651054686
(('QUEUE_LEN', 'NUM_STATES', 'MSU_MINOR_FAULTS', 'MSU_INVOL_CTX_SW'), 1, 20) 0.402730747437
(('QUEUE_LEN', 'ERROR_COUNT', 'MSU_USER_TIME', 'MSU_MINOR_FAULTS'), 1, 20) 0.319053118616
(('QUEUE_LEN', 'NUM_STATES', 'MSU_USER_TIME', 'MSU_INVOL_CTX_SW'), 1, 20) 0

In [396]:
erange = (.7, 1.6)
es = np.arange(erange[0], erange[1], .1)
ss = np.arange(10, 100, 10)
srange = (min(ss), max(ss))

vals = np.array([[0.0 for _ in es] for _ in ss])

print vals.size

for i, e in enumerate(es):
    for j, s in enumerate(ss):
         x = do_dbscan_cluster(combo_df, ('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_MINOR_FAULTS', 'MSU_INVOL_CTX_SW'), eps=e, min_samples=s, do_plot=False)
         vals[j,i] = x[0] - x[1]
         print '-----', e, s, vals[j,i]

81
0.648333333333 0.145524174246
----- 0.7 10 0.502809159087
0.575 0.112015318334
----- 0.7 20 0.462984681666
0.53 0.091910004787
----- 0.7 30 0.438089995213
0.483333333333 0.0866443274294
----- 0.7 40 0.396689005904
0.481666666667 0.082336045955
----- 0.7 50 0.399330620712
0.471666666667 0.0785064624222
----- 0.7 60 0.393160204244
0.45 0.0732407850646
----- 0.7 70 0.376759214935
0.435 0.0670177118238
----- 0.7 80 0.367982288176
0.426666666667 0.0636668262326
----- 0.7 90 0.362999840434
0.728333333333 0.214456677836
----- 0.8 10 0.513876655497
0.683333333333 0.166586883676
----- 0.8 20 0.516746449657
0.638333333333 0.145524174246
----- 0.8 30 0.492809159087
0.605 0.130205840115
----- 0.8 40 0.474794159885
0.595 0.12589755864
----- 0.8 50 0.46910244136
0.573333333333 0.11680229775
----- 0.8 60 0.456531035583
0.555 0.109143130685
----- 0.8 70 0.445856869315
0.531666666667 0.103398755385
----- 0.8 80 0.428267911281
0.528333333333 0.101483963619
----- 0.8 90 0.426849369714
0.811666666667 0

plt.imshow(vals, extent=(erange[0], erange[1], srange[0], srange[1]), aspect = float(erange[1] - erange[0])/ (srange[1] - srange[0]))
args = np.array([[(e, s) for e in es] for s in ss])
i = np.argmax(vals)


In [429]:
erange = (3.0, 5.0)
es = np.arange(erange[0], erange[1], .25)
ss = np.arange(75, 125, 10)
srange = (min(ss), max(ss))

vals = np.array([[0.0 for _ in es] for _ in ss])
args = np.array([[(0.0,0.0) for _ in es] for _ in ss])
print vals.size

for i, e in enumerate(es):
    for j, s in enumerate(ss):
         x = do_dbscan_cluster(combo_df, eps=e, min_samples=s, do_plot=False)
         vals[j,i] = x[0] - x[1]
         args[j, i] = (e,s)
         print '-----', e, s, vals[j,i]

40
0.448333333333 0.0670177118238
----- 3.0 75 0.381315621509
0.445 0.0622307324079
----- 3.0 85 0.382769267592
0.426666666667 0.0598372426999
----- 3.0 95 0.366829423967
0.416666666667 0.0588798468167
----- 3.0 105 0.35778681985
0.4 0.0564863571087
----- 3.0 115 0.343513642891
0.566666666667 0.129727142173
----- 3.25 75 0.436939524493
0.561666666667 0.125418860699
----- 3.25 85 0.436247805968
0.555 0.111536620393
----- 3.25 95 0.443463379607
0.546666666667 0.102441359502
----- 3.25 105 0.444225307165
0.541666666667 0.0995691718526
----- 3.25 115 0.442097494814
0.65 0.186692197224
----- 3.5 75 0.463307802776
0.643333333333 0.1737673528
----- 3.5 85 0.469565980533
0.64 0.165150789852
----- 3.5 95 0.474849210148
0.63 0.162757300144
----- 3.5 105 0.467242699856
0.625 0.155098133078
----- 3.5 115 0.469901866922
0.756666666667 0.248922929631
----- 3.75 75 0.507743737035
0.745 0.237912876975
----- 3.75 85 0.507087123025
0.728333333333 0.229296314026
----- 3.75 95 0.499037019307
0.725 0.22450

In [430]:
%matplotlib notebook
plt.imshow(vals, extent=(erange[0], erange[1], srange[0], srange[1]), aspect = float(erange[1] - erange[0])/ (srange[1] - srange[0]))
i = np.argmax(vals)


<IPython.core.display.Javascript object>

In [25]:
do_dbscan_cluster(combo_df, eps=4.5, min_samples=20)

<IPython.core.display.Javascript object>

Client class: 2820/3594: 78.5%
Client class: 0/3594: 0.0%
Attack class: 7836/12498: 62.7%
Attack class: 312/12498: 2.5%
Client noise: 774/3594: 21.5%
Attack noise: 4350/12498: 34.8%
0.784641068447 0.626980316851


(0.7846410684474123, 0.6269803168506961)

In [66]:
x = do_dbscan_cluster(combo_df, ('QUEUE_LEN', 'ERROR_COUNT', 'NUM_STATES', 'MSU_MINOR_FAULTS'), eps=1.2, min_samples=200, separate_attacks=False)
print x[0] - x[1]

<IPython.core.display.Javascript object>

Client class: 3282/3594: 91.3%
Client class: 0/3594: 0.0%
Client class: 0/3594: 0.0%
Attack class: 162/12498: 1.3%
Attack class: 7326/12498: 58.6%
Attack class: 1746/12498: 14.0%
Client noise: 312/3594: 8.7%
Attack noise: 3264/12498: 26.1%
0.913188647746 0.0129620739318
0.900226573814


In [50]:
 do_dbscan_cluster(combo_df, ('NUM_STATES', 'MSU_USER_TIME', 'MSU_MINOR_FAULTS', 'MSU_INVOL_CTX_SW'), eps=1.8, min_samples=20, separate_attacks=False)

<IPython.core.display.Javascript object>

Client class: 1626/3594: 45.2%
Client class: 306/3594: 8.5%
Client class: 0/3594: 0.0%
Attack class: 4050/12498: 32.4%
Attack class: 6/12498: 0.0%
Attack class: 2538/12498: 20.3%
Client noise: 1662/3594: 46.2%
Attack noise: 5904/12498: 47.2%
0.537562604341 0.324531925108


(0.5375626043405676, 0.3245319251080173)