In [2]:
import cudf
import numpy as np
import pandas as pd
import math
from cuml.ensemble import RandomForestClassifier as cuRFC
from cuml.svm import SVC
from cuml import KMeans

%matplotlib inline
%load_ext autotime

# Read data

In [3]:
training_df = cudf.read_csv('../data/features/UNSW_NB15_training-set.csv')
testing_df  = cudf.read_csv('../data/features/UNSW_NB15_testing-set.csv')

### ADD IDENTIFIER
training_df['test'] = 0
testing_df['test']  = 1

df = cudf.concat([training_df, testing_df])
df = df.drop(['id']).reset_index().rename({'index': 'id'})

df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label,test
0,0,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,0,0,0,1,1,0,Normal,0,0
1,1,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,2,0,0,0,1,6,0,Normal,0,0
2,2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,3,0,0,0,2,6,0,Normal,0,0
3,3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,3,1,1,0,2,1,0,Normal,0,0
4,4,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,40,0,0,0,2,39,0,Normal,0,0


time: 1.24 s


In [4]:
to_keep = [
      'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes'
    , 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss'
    , 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat'
    , 'ct_srv_src', 'ct_state_ttl', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm'
    , 'ct_srv_dst', 'attack_cat', 'label', 'test']

df = df[to_keep].reset_index(drop=True).reset_index()
df = df.rename({'index': 'id'})
df['id'] = df['id'].astype('int32')
df['attack_cat'] = df['attack_cat'].fillna('Normal')
df['attack_cat'] = df['attack_cat'].str.replace(' ', '')

time: 26.5 ms


In [5]:
df.describe()

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,sttl,dttl,sload,dload,...,synack,ackdat,ct_srv_src,ct_state_ttl,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_srv_dst,label,test
count,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,...,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0,257673.0
mean,128836.0,1.246715,19.777144,18.514703,8572.952,14387.29,180.000931,84.754957,70608690.0,658214.3,...,0.023652,0.022386,9.383176,1.324978,5.238271,4.032677,8.322964,9.121049,0.639077,0.319521
std,74383.932294,5.974305,135.947152,111.985965,173773.9,146199.3,102.488268,112.762131,185731300.0,2412372.0,...,0.053856,0.045771,10.829706,0.9923,8.160822,5.831515,11.120754,10.874752,0.480269,0.466292
min,0.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,64418.0,8e-06,2.0,0.0,114.0,0.0,62.0,0.0,12318.0,0.0,...,0.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0
50%,128836.0,0.004285,4.0,2.0,528.0,178.0,254.0,29.0,743942.3,1747.441,...,0.0,0.0,5.0,1.0,1.0,1.0,3.0,4.0,1.0,0.0
75%,193254.0,0.685777,12.0,10.0,1362.0,1064.0,254.0,252.0,80000000.0,22105.38,...,0.036842,0.044665,12.0,2.0,4.0,3.0,8.0,11.0,1.0,1.0
max,257672.0,59.999989,10646.0,11018.0,14355770.0,14657530.0,255.0,254.0,5988000000.0,22422730.0,...,3.226788,2.928778,63.0,6.0,59.0,46.0,65.0,62.0,1.0,1.0


time: 348 ms


# Encode variables

## Continuous to bins

In [6]:
bin_num = 10

quantiles = cudf.DataFrame()

for i in range(1, bin_num):
    quant = i / 10
    print(f'Quantile: {quant}')
    
    quantiles['q' + str(i)] = df.quantile(q=quant)

quantiles = quantiles.dropna().reset_index()
quantiles_list = quantiles.to_pandas().to_dict('records')

Quantile: 0.1
Quantile: 0.2
Quantile: 0.3
Quantile: 0.4
Quantile: 0.5
Quantile: 0.6
Quantile: 0.7
Quantile: 0.8
Quantile: 0.9
time: 2.4 s


In [7]:
quantiles = []

for q in quantiles_list:
    if q['index'] not in ['id', 'label', 'test']:
        elements = list(q.items())
        quants = sorted(list(set([0.0] + [e[1] for e in elements[1:]])))
        quantiles.append((elements[0][1], quants))

time: 1.69 ms


In [8]:
def encode_quantiles(df, quantiles):
    temp_df = cudf.DataFrame()
    
    for q in quantiles:
        col_name = q[0]
        bins = q[1]

        if df[col_name].dtype == 'int64':
            bins = [int(e) for e in bins]

        temp_df[col_name + '_bin'] = df[col_name].digitize(np.array(bins))
    return temp_df

carry_over_cols = ['proto', 'service', 'state', 'attack_cat', 'label', 'test']
df_binned = encode_quantiles(df, quantiles)

for col in carry_over_cols:
    df_binned[col] = df[col]

df_binned = df_binned.reset_index().rename({'index': 'id'})
# del df
df_binned.head()

Unnamed: 0,id,dur_bin,spkts_bin,dpkts_bin,sbytes_bin,dbytes_bin,sttl_bin,dttl_bin,sload_bin,dload_bin,...,ct_src_dport_ltm_bin,ct_dst_sport_ltm_bin,ct_dst_src_ltm_bin,ct_srv_dst_bin,proto,service,state,attack_cat,label,test
0,0,6,3,2,4,1,3,3,3,3,...,2,2,2,2,tcp,-,FIN,Normal,0,0
1,1,8,4,6,5,6,3,3,2,5,...,2,2,3,6,tcp,-,FIN,Normal,0,0
2,2,10,3,5,4,6,3,3,1,4,...,2,2,4,6,tcp,-,FIN,Normal,0,0
3,3,10,4,4,5,4,3,3,1,2,...,2,2,4,2,tcp,ftp,FIN,Normal,0,0
4,4,7,4,3,5,2,4,3,2,3,...,3,2,8,9,tcp,-,FIN,Normal,0,0


time: 155 ms


## Categorical to index

In [9]:
import cuml

encoders = {}
cols_to_encode = ['proto', 'service', 'state']

for col in cols_to_encode:
    le = cuml.preprocessing.LabelEncoder()
    df_binned[col] = df_binned[col].astype('category')
    df_binned[col + '_enc'] = le.fit_transform(df_binned[col])
    
    encoders[col] = le
    
cols_reordered = [c for c in df_binned.columns if c not in cols_to_encode + ['label']] + ['label']
df_binned = df_binned[cols_reordered]
df_binned.columns

Index(['id', 'dur_bin', 'spkts_bin', 'dpkts_bin', 'sbytes_bin', 'dbytes_bin',
       'sttl_bin', 'dttl_bin', 'sload_bin', 'dload_bin', 'sloss_bin',
       'dloss_bin', 'sjit_bin', 'djit_bin', 'swin_bin', 'stcpb_bin',
       'dtcpb_bin', 'dwin_bin', 'tcprtt_bin', 'synack_bin', 'ackdat_bin',
       'ct_srv_src_bin', 'ct_state_ttl_bin', 'ct_src_dport_ltm_bin',
       'ct_dst_sport_ltm_bin', 'ct_dst_src_ltm_bin', 'ct_srv_dst_bin',
       'attack_cat', 'test', 'proto_enc', 'service_enc', 'state_enc', 'label'],
      dtype='object')

time: 191 ms


# Explode to COO format

In [10]:
### Attack_cat encoding
attack_cat = [
      (0, 'Normal')
    , (1, 'Reconnaissance')
    , (2, 'Shellcode')
    , (3, 'Analysis')
    , (4, 'Backdoor')
    , (5, 'DoS')
    , (6, 'Exploits')
    , (7, 'Generic')
    , (8, 'Fuzzers')
    , (9, 'Worms')
]

attack_cat = {
      'attack_id':  [e[0] for e in attack_cat]
    , 'attack_cat': [e[1] for e in attack_cat]
}

attack_categories = cudf.DataFrame(attack_cat)
df_binned = df_binned.merge(attack_categories, on='attack_cat')
df_binned = df_binned.drop('attack_cat')
df_binned.head()

Unnamed: 0,id,dur_bin,spkts_bin,dpkts_bin,sbytes_bin,dbytes_bin,sttl_bin,dttl_bin,sload_bin,dload_bin,...,ct_src_dport_ltm_bin,ct_dst_sport_ltm_bin,ct_dst_src_ltm_bin,ct_srv_dst_bin,test,proto_enc,service_enc,state_enc,label,attack_id
0,48384,10,5,4,7,3,4,3,1,1,...,2,2,4,4,0,113,0,4,1,8
1,48385,9,4,4,6,3,4,3,1,2,...,3,2,4,4,0,113,0,4,1,8
2,48386,10,5,4,8,3,4,3,1,1,...,3,3,5,5,0,113,0,4,1,8
3,48387,7,4,3,5,2,4,3,2,3,...,4,3,5,5,0,113,0,4,1,8
4,48388,10,4,4,6,4,4,3,1,2,...,3,2,3,3,0,113,3,4,1,8


time: 146 ms


In [11]:
training_df = df_binned.query('test == 0')
training_df = training_df.drop(['test', 'id']).reset_index().rename({'index': 'id'})

# training_df.head()
df_exploded = cudf.melt(training_df, id_vars=['id', 'label', 'attack_id']).sort_values('id').reset_index(drop=True)
df_exploded['variable'] = df_exploded['variable'].astype('str')
df_exploded['value'] = df_exploded['value'].astype('str')
df_exploded['feature'] = df_exploded['variable'] + '=' + df_exploded['value']
df_exploded.drop(['variable', 'value'])

feature_encoding = df_exploded['feature'].unique().reset_index()

time: 438 ms


In [12]:
len(df_binned), len(training_df)

(257673, 175341)

time: 1.45 ms


In [13]:
feature_encoding['index'] = feature_encoding['index'].astype('int16')
feature_encoding = feature_encoding.rename({'index': 'feature_enc'})
feature_encoding.head()

Unnamed: 0,feature_enc,feature
0,0,ackdat_bin=1
1,1,ackdat_bin=2
2,2,ackdat_bin=3
3,3,ackdat_bin=4
4,4,ackdat_bin=5


time: 16.1 ms


In [14]:
df_exploded = df_exploded.merge(feature_encoding, on='feature')[['id', 'feature_enc', 'label', 'attack_id']]
df_exploded.head()

Unnamed: 0,id,feature_enc,label,attack_id
0,592,87,0,0
1,592,294,0,0
2,592,289,0,0
3,592,0,0,0
4,592,34,0,0


time: 273 ms


# Finding frequent patterns

In [15]:
def mine_patterns(df_coo, df_binned, min_attack_rate=0.75, min_feature_count=200, max_iter=-1):
    def return_rank(frate, counts, rank):
        for i, (f, c) in enumerate(zip(frate, counts)):
            rank[i] = math.log(float(c)) * f
            
    features = df_coo['feature_enc'].unique().to_frame()
    df_coo = df_coo.merge(features, on='feature_enc')

    #### FIND FREQUENT ITEMS
    freq_items = df_coo.groupby(['feature_enc']).agg({'id': 'count', 'label': 'sum'}).reset_index()
    freq_items['attack_rate'] = freq_items['label'] / freq_items['id']

    freq_items = freq_items.apply_rows(
        return_rank
        , incols = {'label': 'counts', 'attack_rate': 'frate'}
        , outcols = {'rank': np.float64}
        , kwargs = {}
    ) 

    freq_items = freq_items.sort_values('rank', ascending=False)
    freq_items.head(5)

    freq_items = freq_items.query(f'attack_rate >= {min_attack_rate} and label > {min_feature_count}')

    if max_iter == -1:
        max_iter = len(freq_items)

    devices_checked = {}

    patterns = []
    stats    = []

    features_ordered = list(freq_items['feature_enc'].to_array())

    for i in range(max_iter):
        feature = features_ordered[i]

        ### get all the ids
        ids = df_coo.query('feature_enc == @feature')['id'].unique().to_frame()
        h = ids.hash_columns(['id']).sum()

        if h not in devices_checked:    
            count_ids = len(ids)
            devices_checked[h] = 1

            ### OUTPUT PATTERN
            all_features = df_coo.merge(ids, on='id').groupby('feature_enc').agg({'label': 'count'}).query('label == @count_ids').reset_index()
            all_features['pattern_id'] = i

            patterns.append(all_features[['pattern_id', 'feature_enc']])

            ### OUTPUT STATS
            ids = ids.merge(df_binned, on='id')
            ids['pattern_id'] = i
            ids = ids.groupby('pattern_id').agg({'id': 'count', 'label': 'sum'})
            ids = ids.rename({'id': 'packet_count', 'label': 'attack_count'})
            ids['attack_rate'] = ids['attack_count'] / ids['packet_count']
            ids['feature_cnt'] = all_features['feature_enc'].count()
            ids = ids.reset_index()
            stats.append(ids)
            
    patterns = cudf.concat(patterns).merge(features, on='feature_enc').sort_values(by='pattern_id')
    stats = cudf.concat(stats)
    
    return patterns, stats

time: 10.6 ms


# Testing

In [16]:
testing_binned = df_binned.query('test == 1')

time: 233 ms


## Pattern encoding

In [17]:
def encode_patterns(binned_df, patterns_to_encode):
    subset = binned_df[['id', 'label']]
    subset['pred'] = 0

    for i, pattern in enumerate(patterns_to_encode):
        q = ' and '.join([' == '.join(e) for e in pattern])
        temp = binned_df.query(q)['id'].to_frame()
        temp['enc'] = 1

        subset = subset.merge(temp, on=['id'], how='left')
        subset = subset.rename({'enc': 'col_' + str(i)})
        subset['col_' + str(i)] = subset['col_' + str(i)].fillna(0)
        subset['col_' + str(i)] = subset['col_' + str(i)].astype('float32')
        subset['pred'] = subset['pred'] + subset['col_'+str(i)]
        

    subset['pred'] = subset['pred'] > 0
    subset['pred'] = subset['pred'].astype('int8')
    
    return subset

time: 1.86 ms


# End-2-end pattern mining

In [18]:
def calculate_metrics(df):
    ttl = df['id'].sum()
    accuracy = df.query('(label == 0 and pred == 0) or (label == 1 and pred == 1)')['id'].sum() / ttl
    
    fp = df.query('(label == 0 and pred == 1)')['id'].sum()
    tn = df.query('(label == 0 and pred == 0)')['id'].sum()
    fn = df.query('(label == 1 and pred == 0)')['id'].sum()
    tp = df.query('(label == 1 and pred == 1)')['id'].sum()
    
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)
    
    return fp, tn, fn, tp, accuracy, fpr, fnr, (fpr + fnr) / 2

def encode_patterns(binned_df, patterns_to_encode):
    subset = binned_df[['id', 'label']]
    subset['pred'] = 0

    for i, pattern in enumerate(patterns_to_encode):
        q = ' and '.join([' == '.join(e) for e in pattern])
        temp = binned_df.query(q)['id'].to_frame()
        temp['enc'] = 1

        subset = subset.merge(temp, on=['id'], how='left')
        subset = subset.rename({'enc': 'col_' + str(i)})
        subset['col_' + str(i)] = subset['col_' + str(i)].fillna(0)
        subset['col_' + str(i)] = subset['col_' + str(i)].astype('float32')
        subset['pred'] = subset['pred'] + subset['col_'+str(i)]
        

    subset['pred'] = subset['pred'] > 0
    subset['pred'] = subset['pred'].astype('int8')
    
    return subset

def e2e_pattern_mining(label, df, training_df, testing_df, feature_encoding, results, min_attack_rate=.5, min_feature_count=200):
    print('[{0}] Mining features...'.format(label))
    patterns, stats = mine_patterns(df, training_df, min_attack_rate=min_attack_rate, min_feature_count=min_feature_count)
    patterns['label'] = label
    stats['label'] = label
    
    print('[{0}] Encoding patterns...'.format(label))
    patterns_rec = patterns.merge(feature_encoding, on='feature_enc')
    
    patterns_rec['col_name'] = patterns_rec['feature'].str.split('=')[0]
    patterns_rec['col_val']  = patterns_rec['feature'].str.split('=')[1]
    patterns_rec = patterns_rec.sort_values(by='pattern_id').reset_index(drop=True)
    patterns_expl = patterns_rec[['pattern_id', 'col_name', 'col_val']].to_pandas().to_records()

    patterns_to_encode = []

    curr = 0
    temp = []
    for i in patterns_expl:
        if curr == i[1]:
            temp.append((i[2], i[3]))

        else:
            patterns_to_encode.append(temp)
            temp = [(i[2], i[3])]
        curr = i[1]

    encoded_testing = encode_patterns(testing_df, patterns_to_encode)
    encoded_testing['id'] = encoded_testing['id'].astype('float32')

    #### Encoding training dataset
    encoded_df = encode_patterns(training_df, patterns_to_encode)
    X = encoded_df[['col_' + str(i) for i in range(len(patterns_to_encode))]]
    y = encoded_df['label'].astype('int32')
        
    #### Binned data
    X_binned = training_df[[col for col in training_df.columns if col not in ['id', 'label', 'attack_id']]]

    for col in X_binned:
        X_binned[col] = X_binned[col].astype('float32')

    y_binned = training_df['label'].astype('float32')
    
    X_testing_binned = testing_df[[col for col in testing_df.columns if col not in ['id', 'test', 'label', 'attack_id']]]
    for col in X_testing_binned:
        X_testing_binned[col] = X_testing_binned[col].astype('float32')
        
    y_testing_binned = testing_df[['id', 'label']]

   
    print('[{0}] Building models'.format(label))
    #####################################
    #### SIMPLE ENCODING
    #####################################
    pred = encoded_testing.groupby(['label', 'pred']).agg({'id': 'count'}).reset_index()
    simple_results = calculate_metrics(pred)
    
    row_to_insert = {
        'label': label
        , 'model': 'simple'
        , 'fp': simple_results[0]
        , 'tn': simple_results[1]
        , 'fn': simple_results[2]
        , 'tp': simple_results[3]
        , 'accuracy': simple_results[4]
        , 'fpr': simple_results[5]
        , 'fnr': simple_results[6]
        , 'far': simple_results[7]
    }
    
    simple_results_df = cudf.DataFrame(row_to_insert)
    
    #####################################
    #### RANDOM FOREST
    #####################################
    print('[{0}]\tRandom Forest'.format(label))
    rf = cuRFC(max_features=1.0, n_estimators=100, n_bins=10)
    rf.fit(X, y)

    encoded_testing = encoded_testing.drop('pred')
    encoded_testing['pred'] = rf.predict(encoded_testing[['col_' + str(i) for i in range(len(patterns_to_encode))]])
    rf_conf = encoded_testing.groupby(['label', 'pred']).agg({'id': 'count'}).reset_index()
    rf_results = calculate_metrics(rf_conf)
    
    row_to_insert = {
        'label': label
        , 'model': 'random forest'
        , 'fp': rf_results[0]
        , 'tn': rf_results[1]
        , 'fn': rf_results[2]
        , 'tp': rf_results[3]
        , 'accuracy': rf_results[4]
        , 'fpr': rf_results[5]
        , 'fnr': rf_results[6]
        , 'far': rf_results[7]
    }

    rf_results_df = cudf.DataFrame(row_to_insert)
    
    #####################################
    #### SUPPORT VECTOR MACHINES
    #####################################
    print('[{0}]\tSupport Vector Machines'.format(label))
    svc = SVC()
    svc.fit(X, y)

    encoded_testing = encoded_testing#.drop('pred')
    encoded_testing['pred'] = svc.predict(encoded_testing[['col_' + str(i) for i in range(len(patterns_to_encode))]])
    svc_conf = encoded_testing.groupby(['label', 'pred']).agg({'id': 'count'}).reset_index()
    svc_results = calculate_metrics(svc_conf)
    
    row_to_insert = {
        'label': label
        , 'model': 'SVC'
        , 'fp': svc_results[0]
        , 'tn': svc_results[1]
        , 'fn': svc_results[2]
        , 'tp': svc_results[3]
        , 'accuracy': svc_results[4]
        , 'fpr': svc_results[5]
        , 'fnr': svc_results[6]
        , 'far': svc_results[7]
    }
    svc_results_df = cudf.DataFrame(row_to_insert)
    results = cudf.concat([simple_results_df, rf_results_df, svc_results_df])

    return patterns, stats, results

time: 8.34 ms


In [52]:
results  = cudf.DataFrame()
patterns = cudf.DataFrame()
stats    = cudf.DataFrame()

#### overall label
p, s, r = e2e_pattern_mining('0/1 Overall Label', df_exploded, training_df, testing_binned, feature_encoding, results, min_attack_rate=.85)
results  = cudf.concat([results,  r])
patterns = cudf.concat([patterns, p])
stats    = cudf.concat([stats,    s])

results

[0/1 Overall Label] Mining features...
[0/1 Overall Label] Encoding patterns...
[0/1 Overall Label] Building models
[0/1 Overall Label]	Random Forest
[0/1 Overall Label]	Support Vector Machines


Unnamed: 0,label,model,fp,tn,fn,tp,accuracy,fpr,fnr,far
0,0/1 Overall Label,simple,17760,19240,5787,39545,0.713999,0.48,0.127658,0.303829
0,0/1 Overall Label,random forest,6809,30191,6995,38337,0.832337,0.184027,0.154306,0.169167
0,0/1 Overall Label,SVC,6759,30241,6693,38639,0.836613,0.182676,0.147644,0.16516


time: 17 s


In [53]:
def redefine_label(attack_id, new_label, attack_select):
    for i, ai in enumerate(attack_id):
        new_label[i] = 1 if ai == attack_select else 0
        

for i, ac in list(zip(attack_cat['attack_id'], attack_cat['attack_cat']))[1:]:     
    training_df_new = training_df.apply_rows(
        redefine_label
        , incols  = ['attack_id']
        , outcols = {'new_label': np.int32}
        , kwargs  = {'attack_select': i}
    )

    testing_binned_new = testing_binned.apply_rows(
        redefine_label
        , incols  = ['attack_id']
        , outcols = {'new_label': np.int32}
        , kwargs  = {'attack_select': i}
    )

    training_df_new = training_df_new.drop('label').rename({'new_label': 'label'})
    testing_binned_new = testing_binned_new.drop('label').rename({'new_label': 'label'})

    p, s, r = e2e_pattern_mining(ac + ' (full)', df_exploded, training_df_new, testing_binned_new, feature_encoding, results, min_attack_rate=.85)
    
    results  = cudf.concat([results,  r])
    patterns = cudf.concat([patterns, p])
    stats    = cudf.concat([stats,    s])
    
# results

[Reconnaissance (full)] Mining features...
[Reconnaissance (full)] Encoding patterns...
[Reconnaissance (full)] Building models
[Reconnaissance (full)]	Random Forest
[Reconnaissance (full)]	Support Vector Machines
[Shellcode (full)] Mining features...
[Shellcode (full)] Encoding patterns...
[Shellcode (full)] Building models
[Shellcode (full)]	Random Forest
[Shellcode (full)]	Support Vector Machines
[Analysis (full)] Mining features...
[Analysis (full)] Encoding patterns...
[Analysis (full)] Building models
[Analysis (full)]	Random Forest
[Analysis (full)]	Support Vector Machines
[Backdoor (full)] Mining features...
[Backdoor (full)] Encoding patterns...
[Backdoor (full)] Building models
[Backdoor (full)]	Random Forest
[Backdoor (full)]	Support Vector Machines
[DoS (full)] Mining features...
[DoS (full)] Encoding patterns...
[DoS (full)] Building models
[DoS (full)]	Random Forest
[DoS (full)]	Support Vector Machines
[Exploits (full)] Mining features...
[Exploits (full)] Encoding patter

In [54]:
for i, ac in list(zip(attack_cat['attack_id'], attack_cat['attack_cat']))[1:]:     
#     print(i, ac)
    training_df_new = training_df.query('attack_id == 0 or attack_id == @i')
    testing_binned_new = testing_binned.query('attack_id == 0 or attack_id == @i')

    p, s, r = e2e_pattern_mining(ac + ' (limited)', df_exploded, training_df_new, testing_binned_new, feature_encoding, results, min_attack_rate=.85)
    
    results  = cudf.concat([results,  r])
    patterns = cudf.concat([patterns, p])
    stats    = cudf.concat([stats,    s])
    
results

[Reconnaissance (limited)] Mining features...
[Reconnaissance (limited)] Encoding patterns...
[Reconnaissance (limited)] Building models
[Reconnaissance (limited)]	Random Forest
[Reconnaissance (limited)]	Support Vector Machines
[Shellcode (limited)] Mining features...
[Shellcode (limited)] Encoding patterns...
[Shellcode (limited)] Building models
[Shellcode (limited)]	Random Forest
[Shellcode (limited)]	Support Vector Machines
[Analysis (limited)] Mining features...
[Analysis (limited)] Encoding patterns...
[Analysis (limited)] Building models
[Analysis (limited)]	Random Forest
[Analysis (limited)]	Support Vector Machines
[Backdoor (limited)] Mining features...
[Backdoor (limited)] Encoding patterns...
[Backdoor (limited)] Building models
[Backdoor (limited)]	Random Forest
[Backdoor (limited)]	Support Vector Machines
[DoS (limited)] Mining features...
[DoS (limited)] Encoding patterns...
[DoS (limited)] Building models
[DoS (limited)]	Random Forest
[DoS (limited)]	Support Vector Mach

Unnamed: 0,label,model,fp,tn,fn,tp,accuracy,fpr,fnr,far
0,0/1 Overall Label,simple,17760,19240,5787,39545,0.713999,0.48,0.127658,0.303829
0,0/1 Overall Label,random forest,6809,30191,6995,38337,0.832337,0.184027,0.154306,0.169167
0,0/1 Overall Label,SVC,6759,30241,6693,38639,0.836613,0.182676,0.147644,0.16516
0,Reconnaissance (full),simple,54367,24469,558,2938,0.332884,0.689621,0.159611,0.424616
0,Reconnaissance (full),random forest,204,78632,2761,735,0.963987,0.002588,0.78976,0.396174
0,Reconnaissance (full),SVC,197,78639,2811,685,0.963465,0.002499,0.804062,0.40328
0,Shellcode (full),simple,56988,24966,61,317,0.307086,0.695366,0.161376,0.428371
0,Shellcode (full),random forest,180,81774,365,13,0.99338,0.002196,0.965608,0.483902
0,Shellcode (full),SVC,0,81954,378,0,0.995409,0.0,1.0,0.5
0,Analysis (full),simple,56669,24986,41,636,0.311203,0.694005,0.060561,0.377283


time: 1min 22s


## Results -- simple encoding

In [50]:
def to_list(x):
    return list(x)

def ranges(bin_no, bins):
    if bin_no == len(bins):
        return '>{0:f}'.format(bins[-1])
    else:
        return '<{0:,f}, {1:,f})'.format(bins[bin_no-1], bins[bin_no])

quantile_bins = pd.DataFrame(quantiles, columns=['feat', 'bins'])
patterns_rec = patterns.merge(feature_encoding, on='feature_enc')

patterns_host = patterns_rec[['pattern_id', 'feature', 'label']].to_pandas()
patterns_host['feat']    = patterns_host.apply(lambda row: row['feature'].split('='), axis = 1)
patterns_host['bin']     = patterns_host.apply(lambda row: row['feat'][1], axis = 1)
patterns_host['feat']    = patterns_host.apply(lambda row: row['feat'][0][:-4], axis = 1)
patterns_host            = patterns_host.merge(quantile_bins, on=['feat'])
patterns_host['ranges']  = patterns_host.apply(lambda row: ranges(int(row['bin']), row['bins']), axis=1)
patterns_host['feature'] = patterns_host['feat'] + '=' + patterns_host['ranges']
patterns_host            = patterns_host[['label', 'pattern_id', 'feature']].sort_values(by='pattern_id')

patterns_host_agg               = patterns_host.groupby(['label', 'feature']).agg({'pattern_id': to_list}).reset_index()
patterns_host_agg['pattern_id'] = patterns_host_agg.apply(lambda row: ','.join([str(e) for e in row['pattern_id']]), axis = 1)#
patterns_host_agg               = patterns_host_agg.groupby(['label', 'pattern_id']).agg({'feature': to_list}).reset_index()

# patterns_host_agg.to_dict('records')

# quantile_bins

time: 976 ms


In [55]:
stats


Unnamed: 0,pattern_id,packet_count,attack_count,attack_rate,feature_cnt,label,index
0,0,82275,76560,0.930538,15,0/1 Overall Label,
0,1,84290,77626,0.920940,11,0/1 Overall Label,
0,2,84372,77626,0.920045,9,0/1 Overall Label,
0,3,85195,77807,0.913281,1,0/1 Overall Label,
0,4,89080,80801,0.907061,1,0/1 Overall Label,
0,5,42004,39510,0.940625,8,0/1 Overall Label,
0,6,88187,77873,0.883044,1,0/1 Overall Label,
0,7,22427,22261,0.992598,19,0/1 Overall Label,
0,8,23114,22790,0.985983,15,0/1 Overall Label,
0,9,23031,22564,0.979723,14,0/1 Overall Label,


time: 151 ms


## Logistic Regression model

In [32]:
reg = LogisticRegression(fit_intercept=False, C=0.1)
reg.fit(X, y)

encoded_testing = encoded_testing.drop('pred')
encoded_testing['pred'] = reg.predict(encoded_testing[['col_' + str(i) for i in range(len(patterns_to_encode))]])
results = encoded_testing.groupby(['label', 'pred']).agg({'id': 'count'}).reset_index()
results
calculate_metrics(results)

(0.7139994169946072, 0.30382908320832963)

time: 772 ms


## Support Vector Machines

In [33]:
reg = SVC()
reg.fit(X, y)

encoded_testing = encoded_testing.drop('pred')
encoded_testing['pred'] = reg.predict(encoded_testing[['col_' + str(i) for i in range(len(patterns_to_encode))]])
results = encoded_testing.groupby(['label', 'pred']).agg({'id': 'count'}).reset_index()
results
calculate_metrics(results)

(0.8366127386678327, 0.16515986201501953)

time: 5.77 s


# Binned features

In [44]:
# training_df.head()
X = training_df[[col for col in training_df.columns if col not in ['id', 'label', 'attack_id']]]

for col in X:
    X[col] = X[col].astype('float32')

y = training_df['label'].astype('float32')
X_testing = testing_binned[[col for col in testing_binned.columns if col not in ['id', 'test', 'label', 'attack_id']]]
y_testing = testing_binned[['id', 'label']]

reg = SVC()
reg.fit(X, y)

SVC(handle=<cuml.common.handle.Handle object at 0x7f6b004e16c0>, C=1, kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=0.001, cache_size=200.0, max_iter=-1, nochange_steps=1000, verbose=False)

time: 2.24 s


In [46]:
X.columns

Index(['dur_bin', 'spkts_bin', 'dpkts_bin', 'sbytes_bin', 'dbytes_bin',
       'sttl_bin', 'dttl_bin', 'sload_bin', 'dload_bin', 'sloss_bin',
       'dloss_bin', 'sjit_bin', 'djit_bin', 'swin_bin', 'stcpb_bin',
       'dtcpb_bin', 'dwin_bin', 'tcprtt_bin', 'synack_bin', 'ackdat_bin',
       'ct_srv_src_bin', 'ct_state_ttl_bin', 'ct_src_dport_ltm_bin',
       'ct_dst_sport_ltm_bin', 'ct_dst_src_ltm_bin', 'ct_srv_dst_bin',
       'proto_enc', 'service_enc', 'state_enc'],
      dtype='object')

time: 3.46 ms


In [52]:


for col in X_testing:
    X_testing[col] = X_testing[col].astype('float32')
    
# X_testing.columns
y_testing['pred'] = reg.predict(X_testing)
results = y_testing.groupby(['label', 'pred']).agg({'id': 'count'}).reset_index()
results
calculate_metrics(results)

(0.8577466841568284, 0.15639750215228906)

time: 139 ms


In [71]:
encoded_df.groupby('col_0').agg({'id': 'count', 'label': 'sum'})

Unnamed: 0_level_0,id,label
col_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2259581,251932.0
1.0,280466,69351.0


time: 136 ms


In [104]:
sub = df_binned.query('ct_dst_sport_ltm_bin == 4')[['id', 'label']]
sub['pred'] = 1

sub['label'].sum(),sub['id'].count()
# sub

(22440, 22469)

time: 182 ms


In [98]:
fff = df_binned.merge(sub, on='id', how='left')#.head()
fff['pred'] = fff['pred'].fillna(0)

fff.groupby('pred').agg({'id': 'count', 'label_x': 'sum'})

Unnamed: 0_level_0,id,label_x
pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2259581,59031
1,280466,262252


time: 69.4 ms
