In [172]:
import numpy
import keras
import pandas
import requests
import io
import zipfile
import os
import re

from keras.models import Sequential
from keras.layers import Dense

numpy.random.seed(0xC0FFEE)

# Dataset

## Loading data

Run the below code to download a copy of the dataset (if you don't already have it):

In [173]:
response = requests.get("http://www.schonlau.net/masquerade/masquerade-data.zip")

dataset_file = io.BytesIO(response.content)

zipped_dataset = zipfile.ZipFile(dataset_file)
zipped_dataset.extractall('data/masquerade-data')

In [174]:
# http://www.schonlau.net/intrusion.html
# download Masquerade Data (zip File)

import pandas as pd
directory = './data/masquerade-data'

In [175]:
def sorted_nicely( l ):
    """ Sorts the given iterable in the way that is expected.
 
    Required arguments:
    l -- The iterable to be sorted.
 
    """
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    return sorted(l, key = alphanum_key)

In [176]:
users = range(1,51)
df = pd.DataFrame()

for filename in sorted_nicely(os.listdir(directory)):
    user = pd.read_csv(os.path.join(directory, filename), header=None)
    df = pd.concat([df, user], axis = 1)
    
df.columns = sorted_nicely(os.listdir(directory))

We've loaded in the dataset, but need to do a little co-ercion to get it how we need. Firstly, make sure that all the values in this dataframe are categorical variables which share the same data type:

In [177]:
commands = numpy.unique(df)
command_dtype = pandas.api.types.CategoricalDtype(commands)

for column in df:
    df[column] = df[column].astype(command_dtype)

In [178]:
train, test = df.head(5000), df.tail(len(df) - 5000)

Plan is convert to the following format:

  user, command1?, command2?, ..., 
  
 so the first column is a label, and the second a one-hot encoding of the command.
 
 When we do the rolling window aggregation, we just sum the columns (per-user).

In [179]:
def rolling_window_command_counts(commands, window_size):
    
    # Save a copy the name of the series to add again to our output. This will preserve the mapping of
    # user identifier to (it's column header in the dataframe it came from), which in
    # this case is the user identifier. 
    user = commands.name

    # Convert the single column "which command was run?" to a column for each
    # command, which says "was command <x> run?"
    commands = pandas.get_dummies(commands)

    # Take a rolling sample of the last 100 commands, then sum each "was command <x> run?"
    # columns to give a bunch "command <x> was run <y> times in this window".
    command_counts = commands.rolling(window=window_size).aggregate(numpy.sum)

    # Remove the first 100 rows because they contain data from blocks of size < 100.
    command_counts = command_counts[window_size-1:]
    
    # Preserve the user identifier (see top of function) as a new column:
    
    # First, a nasty hack: https://github.com/pandas-dev/pandas/issues/19136
    command_counts = command_counts.rename(columns=str)  
    
    # Then, add in the user (with an adhoc parser to turn the label into a number)
    command_counts['user'] = int(user.replace('User', ''))

    return command_counts

# Example
rolling_window_command_counts(train['User1'], 100)

Unnamed: 0,%backup%,.java_wr,.maker_w,.wrapper,.xinitrc,.xsessio,1.1,1.2,1.3,4Dwm,...,xxx,yacc,ypcat,yppasswd,z,zip,zsh,zubs,zz2,user
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [180]:
labelled_training_dataset = pandas.concat([
        rolling_window_command_counts(commands, 100)
        for user, commands in train.iteritems()
    ],
    ignore_index=True,  # reset index to go from 0 to 4900
)

labelled_training_dataset

Unnamed: 0,%backup%,.java_wr,.maker_w,.wrapper,.xinitrc,.xsessio,1.1,1.2,1.3,4Dwm,...,xxx,yacc,ypcat,yppasswd,z,zip,zsh,zubs,zz2,user
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


## Sample Raw Dataset

Use [rolling window sampling](https://pcp.io/books/PCP_PG/html/LE42586-PARENT.html).


In [181]:
training_labels = labelled_training_dataset['user'] - 1
training_dataset = labelled_training_dataset.drop(columns=['user'])

In [182]:
training_labels =  keras.utils.to_categorical(training_labels, num_classes=50)

In [183]:
training_dataset.sample(20)

Unnamed: 0,%backup%,.java_wr,.maker_w,.wrapper,.xinitrc,.xsessio,1.1,1.2,1.3,4Dwm,...,xwsh,xxx,yacc,ypcat,yppasswd,z,zip,zsh,zubs,zz2
225704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152022,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14656,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200636,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Building the Oracle

In [184]:
oracle = Sequential()

In [185]:
input_layer = Dense(
    units=856,
    activation='relu',
    input_dim=856,
)

In [186]:
hidden_layer = Dense(
    units=30,
    activation='relu',
)

In [187]:
output_layer = Dense(
    units=50,
    activation='softmax',
)

In [188]:
oracle.add(input_layer)
oracle.add(hidden_layer)
oracle.add(output_layer)

In [189]:
oracle.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Training Oracle on Dataset

In [190]:
oracle.fit(training_dataset,  training_labels, epochs=3, batch_size=50)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0xb41b28fd0>

In [191]:
oracle

<keras.engine.sequential.Sequential at 0xb41b44ba8>

In [192]:
loss, accuracy = oracle.evaluate(training_dataset, training_labels)



In [193]:
loss, accuracy

(0.050821111680833926, 0.9819424607223016)

In [194]:
oracle.metrics_names

['loss', 'acc']

# Evaluation

In [195]:
response = requests.get("http://www.schonlau.net/masquerade/masquerade_summary.txt")
raw_test_labels = io.BytesIO(response.content)
test_labels = pandas.read_csv(raw_test_labels, sep=' ', names=range(50))

In [196]:
test_labels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [197]:
test

Unnamed: 0,User1,User2,User3,User4,User5,User6,User7,User8,User9,User10,...,User41,User42,User43,User44,User45,User46,User47,User48,User49,User50
5000,java,flex,diff,sed,hostname,sh,id,ksh,xterm,m3_binin,...,java,chmod,dirname,sendmail,sendmail,detail_o,cat,netscape,egrep,toolches
5001,.java_wr,uname,diff,FIFO,stty,engine,nawk,ls,launchef,uname,...,java,ls,basename,sendmail,sh,netstat,cat,netscape,sh,cat
5002,expr,nawk,diff,cat,date,sh,getopt,du,sh,uname,...,telnet,ls,egrep,sendmail,netstat,netscape,cat,netscape,drag,mail
5003,expr,cpp,diff,date,echo,engine,true,ksh,launchef,m3_compt,...,mkpts,ls,egrep,sendmail,netscape,netscape,cat,netscape,ex,tcsh
5004,dirname,cc1,diff,generic,[,sh,true,popper,sh,cfe,...,hostname,mc,egrep,sendmail,netscape,test2.pl,cat,ls,echo,hostname
5005,basename,as,diff,generic,find,engine,grep,tar,launchef,ugen,...,stty,lc,egrep,sendmail,netscape,test.pl,cat,rlogin,drag,date
5006,egrep,gcc,diff,date,chmod,sh,date,du,sh,as1,...,.java_wr,mkdir,java,more,sendmail,ls,cat,rlogin,egrep,Mail
5007,egrep,gcc,diff,generic,echo,engine,lp,popper,launchef,driver,...,expr,ln,java,tput,sendmail,tcsh,cat,rlogin,drag,tcsh
5008,egrep,uname,diff,gethost,chmod,sh,find,popper,sh,comp_uni,...,expr,rm,ex,mail,sendmail,cat,cat,ls,egrep,cpp
5009,egrep,nawk,diff,download,sh,engine,tail,wc,sh,sh,...,dirname,rm,ex,mail,mailx,stream_t,cat,launchef,drag,sh


In [198]:
def block_command_counts(commands, window_size):
    
    # Save a copy the name of the series to add again to our output. This will preserve the mapping of
    # user identifier to (it's column header in the dataframe it came from), which in
    # this case is the user identifier. 
    user = commands.name

    # Convert the single column "which command was run?" to a column for each
    # command, which says "was command <x> run?"
    commands = pandas.get_dummies(commands)

    # Take a rolling sample of the last 100 commands, then sum each "was command <x> run?"
    # columns to give a bunch "command <x> was run <y> times in this window".
    command_counts = commands.groupby(commands.index // window_size).aggregate(numpy.sum)
    
    # Preserve the user identifier (see top of function) as a new column:
    
    # First, a nasty hack: https://github.com/pandas-dev/pandas/issues/19136
    command_counts = command_counts.rename(columns=str)  
    
    # Then, add in the user (with an adhoc parser to turn the label into a number)
    command_counts['user'] = int(user.replace('User', ''))

    return command_counts

# Example
block_command_counts(test['User1'], 100)

Unnamed: 0,%backup%,.java_wr,.maker_w,.wrapper,.xinitrc,.xsessio,1.1,1.2,1.3,4Dwm,...,xxx,yacc,ypcat,yppasswd,z,zip,zsh,zubs,zz2,user
50,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
51,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
52,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
53,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
54,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
55,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
56,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
57,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
58,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
59,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [199]:
testing_dataset = pandas.concat([
        block_command_counts(commands, 100)
        for user, commands in test.iteritems()
    ],
    ignore_index=True,  # reset index to go from 0 to 4900
)
testing_dataset

Unnamed: 0,%backup%,.java_wr,.maker_w,.wrapper,.xinitrc,.xsessio,1.1,1.2,1.3,4Dwm,...,xxx,yacc,ypcat,yppasswd,z,zip,zsh,zubs,zz2,user
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [200]:
prediction = oracle.predict(testing_dataset.drop('user', axis='columns'))

In [201]:
prediction[0]

array([9.9997902e-01, 5.5293268e-22, 2.6324307e-10, 4.3152699e-08,
       2.5420541e-14, 1.7924030e-11, 9.1651835e-08, 1.4340152e-17,
       4.6024243e-14, 1.8839184e-12, 2.0609345e-08, 4.3009797e-12,
       1.7440554e-08, 4.4797343e-14, 1.6510370e-08, 4.0581821e-07,
       2.1667522e-06, 2.4104031e-21, 3.4646398e-12, 1.4914013e-10,
       1.0336676e-06, 1.7060600e-09, 2.1609778e-12, 4.5409194e-20,
       4.4826795e-08, 2.5556060e-08, 9.8429178e-16, 1.0226358e-10,
       1.5672578e-15, 1.3948575e-15, 6.9221531e-11, 3.1501573e-13,
       1.6484153e-05, 1.1844208e-12, 3.7238676e-10, 5.7387799e-17,
       1.3516950e-07, 6.4255141e-16, 6.2834293e-14, 2.9350970e-11,
       6.1988704e-12, 1.8675892e-09, 2.6684936e-11, 1.0011266e-10,
       6.4464321e-16, 1.0426740e-14, 7.3056973e-20, 6.3417527e-10,
       6.2541693e-07, 1.5752483e-14], dtype=float32)

In [202]:
def predicted_user_probs(user_probs):
    for user_id, probability in enumerate(user_probs):
        if probability >= 0.5:
            return user_id
    return -1  # signals that no definite prediction was given

predicted_users = numpy.apply_along_axis(predicted_user_probs, 1, prediction)
predicted_users = pandas.Series(predicted_users)

figure out most likely user.
if above threshold: select
if not: reject

In [227]:
def predicted_user_probs(user_probs):
    most_likely_user, probability = max(enumerate(user_probs), key=lambda tup: tup[1])
    if probability >= 0.1:
        return most_likely_user
    return -1  # signals that no definite prediction was given

predicted_users = numpy.apply_along_axis(predicted_user_probs, 1, prediction)
predicted_users = pandas.Series(predicted_users)

In [230]:
user_account = testing_dataset['user']
user_account = user_account - 1
user_account = pandas.Series(user_account)

In [231]:
test_labels_arrays = []
for i in range(50):
    test_labels_arrays.append(test_labels[i])

test_labels_long = pandas.concat(test_labels_arrays, axis='rows', ignore_index=True)

In [232]:
df = pandas.concat(
    [user_account, predicted_users, test_labels_long], 
    axis='columns', 
)
df.columns = ['user_account', 'predicted_user', 'intruder?']

In [233]:
df

Unnamed: 0,user_account,predicted_user,intruder?
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
5,0,0,0
6,0,0,0
7,0,0,0
8,0,0,0
9,0,0,0


In [234]:
def calculate_intrusion_prediction(row):
    user_account, predicted_user, is_intruder = row
    
    if predicted_user == user_account:
        return 0
    else:
        return 1
        
df['our_pred'] = df.apply(calculate_intrusion_prediction, axis=1, raw=True)

In [235]:
df

Unnamed: 0,user_account,predicted_user,intruder?,our_pred
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0
8,0,0,0,0
9,0,0,0,0


In [236]:
def stats(row):
    user_account, predicted_user, is_intruder, our_pred = row
    
    if is_intruder == our_pred == 0:
        return 'tp'
    elif is_intruder == our_pred == 1:
        return 'tn'
    elif is_intruder == 0 and our_pred == 1:
        return 'fn'
    else:
        return 'fp'

df['classification'] = df.apply(stats, axis=1, raw=True)
df

Unnamed: 0,user_account,predicted_user,intruder?,our_pred,classification
0,0,0,0,0,tp
1,0,0,0,0,tp
2,0,0,0,0,tp
3,0,0,0,0,tp
4,0,0,0,0,tp
5,0,0,0,0,tp
6,0,0,0,0,tp
7,0,0,0,0,tp
8,0,0,0,0,tp
9,0,0,0,0,tp


In [237]:
stat = df.classification.value_counts()
stat

tp    3311
fn    1458
tn     227
fp       4
Name: classification, dtype: int64

write in the project about fn better fp

In [238]:
accuracy = (stat.tp + stat.tn)/(stat.tp + stat.tn + stat.fn + stat.fp)
accuracy

0.7076

# Surrogate