Index:
1. Load all data from database and save a dump

    1.1 Load from the database
    
    1.2 Dump the data
    
    1.3 Simple function to either load the dump or fetch from db


2. Build mouse models

    2.1 Add new models
    
    2.2 Combine new models
    
        2.2.1 Add new features:
            
            
    
    2.3 Report

### 1. Load the dataset

In [1]:
#Collect the data logs from the database
from pymongo import MongoClient
import pickle

In [2]:
def get_valid_participants(method='db', dump=False):
    """
    Return the session documents of every participant with enough sessions.

    Keyword Arguments:
    method (str): 'dump' loads the list previously pickled to the local file
        'valid_participants'; any other value (default 'db') queries MongoDB.
    dump (bool): when True and the data came from the database, pickle the
        result to 'valid_participants' before returning it.

    Returns:
    A list with one entry per participant; each entry is the list of session
    documents of one collection. Only collections in which every session has
    a 'mouseMeta' key and which hold at least 25 sessions are kept.

    Raises:
    FileNotFoundError: method is 'dump' but the dump file does not exist.
    ValueError: method is 'dump' but the dump file cannot be unpickled.
    """
    if method == 'dump':
        # NOTE: pickle must only be used on files this notebook wrote itself.
        with open('valid_participants', 'rb') as f:
            try:
                # Load exactly once: a second pickle.load() on the same handle
                # hits end-of-file and raises EOFError.
                return pickle.load(f)
            except (pickle.UnpicklingError, AttributeError, EOFError,
                    ImportError, IndexError) as err:
                raise ValueError(
                    "It seems you did't dump the dataset in the first place"
                ) from err

    # NOTE(review): the credentials are hard-coded in this URI; consider moving
    # them out of the source.
    connection = MongoClient('mongodb://dave123:awesomeGuy@localhost:27017')
    db = connection.passauth
    collection_names = db.collection_names()

    participants = []
    for user in collection_names:
        # Fetch each collection once and reuse it for both the check and the result.
        user_sessions = list(db[user].find())
        if all('mouseMeta' in session for session in user_sessions):
            participants.append(user_sessions)

    valid_participants = [participant for participant in participants if len(participant) >= 25]

    if dump:
        with open('valid_participants', 'wb') as f:
            pickle.dump(valid_participants, f)

    return valid_participants

In [1]:
from pymongo import MongoClient
# Client for the MongoDB server at localhost:27017; the next cell reads the
# 'passauth-attack' database through it.
connection = MongoClient('localhost:27017')

In [10]:
# One list of documents per collection of the 'passauth-attack' database.
attackers = []
for each in connection['passauth-attack'].collection_names():
    attackers.append(list(connection['passauth-attack'][each].find()))
        

  


In [19]:
# Pickle the collected attacker documents to the local file 'attackers'.
with open('attackers', 'wb') as f:
    pickle.dump(attackers, f)

In [None]:
# Reload the attacker documents from the local 'attackers' pickle file.
# pickle.load() needs an open binary file object; calling it without one
# raises TypeError.
# NOTE(review): assumes the intent was to read back the 'attackers' dump — confirm.
with open('attackers', 'rb') as f:
    attackers = pickle.load(f)

In [3]:
# Query the database (default method) and also pickle the result to 'valid_participants'.
valid_participants = get_valid_participants(dump=True)

  if sys.path[0] == '':


### 2. Mouse Analysis

In [4]:
import logging
import numpy as np
import os
import random
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
class User:
    """A participant id together with the mouse and key logs of their sessions."""

    def __init__(self, userid, mouseLogs, keyLogs):
        # Both log collections are converted to numpy arrays on construction.
        self.userid = userid
        self.mouseLogs, self.keyLogs = np.array(mouseLogs), np.array(keyLogs)
        
def pad_trim(data, size=100, padding_method='zeros'):
    """
    Trim or pad a sequence of rows so that it has exactly `size` rows.

    data: sequence of rows (e.g. a 2-D array); every row has the same width
    size: the number of rows of the returned array
    padding_method: how missing rows are filled when data is shorter than size
        'zeros': appends rows of zeros (0)
        'last': appends copies of the last row
        'interpolated': documented but not implemented

    Returns the first `size` rows when data is longer, otherwise data stacked
    with the padding rows. Empty data is returned unchanged because the row
    width is unknown.

    Raises NotImplementedError for 'interpolated' and ValueError for any other
    unknown padding_method when padding would be required.
    """
    row_count = len(data)

    if row_count == 0:
        return data

    if row_count > size:
        return data[:size]

    missing = size - row_count
    if padding_method == 'zeros':
        padding = np.tile(np.array([0]*len(data[0])), reps=(missing, 1))
    elif padding_method == 'last':
        padding = np.tile(np.asarray(data[-1]), reps=(missing, 1))
    elif padding_method == 'interpolated':
        raise NotImplementedError("padding_method 'interpolated' is not implemented")
    else:
        # Returning the short input here would silently break callers that
        # rely on a fixed number of rows.
        raise ValueError("Unknown padding_method: %r" % (padding_method,))

    return np.vstack([data, padding])

def pick_n(data, n):
    """
    Keep every n-th row of data, starting with the first one.
    """
    every_nth = slice(None, None, n)
    return data[every_nth]

# Module-level shorthand for numpy.flip, used by the mouse data transformations.
flip = np.flip


In [6]:
# Build one User per participant; ids are 1-based positions in valid_participants.
users = []
for _, user in enumerate(valid_participants):
    # First entry of each session's 'mouseLogs' as an int32 array.
    mouseLogs = [np.array(session['mouseLogs'][0], dtype='int32') for session in user]
    # Key durations of the password logs followed by the username logs.
    keyLogs = [
        np.array([entry['duration'] for entry in session['passwordLogs']+session['usernameLogs']],
                 dtype='int32')
        for session in user
    ]
    # Skip participants that have at least one empty mouse log.
    if not all(len(mouseLog) for mouseLog in mouseLogs):
        continue
    users.append(User(_+1, mouseLogs, keyLogs))


In [7]:
def perf_measure(actual, prediction):
    """
    Count the confusion-matrix cells for binary labels (1 = positive, 0 = negative).

    Returns the tuple (tp, fp, tn, fn). Asserts that both sequences have the
    same length and that every pair was counted, i.e. labels are 0 or 1.
    """
    assert len(actual) == len(prediction), "Array lengths do not match"

    counts = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    for truth, guess in zip(actual, prediction):
        if truth == 1:
            # Any prediction other than 1 on a positive counts as a miss.
            counts['tp' if guess == 1 else 'fn'] += 1
        elif truth == 0:
            if guess == 0:
                counts['tn'] += 1
            elif guess == 1:
                counts['fp'] += 1

    assert sum(counts.values()) == len(actual), "Class labels should be binary (1 or 0)"
    return (counts['tp'], counts['fp'], counts['tn'], counts['fn'])

def interpret_perf(tp, fp, tn, fn):
    """
    Turn confusion-matrix counts into [FAR, FRR, accuracy].

    FAR is fp/(fp+tn), FRR is fn/(tp+fn) and accuracy is (tp+tn)/total; a
    ratio whose denominator is zero is reported as 0.
    """
    negatives = fp + tn
    positives = tp + fn
    total = tp + tn + fp + fn

    far = fp / negatives if negatives != 0 else 0
    frr = fn / positives if positives != 0 else 0
    accuracy = (tp + tn) / total if total != 0 else 0

    return [far, frr, accuracy]
        
def classifier_summary(actual, prediction):
    """Return interpret_perf() applied to the counts from perf_measure()."""
    counts = perf_measure(actual, prediction)
    return interpret_perf(*counts)

In [8]:
# The log level is read from the LOGLEVEL environment variable and defaults to INFO.
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
# Named loggers: 'DATA' reports the selected classes, 'LEARNING' the algorithm being run.
data_logger = logging.getLogger('DATA')
learning_logger = logging.getLogger('LEARNING')

In [9]:
class MouseDataModel:
    """
    Mouse-log dataset built around one target user.

    The constructor separates the mouse logs of the target user from those of
    all other users; train_test_data() then builds the train/test split.
    """

    def __init__(self, userid, users):
        """
        userid: id of the target user.
        users: iterable of objects exposing `userid` and `mouseLogs`.

        Raises ValueError when no mouse log belongs to `userid`.
        """
        self.userid = userid
        self.X_train = None
        self.Y_train = None
        self.X_test = None
        self.Y_test = None

        dataX = []
        dataY = []
        dataXother = []
        dataYother = []

        for user in users:
            if user.userid == self.userid:
                for mouseLog in user.mouseLogs:
                    dataX.append(mouseLog)
                    dataY.append(user.userid)
            else:
                for mouseLog in user.mouseLogs:
                    dataXother.append(mouseLog)
                    dataYother.append(user.userid)

        lengths = [len(x) for x in dataX]
        if not lengths:
            # Without this check the mean of an empty list is NaN and int()
            # fails with an unrelated-looking ValueError.
            raise ValueError("No mouse logs found for user %r" % (userid,))

        # Despite the name this is mean + one standard deviation of the log lengths.
        self.meanSessionLength = int(np.mean(lengths) + np.std(lengths))

        self.dataX = np.array(dataX)
        self.dataY = np.array(dataY)
        self.dataXother = np.array(dataXother)
        self.dataYother = np.array(dataYother)

    def train_test_data(self,
                        test_size=(0.20, 0.50),
                        classes=0,
                        binary_classes=False,
                        instances=5,
                        total_classes=None,
                        random=3):
        """
        Return and set training and test dataset for self

        Keyword Arguments:
        test_size (float, float): test fraction for the target user's logs and
            for the other users' logs respectively.
        classes (int): Number of other users to mix in; 0 uses only the target user.
        binary_classes (bool): label every other-user log -1 instead of its user id.
        instances (int): Number of logs to take from each selected other user (at most 25).
        total_classes: not used by this method.
        random (int): random_state passed to both train_test_split calls. The
            choice of other users and the final test shuffle use numpy's
            global random state instead.

        Returns (X_train, X_test, Y_train, Y_test). For a negative `classes`
        the attributes are returned as they currently are.
        """
        X_train, X_test, Y_train, Y_test = \
            train_test_split(self.dataX, self.dataY, test_size=test_size[0], random_state=random)

        if classes == 0:
            self.X_train, self.X_test, self.Y_train, self.Y_test = \
                X_train, X_test, Y_train, Y_test

        elif classes > 0:
            assert instances <= 25, "Too many instances requested"
            other_classes = np.unique(self.dataYother[self.dataYother != self.userid])
            assert len(other_classes) >= classes, "Too many classes"

            # Pick `classes` other users at random.
            np.random.shuffle(other_classes)
            other_classes = other_classes[:classes]
            data_logger.info("Selected classes %s", other_classes)

            dataX = []
            dataY = []
            for each in other_classes:
                for log in self.dataXother[self.dataYother == each][:instances]:
                    dataX.append(log)
                    dataY.append(-1 if binary_classes else each)

            X_train_other, X_test_other, Y_train_other, Y_test_other = \
                train_test_split(np.array(dataX), np.array(dataY), test_size=test_size[1], random_state=random)

            self.X_train = np.concatenate((X_train, X_train_other), axis=0)
            self.X_test = np.concatenate((X_test, X_test_other), axis=0)
            self.Y_train = np.concatenate((Y_train, Y_train_other), axis=0)
            self.Y_test = np.concatenate((Y_test, Y_test_other), axis=0)

            # Shuffle the test rows (the training rows keep their order).
            assert len(self.X_test) == len(self.Y_test)
            p = np.random.permutation(len(self.X_test))
            self.X_test = self.X_test[p]
            self.Y_test = self.Y_test[p]

        return self.X_train, self.X_test, self.Y_train, self.Y_test



In [10]:
from copy import deepcopy

# Model for single user:
class MouseLearningModel:
    """
    Ensemble of classifiers trained on transformed mouse logs of one user.

    Workflow: optimal_models() scores every (algorithm, params, transformation)
    combination, approximate_cutoffs() selects the models to keep, train() fits
    them on the full training set and test() sums their predictions and
    compares the sum with a vote cutoff.
    """

    def __init__(self, X_train, X_test, Y_train, Y_test, meanMouseSessionLength=100):
        """
        X_train / X_test: collections of mouse logs; each log is indexed as a
            2-D array whose columns 0 and 1 are used by every transformation
            and whose columns from 2 on are used by the last one.
        Y_train / Y_test: labels; the cutoff logic expects 1 for the target
            user and 0 otherwise.
        meanMouseSessionLength: number of rows used by the full-length transformation.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.Y_train = Y_train
        self.Y_test = Y_test

        self.ensemble = None

        # Each transformation maps one mouse log to a flat feature vector.
        self.data_transformations = [
            # column 0 / column 1 of the last 50 rows (log reversed first)
            lambda x: pad_trim(flip(x, axis=0), size=50)[:, 0:1].flatten(),
            lambda x: pad_trim(flip(x, axis=0), size=50)[:, 1:2].flatten(),
            # column 0 / column 1 of the first 50 rows
            lambda x: pad_trim(x, size=50)[:, 0:1].flatten(),
            lambda x: pad_trim(x, size=50)[:, 1:2].flatten(),
            # columns 0-1 of the first 5 rows, zero-padded to 50 rows
            lambda x: pad_trim(x[:5], size=50)[:, 0:2].flatten(),
            # columns 0-1 over meanMouseSessionLength rows
            lambda x: pad_trim(x, size=meanMouseSessionLength)[:, 0:2].flatten(),
            # absolute offset from the first row
            lambda x: pad_trim(np.abs(x-x[0]))[:, 0:2].flatten(),
            # absolute offset weighted by the log of the columns from index 2 on
            # NOTE(review): np.log yields -inf/nan for values <= 0 — confirm the inputs are positive.
            lambda x: pad_trim(np.abs(np.multiply(x-x[0], np.log(x[:,2:]))))[:, 0:2].flatten(),
            #PCA
        ]

        # Candidate algorithms; entries without "params" are built with defaults.
        self.learning_models = [
            {"algorithm": SVC,
             "params": [
                 {'kernel':'linear', 'gamma':'auto', 'degree':3, 'class_weight':'balanced', 'cache_size':10}
             ]},
            {"algorithm": AdaBoostClassifier,
             "params": [
                 {'n_estimators':100},
             ]},
            {"algorithm": GaussianNB},
            {"algorithm": RandomForestClassifier,
             "params": [
                 {'n_estimators': 20},
             ]},
            {"algorithm": BernoulliNB},
            {"algorithm": MultinomialNB},
            {"algorithm": SGDClassifier},
            {"algorithm": MLPClassifier,
             "params": [
                 {'max_iter': 1000},
             ]},
        ]

        self.ideal_cutoff = None
        self.ideal_models = None

    def approximate_cutoffs(self, models, tolerance=0.1, iterations=7):
        """
        Select the ensemble members and collect well-performing vote cutoffs.

        models: entries as returned by optimal_models(); the last element of
            each entry is its [FAR, FRR, accuracy] summary.
        tolerance: a cutoff is kept when its accuracy is within this distance
            of the best accuracy of the same run.
        iterations: number of cutoff_score() runs.

        Sets self.ideal_models (up to 7 entries with the lowest FAR among those
        with FRR <= 0.9, plus up to 5 with the lowest FRR among those with
        FAR <= 0.5) and returns the list of kept (cutoff, summary) pairs.
        """
        best_far = sorted([m for m in models if not m[-1][1] > 0.9], key=lambda m: m[-1][0])[0:7]
        best_frr = sorted([m for m in models if not m[-1][0] > 0.5], key=lambda m: m[-1][1])[0:5]
        self.ideal_models = best_far + best_frr

        ideal_cutoffs = []
        for i in range(iterations):
            res = sorted(self.cutoff_score(self.ideal_models), key=lambda x: x[1][-1])
            ideal_cutoffs = ideal_cutoffs + [
                x for x in res if abs(x[-1][-1] - res[-1][-1][-1]) < tolerance
            ]

        # NOTE(review): the cutoff is fixed to 8 regardless of the cutoffs
        # collected above; kept as in the original.
        self.ideal_cutoff=8
        return ideal_cutoffs

    def cutoff_score(self, models):
        """
        Evaluate every vote cutoff 1..len(models)-1 on a random validation split.

        Fits each model on one part of the training data, sums the predictions
        on the held-out part and returns a list of (cutoff, summary) pairs
        where a sample is accepted when its vote sum exceeds the cutoff.
        """
        predictions = []

        X_train_, X_validation_, Y_train_, Y_validation_ = \
                            train_test_split(self.X_train, self.Y_train)

        for entry in models:
            transformation = self.data_transformations[entry[2]]
            X_train_transformed = np.array([transformation(log) for log in X_train_])
            X_validation_transformed = np.array([transformation(log) for log in X_validation_])
            fitted = entry[0].fit(X_train_transformed, Y_train_)
            predictions.append(fitted.predict(X_validation_transformed))

        votes = np.sum(np.array(predictions), axis=0)

        cutoff_performances = []
        for i in range(1, len(models)):
            cutoff_performances.append((i, classifier_summary(Y_validation_, np.where(votes>i, 1, 0))))
        return cutoff_performances

    def optimal_models(self):
        """
        Score every (algorithm, params, transformation) combination.

        Each combination is fitted on 5 random train/validation splits of the
        training data. Returns a list of tuples
        (model, params_index, transformation_index, mean [FAR, FRR, accuracy]).
        """
        # The transformed training set depends only on the transformation, so
        # it is computed once instead of once per algorithm and repetition.
        transformed_train = [
            np.array([transformation(log) for log in self.X_train])
            for transformation in self.data_transformations
        ]

        model_performances = []
        for learning_model in self.learning_models:
            learning_logger.info("Running " + learning_model['algorithm'].__name__)

            for params_index, params in enumerate(learning_model.get('params', [{}])):
                for transformation_index, X_transformed in enumerate(transformed_train):
                    validation_performance = []
                    model = learning_model['algorithm'](**params)
                    for iteration in range(5):
                        X_fit, X_validation, Y_fit, Y_validation = \
                            train_test_split(X_transformed, self.Y_train)

                        model.fit(X_fit, Y_fit)
                        validation_performance.append(
                            classifier_summary(Y_validation, model.predict(X_validation)))

                    model_performances.append((model,
                                               params_index,
                                               transformation_index,
                                               np.array(validation_performance).mean(axis=0)))

        return model_performances

    def cross_validate(self, model, X_train, Y_train):
        """
        Return (labels, predictions) of `model` on a random permutation of the
        first half of the given data.
        """
        p = np.random.permutation(len(Y_train)//2)
        return Y_train[p], model.predict(X_train[p])

    def train(self):
        """
        Fit every selected model on the full training set.

        Stores deep copies of the fitted models, paired with their
        transformation index, in self.ensemble and returns True. Asserts that
        self.ideal_models has been set.
        """
        assert not self.ideal_models is None, "Ideal cutoff not set, check if approximate_cutoff has been run"
        ensemble = []
        if not self.X_train is None and not self.Y_train is None:
            for model in self.ideal_models:
                X_train_transformed = np.array([self.data_transformations[model[2]](_) for _ in self.X_train])
                _model = model[0].fit(X_train_transformed, self.Y_train)
                # Copy: the same estimator object can appear more than once in ideal_models.
                ensemble.append((deepcopy(_model), model[2]))

        self.ensemble = ensemble
        return True

    def test(self, ideal_cutoff=None):
        """
        Evaluate the ensemble on the test set.

        ideal_cutoff: vote threshold; defaults to self.ideal_cutoff.

        Returns (vote sums per test sample, [FAR, FRR, accuracy]); a sample is
        accepted when its vote sum exceeds the cutoff.
        """
        assert not self.ensemble is None, "Ensemble not found"
        if ideal_cutoff is None:
            ideal_cutoff = self.ideal_cutoff
        predictions = []
        for model in self.ensemble:
            X_test_transformed = np.array([self.data_transformations[model[1]](_) for _ in self.X_test])
            predictions.append(model[0].predict(X_test_transformed))
        predictions = np.sum(np.array(predictions), axis=0)

        return predictions, classifier_summary(self.Y_test, np.where(predictions > ideal_cutoff, 1, 0))


In [21]:
class KeystrokeLearningModel:
    """
    Ensemble of classifiers trained on histograms of key durations.

    optimal_models() selects up to 10 candidate models, train() fits them on
    the full training set and test() sums their predictions and compares the
    sum with a vote cutoff.
    """

    def __init__(self, X_train, X_test, Y_train, Y_test):
        """
        X_train / X_test: collections of key logs (one sequence of numbers per session).
        Y_train / Y_test: labels; the cutoff logic expects 1 for the target
            user and 0 otherwise.
        """
        self.X_train = X_train
        self.X_test = X_test
        self.Y_train = Y_train
        self.Y_test = Y_test

        self.ideal_models = None
        self.ensemble = None

        # Per-log [min, max] and their means over the training logs.
        userKeyLogs = [[np.min(_), np.max(_)] for _ in self.X_train]
        userKeyLogsMinMax = tuple(np.mean(userKeyLogs, axis=0))

        # The 50 bin edges depend only on the training data, so they are
        # computed once instead of once per transformed log.
        bin_edges = np.histogram(userKeyLogs, bins=50, range=userKeyLogsMinMax)[1]

        def histogram_features(x):
            """Return one 50-bin histogram of key durations per log in x."""
            return np.array([np.histogram(keyLog, bins=bin_edges)[0] for keyLog in x])

        # NOTE(review): both entries are the same transformation, as in the
        # original; two entries are kept so the candidate list is unchanged.
        self.data_transformations = [
            histogram_features,
            histogram_features,
        ]

        # Candidate algorithms; entries without "params" are built with defaults.
        self.learning_models = [
            {"algorithm": SVC,
             "params": [
                 {'kernel':'linear', 'gamma':'auto', 'degree':3, 'class_weight':'balanced', 'cache_size':10}
             ]},
            {"algorithm": AdaBoostClassifier,
             "params": [
                 {'n_estimators':100},
             ]},
            {"algorithm": GaussianNB},
            {"algorithm": RandomForestClassifier,
             "params": [
                 {'n_estimators': 20},
             ]},
            {"algorithm": BernoulliNB},
            {"algorithm": MultinomialNB},
            {"algorithm": SGDClassifier},
            {"algorithm": MLPClassifier,
             "params": [
                 {'max_iter': 1000},
             ]},
        ]

    def optimal_models(self, iterations=5):
        """
        Score every (algorithm, params, transformation) combination.

        The training data is split once (20% held out); every combination is
        fitted `iterations` times on that split. Candidates with FRR <= 0.7 are
        sorted by FAR, then by descending accuracy, and the first 10 are stored
        in self.ideal_models and returned.
        """
        X_train, X_test, Y_train, Y_test = train_test_split(self.X_train, self.Y_train, test_size=0.2)
        res = []
        for i in range(iterations):
            for learning_model in self.learning_models:
                for params_index, params in enumerate(learning_model.get('params', [{}])):
                    for transformation_index, transformation in enumerate(self.data_transformations):
                        model = learning_model['algorithm'](**params)
                        model.fit(transformation(X_train), Y_train)
                        summary = classifier_summary(Y_test, model.predict(transformation(X_test)))
                        res.append((model, params_index, transformation_index, summary))

        ideal_models = sorted(filter(lambda x: x[-1][1] <=0.7, res), key = lambda x: (x[-1][0], -1*x[-1][-1]))
        self.ideal_models = ideal_models[:10]
        return self.ideal_models

    def train(self, iterations=5):
        """
        Fit the selected models on the full training set.

        Runs optimal_models(iterations=iterations) first when no models have
        been selected yet. Stores deep copies of the fitted models, paired with
        their transformation index, in self.ensemble and returns True.
        """
        if self.ideal_models is None:
            # optimal_models() reads the data from self; it only accepts `iterations`.
            self.optimal_models(iterations=iterations)

        ensemble = []
        for model in self.ideal_models:
            fitted = model[0].fit(self.data_transformations[model[2]](self.X_train), self.Y_train)
            ensemble.append((deepcopy(fitted), model[2]))

        self.ensemble = ensemble
        return True

    def test(self, ideal_cutoff=7, display_prediction=True):
        """
        Evaluate the ensemble on the test set.

        ideal_cutoff: a sample is accepted when its vote sum exceeds this value.
        display_prediction: print the vote sums and the thresholded predictions.

        Returns (vote sums per test sample, [FAR, FRR, accuracy]).
        """
        assert not self.ensemble is None, "Ensemble not calculated"
        predictions = []
        for model in self.ensemble:
            predictions.append(model[0].predict(self.data_transformations[model[1]](self.X_test)))

        predictions = np.sum(np.array(predictions), axis=0)
        if display_prediction:
            print(predictions)
            print(np.where(predictions > ideal_cutoff, 1, 0))
        return predictions, classifier_summary(self.Y_test, np.where(predictions > ideal_cutoff, 1, 0))


In [19]:
class Authenticator:
    """
    Combines the mouse and the keystroke model of a single user.

    Builds the user's train/test data once and hands the mouse and keystroke
    views of it to the two learning models.
    """

    def __init__(self, userid, sessions, classes=5, test_size=(0.20, 0.50), instances=5):
        self.user = UserModel(userid, sessions)
        self.user.train_test_data(classes=classes, test_size=test_size, instances=instances)

        # Mouse model: the four data splits plus the mean mouse log length.
        mouse_split = self.user.mouse_train_test_data()
        self.mouse_params = [*mouse_split, self.user.meanMouseLength]
        self.mouseLearningModel = MouseLearningModel(*self.mouse_params)

        # Keystroke model: the four data splits only.
        keystroke_split = self.user.keystroke_train_test_data()
        self.keystroke_params = [*keystroke_split]
        self.keystrokeLearningModel = KeystrokeLearningModel(*self.keystroke_params)

        # Filled by test_mouseModel() / test_keystrokeModel().
        self.mouseResult = None
        self.keystrokeResult = None

    def train_mouseModel(self):
        """Select the mouse models, approximate the cutoffs and train the ensemble."""
        candidates = self.mouseLearningModel.optimal_models()
        self.mouseLearningModel.approximate_cutoffs(candidates)
        return self.mouseLearningModel.train()

    def test_mouseModel(self, ideal_cutoff=10):
        """Test the mouse ensemble with the given vote cutoff and remember the result."""
        self.mouseLearningModel.ideal_cutoff = ideal_cutoff
        self.mouseResult = self.mouseLearningModel.test()
        return self.mouseResult

    def train_keystrokeModel(self):
        """Select the keystroke models and train the ensemble."""
        self.keystrokeLearningModel.optimal_models()
        return self.keystrokeLearningModel.train()

    def test_keystrokeModel(self, ideal_cutoff=9, display_predictions=False):
        """Test the keystroke ensemble with the given vote cutoff and remember the result."""
        self.keystrokeResult = self.keystrokeLearningModel.test(
            ideal_cutoff=ideal_cutoff,
            display_prediction=display_predictions,
        )
        return self.keystrokeResult

    def combined_score(self):
        """Return the element-wise sum of the keystroke and mouse vote sums."""
        keystroke_votes = self.keystrokeResult[0]
        mouse_votes = self.mouseResult[0]
        return keystroke_votes+mouse_votes

    def combined_summary(self, cutoff=20):
        """
        Return (combined vote sums, [FAR, FRR, accuracy]); a sample is accepted
        when its combined vote sum is at least `cutoff`.
        """
        votes = self.combined_score()
        accepted = np.where(votes >= cutoff, 1, 0)
        return votes, classifier_summary(self.mouseLearningModel.Y_test, accepted)



In [30]:
# End-to-end run for user 13 against 2 other users.
authenticator = Authenticator(13, sessions, classes=2)

# Model selection and training for both modalities.
authenticator.train_mouseModel()
authenticator.train_keystrokeModel()

# Both tests must run before combined_summary(): it reads their stored results.
authenticator.test_keystrokeModel()
authenticator.test_mouseModel()

authenticator.combined_summary()

INFO:DATA:Selected classes [51  4]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier


(array([22,  6,  1,  6, 22,  0, 22, 22, 22,  1]), [0.0, 0.0, 1.0])

## Impact of changing number of classes in the study

In [51]:
# Draw ten target user ids at random from 1..50 (random.choices samples with
# replacement, so an id can appear more than once).
import random
random_classes = random.choices(range(1,51), k=10)
# Results of the experiment below, keyed by the number of other classes.
changing_number_of_classes = {}

In [52]:
# Running lists over ALL settings (kept for the cells that read them afterwards).
keyboard_performance = []
mouse_performance = []
combined_performance = []
classes = [2, 10, 20, 30, 40, 50]
for _classes in classes:
    print(_classes)
    # Results of this setting only. Storing the running lists instead would
    # mix every earlier setting's rows into this setting's entry.
    keyboard_run = []
    mouse_run = []
    combined_run = []
    for each in random_classes:
        authenticator = Authenticator(each, sessions, classes=_classes, instances=10)

        authenticator.train_mouseModel()
        mouse_summary = authenticator.test_mouseModel()[-1]
        mouse_run.append(mouse_summary)
        mouse_performance.append(mouse_summary)

        authenticator.train_keystrokeModel()
        keyboard_summary = authenticator.test_keystrokeModel()[-1]
        keyboard_run.append(keyboard_summary)
        keyboard_performance.append(keyboard_summary)

        combined_summary = authenticator.combined_summary()[-1]
        combined_run.append(combined_summary)
        combined_performance.append(combined_summary)

    # One row of [FAR, FRR, accuracy] per target user, for this setting only.
    changing_number_of_classes[_classes] = (np.array(keyboard_run),
                                           np.array(mouse_run),
                                           np.array(combined_run))

INFO:DATA:Selected classes [35 36]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier


2


INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier
INFO:DATA:Selected classes [11 40]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier
INFO:DATA:Selected classes [22 39]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier
INFO:DATA:Selected classes [43 37]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
I

10


INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier
INFO:DATA:Selected classes [50 48  9  7 12 15 13 49 16 20]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier
INFO:DATA:Selected classes [39 14 18 11  7 38 45 36 55 15]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier
INFO:DATA:Selected classes [ 7 44 28  3 15 34  

20


INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier
INFO:DATA:Selected classes [36 15 14 17 13 47 21 42 45 35 51 48 39 54 38  1 25  2 33  3]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running MLPClassifier
INFO:DATA:Selected classes [10  9 36 22 47 38 17 33 11 41 44  5 37 43 50 21 24 55 49  6]
INFO:LEARNING:Running SVC
INFO:LEARNING:Running AdaBoostClassifier
INFO:LEARNING:Running GaussianNB
INFO:LEARNING:Running RandomForestClassifier
INFO:LEARNING:Running BernoulliNB
INFO:LEARNING:Running MultinomialNB
INFO:LEARNING:Running SGDClassifier
INFO:LEARNING:Running M

KeyboardInterrupt: 

In [53]:
# Display the (keyboard, mouse, combined) result arrays stored per number of classes.
changing_number_of_classes

{2: (array([[0.3       , 0.        , 0.8       ],
         [0.        , 0.        , 1.        ],
         [0.2       , 0.2       , 0.8       ],
         [0.        , 0.        , 1.        ],
         [0.2       , 0.4       , 0.73333333],
         [0.        , 0.2       , 0.93333333],
         [0.1       , 0.2       , 0.86666667],
         [0.        , 0.        , 1.        ],
         [0.4       , 0.        , 0.73333333],
         [0.2       , 0.4       , 0.73333333]]),
  array([[0.        , 0.        , 1.        ],
         [0.        , 0.        , 1.        ],
         [0.        , 0.6       , 0.8       ],
         [0.        , 0.4       , 0.86666667],
         [0.        , 0.        , 1.        ],
         [0.        , 0.        , 1.        ],
         [0.5       , 0.        , 0.66666667],
         [0.        , 0.        , 1.        ],
         [0.        , 0.        , 1.        ],
         [0.        , 1.        , 0.66666667]]),
  array([[0.        , 0.        , 1.        ],
      

In [47]:
# Column-wise mean of the collected combined summaries, i.e. mean [FAR, FRR, accuracy].
np.mean(np.array(combined_performance), axis=0)

array([0.02, 0.32, 0.88])

## Impact of changing number of hidden classes


In [None]:
# NOTE(review): new_users is not defined anywhere in the visible notebook — confirm where it comes from.
len(new_users)

## ROC curves for 
### 1. Mouse
### 2. Keystroke
### 3. Combined

In [None]:
# Collect, for users 1..4, the [FAR, FRR] pairs over a range of vote cutoffs
# for the keystroke, mouse and combined models.
keystroke, mouse, combined = [], [], []
for i in range(1,5):
    authenticator = Authenticator(i, sessions, classes=30, test_size=(0.20, 0.5), instances=10)

    _= authenticator.train_mouseModel()
    _= authenticator.train_keystrokeModel()

    # Mouse model: cutoffs 1..12. The test call returns the stored result;
    # [0:-1] drops the accuracy column.
    res = []
    for cutoff in range(1,13):
        res.append(authenticator.test_mouseModel(ideal_cutoff=cutoff)[1][0:-1])
    res = np.array(res)
    mouse.append(res)

    # Keystroke model: cutoffs 1..12.
    res = []
    for cutoff in range(1,13):
        res.append(authenticator.test_keystrokeModel(ideal_cutoff=cutoff)[1][0:-1])
    res = np.array(res)
    keystroke.append(res)

    # Combined score: cutoffs 15..20.
    res = []
    for cutoff in range(15,21):
        res.append(authenticator.combined_summary(cutoff=cutoff)[1][0:-1])
    res = np.array(res)
    combined.append(res)

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt 
# ROC-style plot for the mouse model: one curve per user.
plt.xlabel('False Acceptance Rate')
plt.ylabel('True Positive Rate')
plt.xlim((0,1))
plt.ylim((0,1))
for each in np.array(mouse):
    # Column 0 is the FAR and column 1 the FRR, so 1 - FRR is the true positive rate.
    plt.plot(each[:,0], 1-each[:,1], linewidth=2)
    plt.pause(1)
plt.show()

In [None]:
# Display the per-user [FAR, FRR] arrays of the combined model.
combined

In [14]:
class UserModel:
    """
    Session dataset built around one target user.

    The constructor separates the sessions of the target user from those of
    all other users. train_test_data() builds the train/test split over whole
    sessions; mouse_train_test_data() and keystroke_train_test_data() then
    expose the mouse logs or the keystroke logs of that split.
    """

    def __init__(self, userid, sessions):
        """
        Initiate the UserModel object with sessions divided into different sets

        userid: id of the target user.
        sessions: iterable of objects exposing `userid`, `mouseLog` and `keystrokeLog`.

        Raises ValueError when no session belongs to `userid`.
        """
        self.userid = userid
        self.X_train = None
        self.Y_train = None
        self.X_test = None
        self.Y_test = None

        self.SessionsX= []
        self.SessionsY = []
        self.othersSessionsX= []
        self.othersSessionsY= []

        # Ids of the other users chosen by train_test_data().
        self.seenUsers = None

        for session in sessions:
            if session.userid == self.userid:
                self.SessionsX.append(session)
                self.SessionsY.append(self.userid)
            else:
                self.othersSessionsX.append(session)
                self.othersSessionsY.append(session.userid)

        mouse_lengths = [len(_.mouseLog) for _ in self.SessionsX]
        if not mouse_lengths:
            # Without this check the mean of an empty list is NaN and int()
            # fails with an unrelated-looking ValueError.
            raise ValueError("No sessions found for user %r" % (userid,))

        self.meanMouseLength = int(np.mean(mouse_lengths))
        self.stdMouseLength = int(np.std(mouse_lengths))

        self.SessionsX, self.SessionsY, self.othersSessionsX, self.othersSessionsY = \
            np.array(self.SessionsX), np.array(self.SessionsY), np.array(self.othersSessionsX), np.array(self.othersSessionsY)

    def train_test_data(self,
                        test_size=(0.20, 0.50),
                        classes=0,
                        binary_classes=False,
                        instances=5,
                        random=3):
        """
        Return and set training and test dataset for self

        Keyword Arguments:
        test_size (float, float): test fraction for the target user's sessions
            and for the other users' sessions respectively.
        classes (int): Number of other users to mix in; 0 uses only the target user.
        binary_classes (bool): label every other-user session -1 instead of its user id.
        instances (int): Number of sessions to take from each selected other user (at most 25).
        random (int): random_state passed to both train_test_split calls. The
            choice of other users and the final test shuffle use numpy's
            global random state instead.

        Returns (X_train, X_test, Y_train, Y_test). For a negative `classes`
        the attributes are returned as they currently are.
        """
        X_train, X_test, Y_train, Y_test = \
            train_test_split(self.SessionsX, self.SessionsY, test_size=test_size[0], random_state=random)

        if classes == 0:
            self.X_train, self.X_test, self.Y_train, self.Y_test = \
                X_train, X_test, Y_train, Y_test

        elif classes > 0:
            assert instances <= 25, "Too many instances requested"
            other_classes = np.unique(self.othersSessionsY[self.othersSessionsY != self.userid])
            assert len(other_classes) >= classes, "Too many classes"

            # Pick `classes` other users at random.
            np.random.shuffle(other_classes)
            other_classes = other_classes[:classes]
            self.seenUsers = other_classes

            data_logger.info("Selected classes %s", other_classes)

            dataX = []
            dataY = []
            for each in other_classes:
                for session in self.othersSessionsX[self.othersSessionsY == each][:instances]:
                    dataX.append(session)
                    dataY.append(-1 if binary_classes else each)

            X_train_other, X_test_other, Y_train_other, Y_test_other = \
                train_test_split(np.array(dataX), np.array(dataY), test_size=test_size[1], random_state=random)

            self.X_train = np.concatenate((X_train, X_train_other), axis=0)
            self.X_test = np.concatenate((X_test, X_test_other), axis=0)
            self.Y_train = np.concatenate((Y_train, Y_train_other), axis=0)
            self.Y_test = np.concatenate((Y_test, Y_test_other), axis=0)

            # Shuffle the test rows (the training rows keep their order).
            assert len(self.X_test) == len(self.Y_test)
            p = np.random.permutation(len(self.X_test))
            self.X_test = self.X_test[p]
            self.Y_test = self.Y_test[p]

        return self.X_train, self.X_test, self.Y_train, self.Y_test

    def _require_split(self):
        """Assert that train_test_data() has populated all four data splits."""
        assert all([not each is None for each in [self.X_train,
                                                  self.X_test,
                                                  self.Y_train,
                                                  self.Y_test]]), "Run train_test_data first"

    def mouse_train_test_data(self, binary_classes=True):
        """
        Return the mouse logs of the current train/test split.

        binary_classes: when True the labels become 1 for the target user and
            0 for everyone else.

        Returns (X_train, X_test, Y_train, Y_test); the stored split is not modified.
        """
        self._require_split()

        X_train = np.array([session.mouseLog for session in self.X_train])
        X_test = np.array([session.mouseLog for session in self.X_test])
        Y_train = self.Y_train.copy()
        Y_test = self.Y_test.copy()

        if binary_classes:
            Y_train = np.where(Y_train == self.userid, 1, 0)
            Y_test = np.where(Y_test == self.userid, 1, 0)
        return X_train, X_test, Y_train, Y_test

    def keystroke_train_test_data(self, binary_classes=True):
        """
        Return the keystroke logs of the current train/test split.

        binary_classes: when True the labels become 1 for the target user and
            0 for everyone else.

        Returns (X_train, X_test, Y_train, Y_test); the stored split is not modified.
        """
        self._require_split()

        X_train = np.array([session.keystrokeLog for session in self.X_train])
        X_test = np.array([session.keystrokeLog for session in self.X_test])
        Y_train = self.Y_train.copy()
        Y_test = self.Y_test.copy()

        if binary_classes:
            Y_train = np.where(Y_train == self.userid, 1, 0)
            Y_test = np.where(Y_test == self.userid, 1, 0)

        return X_train, X_test, Y_train, Y_test




In [27]:
class Session:
    """One recorded session: the owning user id with its mouse and keystroke log."""

    def __init__(self, userid, mouseLog, keystrokeLog):
        # The arguments are stored as given, without copying or conversion.
        self.userid, self.mouseLog, self.keystrokeLog = userid, mouseLog, keystrokeLog
        
# Flatten the participants into Session objects, keeping the last
# `min_sessions` usable sessions of every participant that has enough of them.
min_sessions = 25
sessions = []
for _, user in enumerate(valid_participants):
    user_sessions = []
    for session in user:
        mouseLog = np.array(session['mouseLogs'][0], dtype='int32')
        # Key durations of the password logs followed by the username logs.
        keyLog = np.array(
            [entry['duration'] for entry in session['passwordLogs']+session['usernameLogs']],
            dtype='int32',
        )

        # A session is usable only when both logs are non-empty.
        if len(mouseLog) == 0 or len(keyLog) == 0:
            continue
        user_sessions.append(Session(_+1, mouseLog, keyLog))

    if len(user_sessions) >= min_sessions:
        sessions.extend(user_sessions[-min_sessions:])


In [55]:
# with open('valid_participants', 'rb') as f:
#     try:
#         pickle.load(f)
#     except:
#         return "It seems you did't dump the dataset in the first place"
#     return pickle.load(f)
        

# Pickle the Session objects.
# NOTE(review): this writes to 'valid_participants', the same file name
# get_valid_participants() uses for the raw participant dump — confirm that
# overwriting it is intended.
with open('valid_participants', 'wb') as f:
    pickle.dump(sessions, f)