<a href="https://colab.research.google.com/github/dylansoemitro/MouseBiometrics/blob/main/ML_Models_Balabit_Hyperparameter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

Dependencies

In [None]:
!pip install -U scikit-learn
!pip install -U imbalanced-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.1 MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.2 threadpoolctl-2.2.0
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 4.0 MB/s 
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.4.3
    Uninstalling imbalanced-learn-0.4.3:
      Successfully uninstalled imbalanced-learn-0.4.3
Successfully installed imbal

In [None]:
!pip install -U collinearity

Collecting collinearity
  Downloading collinearity-0.6.1.tar.gz (5.0 kB)
Building wheels for collected packages: collinearity
  Building wheel for collinearity (setup.py) ... [?25l[?25hdone
  Created wheel for collinearity: filename=collinearity-0.6.1-py3-none-any.whl size=4459 sha256=b17069cdfa44b6553a5825a3431104a3293b97e06e1d6bb7436de20d87897151
  Stored in directory: /root/.cache/pip/wheels/31/bf/74/0a475ad9095545c56fe02d678ccd38739baa81513e877d91ca
Successfully built collinearity
Installing collected packages: collinearity
Successfully installed collinearity-0.6.1


Imports

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import glob
import random
import time
import pickle
import csv
from datetime import datetime
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
from pathlib import Path

%tensorflow_version 2.x  # this line is not required unless you are in a notebook
import tensorflow as tf
import tensorflow.compat.v2.feature_column as fc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2
from collinearity import SelectNonCollinear

from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x  # this line is not required unless you are in a notebook`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


Mount Google Drive to access the datasets

In [None]:
import sys
import os
from google.colab import drive

# enable use of google drive
drive.mount('/content/drive')

# set the working directory
% cd /content/drive/My Drive/Colab Notebooks/2021 Mouse Biometrics Internship/

# Project directory on the mounted Drive.
project_folder = "/content/drive/My Drive/Colab Notebooks/2021 Mouse Biometrics Internship/"

# Remove every copy left over from earlier runs of this cell, then register
# the folder exactly once so re-running the cell does not grow sys.path.
while project_folder in sys.path:
  sys.path.remove(project_folder)
sys.path.append(project_folder)

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1k64oZq9UGrPEz-MMYl1rpRwF_LjXSCrB/2021 Mouse Biometrics Internship


# Preprocessing

## BB-MAS

In [None]:
# Empty placeholder; the next cell loads the CSV only while this is still empty.
bb_mas_df = pd.DataFrame()
bb_mas_path = "Data/PrepedCSVs/BB-MAS.csv"

In [None]:
# Read the prepared BB-MAS CSV unless a non-empty frame is already in memory.
if bb_mas_df.empty:
  bb_mas_df = pd.read_csv(bb_mas_path)

## TWOS

In [None]:
# Empty placeholder; the next cell loads the CSV only while this is still empty.
twos_path = "Data/PrepedCSVs/TWOS.csv"
twos_df = pd.DataFrame()

In [None]:
# Read the prepared TWOS CSV unless a non-empty frame is already in memory.
if twos_df.empty:
  twos_df = pd.read_csv(twos_path)

## Balabit

In [None]:
# Empty placeholder; the next cell loads the CSV only while this is still empty.
balabit_path = "Data/PrepedCSVs/Balabit.csv"
balabit_df = pd.DataFrame()

In [None]:
# Read the prepared Balabit CSV unless a non-empty frame is already in memory.
if balabit_df.empty:
  balabit_df = pd.read_csv(balabit_path)

# Features

## Window generator functions

In [None]:
def time_window(dataframe, delta_time, shift=None, drop_remainder=False):
  """Yield consecutive time-based slices of `dataframe`.

  Each window starts at row label `p0` and is extended row by row until the
  "time" value is at least `delta_time` later than the window start; the slice
  `dataframe.loc[p0:p]` (inclusive of `p`) is then yielded.

  Args:
    dataframe: frame with a "time" column. Row labels are stepped with
      `+= 1`, so the index must consist of consecutive integers.
    delta_time: window length, in the units of the "time" column. Must be
      positive, otherwise a window never advances.
    shift: if None, the next window starts on the last row of the previous
      one. Otherwise the next window starts on the first row whose time is at
      least `shift` later than the previous window's last row.
    drop_remainder: when True, the trailing window that reaches the last row
      of the frame is not yielded.

  Yields:
    DataFrame slices of `dataframe`.
  """
  if len(dataframe) == 0:
    return
  times = dataframe["time"]
  last = dataframe.index[-1]
  p0 = dataframe.index[0]
  while True:
    # The window would start on the final row (single-row frame, or `shift`
    # moved the start there): nothing left to scan, emit it as the remainder
    # instead of stepping past the end of the index.
    if p0 >= last:
      if not drop_remainder:
        yield dataframe.loc[p0:]
      return

    p = p0
    while times[p] < delta_time + times[p0]:
      p += 1
      if p == last:
        if not drop_remainder:
          yield dataframe.loc[p0:]
        return

    yield dataframe.loc[p0:p]
    if shift is None:
      p0 = p
    else:
      next_start_time = shift + times[p]
      while times[p0] < next_start_time:
        if p0 >= last:
          # The data ends before the next window would begin.
          return
        p0 += 1

## Feature Functions

In [None]:
def get_elapsed_time(user_data, start, end):
  """Return the "time" difference between rows `end` and `start`.

  The result is in whatever unit the "time" column uses. Returns 0 when `end`
  lies beyond the last row label of `user_data`.
  """
  if end <= user_data.index[-1]:
    timestamps = user_data["time"]
    return timestamps[end] - timestamps[start]
  return 0

In [None]:
def get_x_distance(user_data, start, end):
  """Return the signed change in "px" from row `start` to row `end`.

  Returns 0 when `end` lies beyond the last row label of `user_data`.
  """
  if end <= user_data.index[-1]:
    xs = user_data["px"]
    return xs[end] - xs[start]
  return 0

In [None]:
def get_y_distance(user_data, start, end):
  """Return the signed change in "py" from row `start` to row `end`.

  Returns 0 when `end` lies beyond the last row label of `user_data`.
  """
  if end <= user_data.index[-1]:
    ys = user_data["py"]
    return ys[end] - ys[start]
  return 0

In [None]:
def get_euclidean_distance(user_data, start, end):
  """Return the straight-line distance between rows `start` and `end`.

  Built from get_x_distance / get_y_distance. Returns 0 when `end` lies
  beyond the last row label of `user_data`.
  """
  if end <= user_data.index[-1]:
    dx = get_x_distance(user_data, start, end)
    dy = get_y_distance(user_data, start, end)
    return np.sqrt(np.square(dx) + np.square(dy))
  return 0

In [None]:
def get_manhattan_distance(user_data, start, end):
  """Return the Manhattan (L1) distance |dx| + |dy| between two rows.

  The per-axis deltas are taken in absolute value; summing the signed deltas
  would let movements in opposite directions cancel each other out.
  Returns 0 when `end` lies beyond the last row label of `user_data`.
  """
  if end > user_data.index[-1]:
    return 0
  dx = user_data["px"][end] - user_data["px"][start]
  dy = user_data["py"][end] - user_data["py"][start]
  return abs(dx) + abs(dy)

In [None]:
def get_x_velocity(user_data, start, end):
  """Return the x displacement divided by the elapsed time between two rows.

  Returns 0 when `end` lies beyond the last row label or when no time elapsed.
  """
  if end > user_data.index[-1]:
    return 0
  elapsed = get_elapsed_time(user_data, start, end)
  return 0 if elapsed == 0 else get_x_distance(user_data, start, end) / elapsed

In [None]:
def get_y_velocity(user_data, start, end):
  """Return the y displacement divided by the elapsed time between two rows.

  Returns 0 when `end` lies beyond the last row label or when no time elapsed.
  """
  if end > user_data.index[-1]:
    return 0
  elapsed = get_elapsed_time(user_data, start, end)
  return 0 if elapsed == 0 else get_y_distance(user_data, start, end) / elapsed

In [None]:
def get_speed(user_data, start, end):
  """Return the Euclidean distance divided by the elapsed time between two rows.

  Returns 0 when `end` lies beyond the last row label or when no time elapsed.
  """
  if end > user_data.index[-1]:
    return 0
  elapsed = get_elapsed_time(user_data, start, end)
  return 0 if elapsed == 0 else get_euclidean_distance(user_data, start, end) / elapsed

In [None]:
# TODO: verify accuracy
def get_angular_velocity(user_data, start, end):
  """Return an angle difference (degrees) divided by the elapsed time.

  The angle of each row is arctan2(py, px), i.e. the polar angle of the
  cursor position itself; the difference is wrapped into [0, 360).
  Returns 0 when `end` lies beyond the last row label or when no time elapsed.
  """
  if end > user_data.index[-1]:
    return 0
  elapsed = get_elapsed_time(user_data, start, end)
  start_angle = np.arctan2(user_data["py"][start], user_data["px"][start])
  end_angle = np.arctan2(user_data["py"][end], user_data["px"][end])
  swept_degrees = np.rad2deg((start_angle - end_angle) % (2 * np.pi))
  return 0 if elapsed == 0 else swept_degrees / elapsed

In [None]:
def get_acceleration(user_data, start, end):
  """Return the change in speed over the interval divided by its duration.

  The start speed is measured over rows (start, start+1) and the end speed
  over rows (end-1, end). Returns 0 when `end` lies beyond the last row label,
  when the interval spans fewer than 3 rows, or when no time elapsed.
  """
  if end > user_data.index[-1] or end - start < 3:
    return 0
  elapsed = get_elapsed_time(user_data, start, end)
  speed_at_start = get_speed(user_data, start, start + 1)
  speed_at_end = get_speed(user_data, end - 1, end)
  return 0 if elapsed == 0 else (speed_at_end - speed_at_start) / elapsed

In [None]:
def get_jerk(user_data, start, end):
  """Return the change in acceleration over the interval divided by its duration.

  The start acceleration is measured over rows (start, start+3) and the end
  acceleration over rows (end-3, end). Returns 0 when `end` lies beyond the
  last row label, when the interval spans fewer than 4 rows, or when no time
  elapsed.
  """
  if end > user_data.index[-1] or end - start < 4:
    return 0
  elapsed = get_elapsed_time(user_data, start, end)
  accel_at_start = get_acceleration(user_data, start, start + 3)
  accel_at_end = get_acceleration(user_data, end - 3, end)
  return 0 if elapsed == 0 else (accel_at_end - accel_at_start) / elapsed

In [None]:
# TODO: verify accuracy
def get_curvature(user_data, start, end):
  """Return an angle difference (degrees) divided by the distance travelled.

  The angle of each row is arctan2(py, px), i.e. the polar angle of the
  cursor position itself; the difference is wrapped into [0, 360).
  Returns 0 when `end` lies beyond the last row label or when the quotient is
  NaN or infinite (zero distance).
  """
  if end > user_data.index[-1]:
    return 0
  travelled = get_euclidean_distance(user_data, start, end)
  start_angle = np.arctan2(user_data["py"][start], user_data["px"][start])
  end_angle = np.arctan2(user_data["py"][end], user_data["px"][end])
  swept_degrees = np.rad2deg((start_angle - end_angle) % (2 * np.pi))

  curvature = swept_degrees / travelled
  return 0 if (np.isnan(curvature) or np.isinf(curvature)) else curvature

In [None]:
def get_curvature_change(user_data, start, end):
  """Return the change in curvature over the interval divided by its length.

  Curvature is sampled over rows (start, start+1) and (end-1, end); the
  difference is divided by the Euclidean distance between `start` and `end`.
  Returns 0 when `end` lies beyond the last row label, when the interval
  spans fewer than 3 rows, or when the quotient is NaN or infinite.
  """
  if end > user_data.index[-1] or end - start < 3:
    return 0
  curv_at_start = get_curvature(user_data, start, start + 1)
  curv_at_end = get_curvature(user_data, end - 1, end)
  travelled = get_euclidean_distance(user_data, start, end)

  change = (curv_at_end - curv_at_start) / travelled
  return 0 if (np.isnan(change) or np.isinf(change)) else change

In [None]:
def get_critical_points(user_data, start, end):
  """Return 1 when |curvature| over the interval exceeds pi/10, else 0.

  Returns 0 when `end` lies beyond the last row label.
  NOTE(review): get_curvature works in degrees per distance unit while the
  threshold looks like a radian value - confirm the intended unit.
  """
  if end > user_data.index[-1]:
    return 0
  is_critical = abs(get_curvature(user_data, start, end)) > np.pi / 10
  return 1 if is_critical else 0

In [None]:
def get_direction(user_data, start, end):
  """Return arctan(dy/dx) for the move from row `start` to row `end`.

  Special cases: 0 when `end` lies beyond the last row label or when there is
  no vertical movement; pi/2 when there is vertical but no horizontal
  movement. Because arctan of the ratio is used, the result does not
  distinguish opposite directions.
  """
  if end > user_data.index[-1]:
    return 0
  dy = get_y_distance(user_data, start, end)
  if dy == 0:
    return 0
  dx = get_x_distance(user_data, start, end)
  return np.pi / 2 if dx == 0 else np.arctan(dy/dx)

In [None]:
def get_angle_of_curvature(user_data, start, end):
  """Return the interior angle (radians) at row `start+1` of the triangle
  formed by rows `start`, `start+1` and `start+2`.

  Uses the law of cosines with the two sides that meet at the middle point:
  cos(angle) = (a^2 + b^2 - c^2) / (2ab), where a and b are the sides adjacent
  to the middle point and c is the chord from `start` to `start+2`.

  Returns 0 when `end` or `start+2` lies beyond the last row label, or when
  either adjacent side has zero length.
  """
  last = user_data.index[-1]
  # The triangle needs row start+2, which the `end` check alone does not cover.
  if end > last or start + 2 > last:
    return 0
  px, py = user_data["px"], user_data["py"]
  side_prev = np.hypot(px[start + 1] - px[start], py[start + 1] - py[start])
  side_next = np.hypot(px[start + 2] - px[start + 1], py[start + 2] - py[start + 1])
  chord = np.hypot(px[start + 2] - px[start], py[start + 2] - py[start])

  denominator = 2 * side_prev * side_next
  if denominator == 0:
    return 0
  cos_angle = (np.square(side_prev) + np.square(side_next) - np.square(chord)) / denominator
  # Rounding can push the ratio marginally outside [-1, 1]; clip so arccos
  # does not return NaN for (nearly) collinear points.
  return np.arccos(np.clip(cos_angle, -1.0, 1.0))

In [None]:
def get_curvature_distance(user_data, start, end):
  """Return the signed distance of row `start+1` from the line through rows
  `start` and `start+2`.

  Standard point-to-line formula with (x0, y0) = row start, (x1, y1) = row
  start+1, (x2, y2) = row start+2, dx = x2 - x0, dy = y2 - y0:
      (dy*x1 - dx*y1 + x2*y0 - y2*x0) / sqrt(dx^2 + dy^2)
  The sign tells on which side of the line the middle point lies.

  `end` is unused; it is accepted so the function fits the common
  (user_data, start, end) feature signature. Returns 0 when `start+2` lies
  beyond the last row label or when rows `start` and `start+2` coincide.
  """
  if start + 2 > user_data.index[-1]:
    return 0

  px, py = user_data["px"], user_data["py"]
  dx = px[start + 2] - px[start]
  dy = py[start + 2] - py[start]
  chord_length = np.sqrt(np.square(dx) + np.square(dy))
  if chord_length == 0:
    return 0
  numerator = (dy * px[start + 1] - dx * py[start + 1]
               + px[start + 2] * py[start] - py[start + 2] * px[start])
  return numerator / chord_length

In [None]:
def get_angle(user_data, start, end):
  """Return the angle (radians) at row `start` between the first and the last
  row of `user_data`.

  Uses the law of cosines on the triangle (first row, `start`, last row).
  The cosine is rejected only when it falls outside [-1, 1]; rejecting every
  positive cosine would throw away all acute angles.

  Returns 0 when `end` lies beyond the last row label, when the interval
  spans fewer than 3 rows, when `start` coincides with the first or last
  point, or when the cosine is out of range.
  """
  if end > user_data.index[-1]:
    return 0
  if end - start < 3:
    return 0
  first, last = user_data.index[0], user_data.index[-1]
  px, py = user_data["px"], user_data["py"]
  to_first = np.hypot(px[first] - px[start], py[first] - py[start])
  to_last = np.hypot(px[last] - px[start], py[last] - py[start])
  chord = np.hypot(px[last] - px[first], py[last] - py[first])

  denominator = 2 * to_first * to_last
  if denominator == 0:
    return 0
  cos_angle = (np.square(to_first) + np.square(to_last) - np.square(chord)) / denominator
  if cos_angle < -1 or cos_angle > 1:
    return 0
  return np.arccos(cos_angle)

In [None]:
#not done
def get_curve_length_ratio(user_data, start, end):
  """Placeholder: currently returns only the Euclidean distance between the
  two rows (no ratio is computed yet).

  Returns 0 when `end` lies beyond the last row label.
  """
  within_bounds = end <= user_data.index[-1]
  return get_euclidean_distance(user_data, start, end) if within_bounds else 0
  

In [None]:
def get_straightness(user_data, start, end):
  """Return end-to-end distance divided by the summed step lengths of the
  whole `user_data` window.

  `start` is not used and `end` only feeds the bounds guard; the value is the
  same for every in-bounds call on a given window. Returns 0 when `end` lies
  beyond the last row label or when the summed step length is 0.
  """
  if end > user_data.index[-1]:
    return 0
  first_row, last_row = user_data.index[0], user_data.index[-1]
  chord = get_euclidean_distance(user_data, first_row, last_row)
  path_length = sum(get_euclidean_distance(user_data, i, i+1) for i in user_data.index[:-1])
  return 0 if path_length == 0 else chord / path_length
  

In [None]:
def get_trajectory_center_of_mass(user_data, start, end):
  """Return distance(start, end) * elapsed_time(start, end) divided by the
  summed step lengths of the whole `user_data` window.

  Returns 0 when `end` lies beyond the last row label or when the summed step
  length is 0.
  """
  if end > user_data.index[-1]:
    return 0
  path_length = sum(get_euclidean_distance(user_data, i, i+1) for i in user_data.index[:-1])
  duration = get_elapsed_time(user_data, start, end)
  segment = get_euclidean_distance(user_data, start, end)
  return 0 if path_length == 0 else (segment*duration)/path_length


  

In [None]:
def get_scattering_coefficient(user_data, start, end):
  """Return (distance * elapsed_time^2 - TCM^2) / total step length.

  `dist` is the summed step length of the whole window; `TCM` accumulates
  get_trajectory_center_of_mass over the window. Returns 0 when `end` lies
  beyond the last row label or when the summed step length is 0.

  NOTE(review): the TCM loop never uses its loop variable `j`, so it adds the
  same (start, end) value once per row - presumably (j, j+1) was intended;
  confirm against the feature definition before enabling this feature.
  """
  if end > user_data.index[-1]:
    return 0
  dist = 0
  # total path length of the window
  for i in user_data.index[:-1]:
    dist += get_euclidean_distance(user_data, i, i+1)
  TCM = 0
  # each iteration calls get_trajectory_center_of_mass, which itself loops over the window
  for j in user_data.index[:-1]:
    TCM += get_trajectory_center_of_mass(user_data, start, end)
  elapsed_time = get_elapsed_time(user_data, start,end)
  current_distance = get_euclidean_distance(user_data,start,end)
  if dist == 0:
    return 0
  else:
    return (current_distance*np.square(elapsed_time)-np.square(TCM))/dist
  

In [None]:
def get_third_moment(user_data, start, end):
  """Return distance(start, end) * elapsed_time(start, end)^3 divided by the
  summed step lengths of the whole `user_data` window.

  Returns 0 when `end` lies beyond the last row label or when the summed step
  length is 0.
  """
  if end > user_data.index[-1]:
    return 0
  path_length = sum(get_euclidean_distance(user_data, i, i+1) for i in user_data.index[:-1])
  duration = get_elapsed_time(user_data, start, end)
  segment = get_euclidean_distance(user_data, start, end)
  return 0 if path_length == 0 else (segment*(duration**3))/path_length


In [None]:
def get_fourth_moment(user_data, start, end):
  """Return distance(start, end) * elapsed_time(start, end)^4 divided by the
  summed step lengths of the whole `user_data` window.

  Returns 0 when `end` lies beyond the last row label or when the summed step
  length is 0.
  """
  if end > user_data.index[-1]:
    return 0
  path_length = sum(get_euclidean_distance(user_data, i, i+1) for i in user_data.index[:-1])
  duration = get_elapsed_time(user_data, start, end)
  segment = get_euclidean_distance(user_data, start, end)
  return 0 if path_length == 0 else (segment*(duration**4))/path_length


In [None]:
# NOTE(review): unfinished - the expression below is identical to
# get_third_moment; the intended trajectory-curvature formula still has to be
# confirmed.
def get_trajectory_curvature(user_data, start, end):
  """Return distance(start, end) * elapsed_time(start, end)^3 divided by the
  summed step lengths of the whole `user_data` window.

  Returns 0 when `end` lies beyond the last row label or, like the sibling
  moment features, when the summed step length is 0 (instead of dividing by
  zero).
  """
  if end > user_data.index[-1]:
    return 0
  dist = 0
  for i in user_data.index[:-1]:
    dist += get_euclidean_distance(user_data, i, i+1)
  if dist == 0:
    # A window with no movement has no defined value; match get_third_moment.
    return 0
  elapsed_time = get_elapsed_time(user_data, start, end)
  current_distance = get_euclidean_distance(user_data, start, end)
  return (current_distance*(elapsed_time**3))/dist


In [None]:

def get_deviation(user_data, start, end):
  """Return the signed distance of row `start` from the straight line joining
  the first and last rows of `user_data`.

  Standard point-to-line formula with (xf, yf) = first row, (xl, yl) = last
  row, (xs, ys) = row `start`, dx = xl - xf, dy = yl - yf:
      (dy*xs - dx*ys + xl*yf - yl*xf) / sqrt(dx^2 + dy^2)
  The sign tells on which side of the line the point lies.

  Returns 0 when `end` lies beyond the last row label or when the first and
  last rows coincide.
  """
  if end > user_data.index[-1]:
    return 0
  first, last = user_data.index[0], user_data.index[-1]
  px, py = user_data["px"], user_data["py"]
  dx = px[last] - px[first]
  dy = py[last] - py[first]
  line_length = np.sqrt(np.square(dx) + np.square(dy))
  if line_length == 0:
    return 0
  # The constant term uses the line's two end points (first and last rows).
  numerator = dy * px[start] - dx * py[start] + px[last] * py[first] - py[last] * px[first]
  return numerator / line_length

In [None]:
def get_velocity_curvature(user_data, start, end):
  """Return jerk / (1 + acceleration^2)^(3/2) over the interval.

  Returns 0 when `end` lies beyond the last row label or when the
  acceleration over the interval is 0.
  """
  if end > user_data.index[-1]:
    return 0
  jerk_value = get_jerk(user_data, start, end)
  accel_value = get_acceleration(user_data, start, end)
  return 0 if accel_value == 0 else jerk_value/((1+accel_value**2)**(3/2))

In [None]:
class feature:
  """Pair a per-interval feature function with an aggregation function.

  feature_func: callable(user_data, start, end) returning a number.
  return_func:  callable(list_of_values) producing the final feature value.
  offset:       distance between `start` and `end` for each evaluation; None
                evaluates the function once over the whole window.
  """

  def __init__(self, feature_func, return_func, offset=1):
    self.feature_func, self.return_func, self.offset = feature_func, return_func, offset

  def get_feature(self, user_data):
    """Evaluate the feature over `user_data` and aggregate the results.

    With an offset, the function is called for every row label i as
    (i, i + offset) and NaN results are dropped before aggregation.
    """
    if self.offset is None:
      values = [self.feature_func(user_data, user_data.index[0], user_data.index[-1])]
    else:
      raw_values = (self.feature_func(user_data, i, i+self.offset) for i in user_data.index)
      values = [v for v in raw_values if not np.isnan(v)]
    return self.return_func(values)

Custom return functions

In [None]:
def nz_max(input_array):
  """Return the maximum of the non-zero entries, or 0 if there are none."""
  non_zero = list(filter(lambda value: value != 0, input_array))
  return np.max(non_zero) if len(non_zero) != 0 else 0

def nz_min(input_array):
  """Return the minimum of the non-zero entries, or 0 if there are none."""
  non_zero = list(filter(lambda value: value != 0, input_array))
  return np.min(non_zero) if len(non_zero) != 0 else 0

def nz_range(input_array):
  """Return nz_max minus nz_min of the input (0 when no entry is non-zero)."""
  highest = nz_max(input_array)
  lowest = nz_min(input_array)
  return highest - lowest

def nz_std(input_array):
  """Return the standard deviation (np.std) of the non-zero entries, or 0 if
  there are none."""
  non_zero = list(filter(lambda value: value != 0, input_array))
  return np.std(non_zero) if len(non_zero) != 0 else 0


def nz_mean(input_array):
  """Return the mean of the non-zero entries, or 0 if there are none."""
  non_zero = list(filter(lambda value: value != 0, input_array))
  return np.mean(non_zero) if len(non_zero) != 0 else 0



## Feature Extraction

In [None]:
def extract_features(train_path, test_path, data, user_ids, feature_functions, test_size=0.3, min_data=2000):
  """Compute windowed features per user and persist train/test CSV files.

  If either CSV already exists the user is asked whether to overwrite. Any
  answer other than "y" loads and returns the two existing files. On "y",
  users whose id is already present in `train_path` are skipped as long as
  nothing has been extracted in this run (resume support), and freshly
  extracted rows are written to disk after every user.

  Args:
    train_path, test_path: CSV locations for the two splits.
    data: frame with a "user_id" column plus the columns the feature
      functions read (also "time", used by time_window).
    user_ids: ids to process; id 0.0 is ignored.
    feature_functions: dict name -> object with get_feature(window).
    test_size: fraction of each user's windows (the chronologically last
      ones, the split is not shuffled) that goes to the test split.
    min_data: users with fewer rows than this are skipped.

  Returns:
    (train_df, test_df). After an extraction run these hold only the users
    extracted in this run; otherwise the contents of the existing files.
  """
  window_length = 5000  # delta_time passed to time_window
  window_offset = None  # shift passed to time_window (None: back-to-back windows)

  def _same_id(cell, user):
    # CSV cells are text; compare numerically when possible so "7" matches
    # 7 and 7.0 but never 17 or 70 (a substring test would match those).
    try:
      return float(cell) == float(user)
    except (TypeError, ValueError):
      return cell.strip() == str(user)

  def _persist(frame, path, fresh):
    # Start a new file (with header) or append rows to the existing one.
    if fresh or not os.path.exists(path):
      frame.to_csv(path, index=False)
    else:
      frame.to_csv(path, mode='a', header=False, index=False)

  cols = list(["user_id"]) + list(feature_functions.keys())
  train_df = pd.DataFrame(columns=cols)
  test_df = pd.DataFrame(columns=cols)
  res = "y"
  if (os.path.exists(train_path) or os.path.exists(test_path)):
    res = input("Do you want to overwrite feature data?\n[y][n]: ")
  if res == "y":
    for user in user_ids:
      # Resume: until something has been extracted in this run, skip users
      # that an earlier (interrupted) run already wrote to the training file.
      if len(train_df.index) == 0 and os.path.exists(train_path):
        with open(train_path, 'rt') as f:
          already_done = any(row and _same_id(row[0], user) for row in csv.reader(f, delimiter=","))
        if already_done:
          continue
      if user != 0.0:
        user_data = data[data["user_id"] == user]
        if len(user_data.index) < min_data:
          print("Skipping user: {}".format(user), len(user_data.index), len(user_data))
          continue
        print("Extracting features from user: {}".format(user))
        # one list per output column
        user_feature = {key : [] for key in cols}

        # one feature row per time window
        for window in time_window(user_data, window_length, window_offset):
          user_feature["user_id"].append(user)
          for key in feature_functions:
            f = feature_functions[key].get_feature(window)
            if np.isnan([f]):
              print(key, f)
            user_feature[key].append(f)

        # chronological split: honour the caller's test_size
        user_train, user_test = train_test_split(pd.DataFrame.from_dict(user_feature), test_size=test_size, shuffle=False)
        train_df = pd.concat([train_df, user_train])
        test_df = pd.concat([test_df, user_test])

        # Write only this user's rows; appending the accumulated frames would
        # duplicate every previously written user.
        is_first_user = user == user_ids[0]
        if not is_first_user:
          train_df = train_df.reset_index(drop = True)
          test_df = test_df.reset_index(drop = True)
        _persist(user_train, train_path, is_first_user)
        _persist(user_test, test_path, is_first_user)

  else:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

  return train_df, test_df

In [None]:
# Feature registry: output column name -> feature(feature_func, return_func[, offset]).
# extract_features uses the keys (in this order) as CSV column names and calls
# get_feature(window) on every value. The third argument is the row offset
# between `start` and `end` for each evaluation (default 1).
feature_functions = {
  # whole-window aggregates
  "elapsed_time": feature(get_elapsed_time, np.sum),
  "critical_points": feature(get_critical_points, np.sum),
  "stroke-length": feature(get_euclidean_distance, np.sum),
  "straightness": feature(get_straightness, nz_max),
  "trajectory_center_of_mass": feature(get_trajectory_center_of_mass, np.sum),
  #"scattering_coefficient": feature(get_scattering_coefficient, np.sum),
  "third_moment": feature(get_third_moment, np.sum),
  "fourth_moment": feature(get_fourth_moment, np.sum),
  "velocity_curvature": feature(get_velocity_curvature, nz_mean, 4),
 
  # per-quantity statistics over the non-zero values: mean / max / min / std / range
  "xvelocity-mean": feature(get_x_velocity, nz_mean),
  "xvelocity-maximum": feature(get_x_velocity, nz_max), 
  "xvelocity-minimum": feature(get_x_velocity, nz_min), 
  "xvelocity-std": feature(get_x_velocity, nz_std), 
  "xvelocity-range": feature(get_x_velocity, nz_range),
  "yvelocity-mean": feature(get_y_velocity, nz_mean),
  "yvelocity-maximum": feature(get_y_velocity, nz_max), 
  "yvelocity-minimum": feature(get_y_velocity, nz_min), 
  "yvelocity-std": feature(get_y_velocity, nz_std), 
  "yvelocity-range": feature(get_y_velocity, nz_range),
  "tangential-velocity-mean": feature(get_speed, nz_mean),
  "tangential-velocity-maximum": feature(get_speed, nz_max), 
  "tangential-velocity-minimum": feature(get_speed, nz_min), 
  "tangential-velocity-std": feature(get_speed, nz_std), 
  "tangential-velocity-range": feature(get_speed, nz_range),
  "acceleration-mean": feature(get_acceleration, nz_mean,3),
  "acceleration-maximum": feature(get_acceleration, nz_max,3), 
  "acceleration-minimum": feature(get_acceleration, nz_min,3), 
  "acceleration-std": feature(get_acceleration, nz_std,3), 
  "acceleration-range": feature(get_acceleration, nz_range,3), 
  "jerk-mean": feature(get_jerk, nz_mean, 4),
  "jerk-maximum": feature(get_jerk, nz_max, 4), 
  "jerk-minimum": feature(get_jerk, nz_min, 4), 
  "jerk-std": feature(get_jerk, nz_std, 4), 
  "jerk-range": feature(get_jerk, nz_range, 4),  
  "angular_velocity-mean": feature(get_angular_velocity, nz_mean),
  "angular_velocity-maximum": feature(get_angular_velocity, nz_max), 
  "angular_velocity-minimum": feature(get_angular_velocity, nz_min), 
  "angular_velocity-std": feature(get_angular_velocity, nz_std), 
  "angular_velocity-range": feature(get_angular_velocity, nz_range),
  "curvature-mean": feature(get_curvature, nz_mean),
  "curvature-maximum": feature(get_curvature, nz_max), 
  "curvature-minimum": feature(get_curvature, nz_min), 
  "curvature-std": feature(get_curvature, nz_std), 
  "curvature-range": feature(get_curvature, nz_range),
  "curvature_change-mean": feature(get_curvature_change, nz_mean,3),
  "curvature_change-maximum": feature(get_curvature_change, nz_max,3), 
  "curvature_change-minimum": feature(get_curvature_change, nz_min,3), 
  "curvature_change-std": feature(get_curvature_change, nz_std,3), 
  "curvature_change-range": feature(get_curvature_change, nz_range,3),
  "direction-mean": feature(get_direction, nz_mean),
  "direction-maximum": feature(get_direction, nz_max), 
  "direction-minimum": feature(get_direction, nz_min), 
  "direction-std": feature(get_direction, nz_std), 
  "direction-range": feature(get_direction, nz_range),
  "angle-mean": feature(get_angle, nz_mean,3),
  "angle-maximum": feature(get_angle, nz_max,3), 
  "angle-minimum": feature(get_angle, nz_min,3), 
  "angle-std": feature(get_angle, nz_std,3), 
  "angle-range": feature(get_angle, nz_range,3),
  "curvature_distance-mean": feature(get_curvature_distance, nz_mean),
  "curvature_distance-maximum": feature(get_curvature_distance, nz_max), 
  "curvature_distance-minimum": feature(get_curvature_distance, nz_min), 
  "curvature_distance-std": feature(get_curvature_distance, nz_std), 
  "curvature_distance-range": feature(get_curvature_distance, nz_range),
  "deviation-mean": feature(get_deviation, nz_mean),
  "deviation-maximum": feature(get_deviation, nz_max), 
  "deviation-minimum": feature(get_deviation, nz_min), 
  "deviation-std": feature(get_deviation, nz_std), 
  "deviation-range": feature(get_deviation, nz_range)
  # disabled: angle-of-curvature statistics
  # "angle_of_curvature-mean": feature(get_angle_of_curvature, nz_mean),
  # "angle_of_curvature-maximum": feature(get_angle_of_curvature, nz_max), 
  # "angle_of_curvature-minimum": feature(get_angle_of_curvature, nz_min), 
  # "angle_of_curvature_curvature-std": feature(get_angle_of_curvature, nz_std), 
  # "angle_of_curvature-range": feature(get_angle_of_curvature, nz_range),  
}

In [None]:
#   # bb-mas features
# bb_mas_train_df, bb_mas_test_df = extract_features(
#    "Data/Features/Training_BB-MAS_Features.csv",
#    "Data/Features/Testing_BB-MAS_Features.csv",
#     bb_mas_df, bb_mas_df["user_id"].unique(), 
#    feature_functions, 0.3
# )

# # # twos features
# twos_train_df, twos_test_df = extract_features(
#  "Data/Features/Training_TWOS_Features.csv",
#  "Data/Features/Testing_TWOS_Features.csv",
#  twos_df, twos_df["user_id"].unique(), 
#  feature_functions, 0.3
# )

# Balabit features: extract (or load the cached CSVs) with a 70/30 split.
balabit_train_df, balabit_test_df = extract_features(
  train_path="Data/Features/Training_Balabit_Features.csv",
  test_path="Data/Features/Testing_Balabit_Features.csv",
  data=balabit_df,
  user_ids=balabit_df["user_id"].unique(),
  feature_functions=feature_functions,
  test_size=0.3,
)

Do you want to overwrite feature data?
[y][n]: n


# Models

A utility function that creates a `tf.data` dataset from feature and label arrays (for example, the values of a pandas DataFrame)

In [None]:
# def df_to_dataset(dataframe, y_label, shuffle=True, batch_size=32):
#   dataframe = dataframe.copy() 
#   labels = dataframe.pop(y_label)
#   ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
#   if shuffle:
#     ds = ds.shuffle(buffer_size=len(dataframe))
#   ds = ds.batch(batch_size)
#   return ds

def df_to_dataset(X, y, shuffle=True, batch_size=32):
  """Build a batched tf.data.Dataset from features `X` and labels `y`.

  Args:
    X: features, sliced along the first axis.
    y: labels, one per row of `X`.
    shuffle: when True, shuffle with a buffer covering every row of `X`.
    batch_size: number of rows per batch.

  Returns:
    The batched dataset.
  """
  ds = tf.data.Dataset.from_tensor_slices((X, y))
  if shuffle:
    # Size the buffer from the actual input; the former `dataframe` name does
    # not exist in this version of the function.
    ds = ds.shuffle(buffer_size=len(X))
  ds = ds.batch(batch_size)
  return ds

In [None]:
def prep_feature_df(user_data, user_id):
  """Split a feature frame into model inputs and one-vs-rest labels.

  Returns:
    X: copy of `user_data` without the "user_id" column.
    y: list of single-element lists, [1] for rows belonging to `user_id`
       and [0] for every other row.
  """
  y = [[int(user_id == row_user)] for row_user in user_data["user_id"]]
  X = user_data.drop(columns="user_id")
  return X, y

## ML Models

In [None]:
# hyperparameter optimization

def _grid_search_and_report(X, y, path, model, grid, file_name):
  """Run the shared grid search, print every result and save them as CSV.

  Uses repeated stratified 10-fold CV (3 repeats, fixed seed) scored by F1;
  parameter combinations that fail to fit are scored 0 (error_score=0).
  One row per parameter combination plus a "score" column (mean test score)
  is written to `path`/`file_name`.
  """
  cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
  grid_result = grid_search.fit(X, y)
  # summarize results
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  for mean, stdev, param in zip(means, stds, params):
      print("%f (%f) with: %r" % (mean, stdev, param))
  # cv_results_['params'] is a list of dicts: turn it into a table first and
  # add the score as a column (item assignment on the list itself fails).
  report = pd.DataFrame(list(params))
  report['score'] = means
  report.to_csv(path + "/" + file_name)


def LogisticRegression_optimize(X, y, path, model):
  """Grid-search solver, penalty and C for a logistic regression model.

  Solver/penalty pairs the estimator rejects are scored 0 by the search.
  Results are saved to `path`/Best_Params_LR.csv.
  """
  solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
  penalty = ['none', 'l1', 'l2', 'elasticnet']
  c_values = [100, 10, 1.0, 0.1, 0.01]
  grid = dict(solver=solvers,penalty=penalty,C=c_values)
  _grid_search_and_report(X, y, path, model, grid, "Best_Params_LR.csv")


def KNN_optimize(X, y, path, model):
  """Grid-search the neighbour count (odd values 1..19) for a KNN model.

  Results are saved to `path`/Best_Params_KNN.csv.
  NOTE(review): metric='mahalanobis' normally needs a covariance matrix via
  metric_params; if the estimator rejects the setting every candidate is
  scored 0 by error_score - confirm the scores are real.
  """
  n_neighbors = range(1, 21, 2)
  #weights = ['uniform', 'distance']
  #metric = ['euclidean', 'manhattan', 'minkowski', 'mahalanobis']
  weights = ['distance']
  metric = ['mahalanobis']
  grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
  _grid_search_and_report(X, y, path, model, grid, "Best_Params_KNN.csv")


def SVM_optimize(X, y, path, model):
  """Grid-search kernel and C (gamma fixed to 'scale') for an SVM model.

  Results are saved to `path`/Best_Params_SVM.csv.
  """
  kernel = ['poly', 'rbf', 'sigmoid']
  C = [100, 10, 1.0, 0.1, 0.01]
  gamma = ['scale']
  grid = dict(kernel=kernel,C=C,gamma=gamma)
  _grid_search_and_report(X, y, path, model, grid, "Best_Params_SVM.csv")


def RandomForest_optimize(X, y, path, model):
  """Grid-search n_estimators and max_features for a random forest model.

  Results are saved to `path`/Best_Params_RF.csv.
  """
  n_estimators = [10, 100, 1000]
  max_features = ['sqrt', 'log2']
  grid = dict(n_estimators=n_estimators,max_features=max_features)
  _grid_search_and_report(X, y, path, model, grid, "Best_Params_RF.csv")


In [None]:
# models = {
#   "SVM": svm.SVC()
#   "KNN-7": KNeighborsClassifier(n_neighbors=7),
#   "KNN-9": KNeighborsClassifier(n_neighbors=9),
#   "Random Forest": RandomForestClassifier(max_depth=2, random_state=0),
#   "Random Forest2": RandomForestClassifier(random_state=1),
#   "KNN-5": KNeighborsClassifier(n_neighbors=5),
#   "KNN-3": KNeighborsClassifier(n_neighbors=3),
#   "LR": LogisticRegression(random_state=0),
#   "GNB": GaussianNB()
# }

# Estimators to tune, keyed by the name used for dispatch and printed output.
models = {
  #"SVM": svm.SVC(),
  "KNN": KNeighborsClassifier(),
  # "Random Forest": RandomForestClassifier(),
  # "LR": LogisticRegression()
}
# Model name -> tuning routine. Keys must match the keys of `models`; both
# spellings of the random-forest key are accepted so enabling the
# "Random Forest" entry above actually runs its optimizer.
optimizers = {
  "SVM": SVM_optimize,
  "KNN": KNN_optimize,
  "Random Forest": RandomForest_optimize,
  "RandomForest": RandomForest_optimize,
  "LR": LogisticRegression_optimize,
}
user_id = balabit_df["user_id"].unique()
# Placeholder result table; nothing in this cell fills it any more.
accuracy = {key: [] for key in ["user_id"] + list(models.keys())}
k_best = {}  # user -> feature columns kept after selection
train_df = balabit_train_df
random.seed(12345)
# To tune every user: sample_users = balabit_train_df["user_id"].unique()
sample_users = [7]
for user in sample_users:
  print("User {}".format(user))
  X_train, y_train = prep_feature_df(train_df, user)

  # make sure there are enough true users
  if (np.sum(y_train) < 10):
    continue
  # oversample the minority class so both classes are balanced
  sm = SMOTE(sampling_strategy="minority", random_state=0)
  X_res, y_train = sm.fit_resample(X_train, y_train)
  X_train = pd.DataFrame(X_res, columns=X_train.columns)

  # feature profile, step 1: drop collinear features (threshold 0.8)
  selector = SelectNonCollinear(0.8)
  selector.fit(np.array(X_train), np.array(y_train))
  mask = selector.get_support()
  X_train = X_train[X_train.columns[mask]]

  # feature profile, step 2: keep the best-scoring half of what is left
  # (only the support mask is needed, so fit() is enough)
  fs = SelectPercentile(percentile=50)
  fs.fit(X_train, y_train)
  mask = fs.get_support()
  X_train = X_train[X_train.columns[mask]]

  k_best[user] = X_train.columns

  # output folder for this user's tuning results
  path = "Models/Balabit/{}".format(user)
  if not os.path.isdir(path):
    print("new path")
    os.makedirs(path)

  # NOTE(review): fitting and pickling the models and saving features.csv is
  # no longer done here, although the evaluation cell further down reads
  # those files - confirm where they are produced.
  for key in models:
    print(key)
    optimize = optimizers.get(key)
    if optimize is not None:
      optimize(X_train, y_train, path, models[key])
  clear_output()

User 7
KNN


In [None]:

# Persist the training scores held in `accuracy` (one column per dict key) as CSV.
pd.DataFrame(accuracy).to_csv("BB-MAS-training-balanced-accuracy.csv")

In [None]:
# Evaluate every saved per-user model on the test set and record the
# false-accept rate (FAR), false-reject rate (FRR) and half-total error
# rate (HTER) for each model, then export those three tables as CSV.
#
# NOTE(review): the test frame is `bb_mas_test_df` while models and outputs
# use the "Balabit" name — confirm the intended dataset.
test_df = bb_mas_test_df
#sample_users = train_df["user_id"].unique()
sample_users = [7]

# One table per metric: a "user_id" column plus one column per model name.
metric_columns = ["user_id"] + list(models.keys())
eval_accuracy = {key: [] for key in metric_columns}
eval_auc = {key: [] for key in metric_columns}
eval_far = {key: [] for key in metric_columns}
eval_frr = {key: [] for key in metric_columns}
eval_eer = {key: [] for key in metric_columns}
eval_hter = {key: [] for key in metric_columns}

for user in sample_users:
    print(user)
    model_path = "Models/Balabit/{}".format(user)
    if not os.path.isdir(model_path):
        # No saved models for this user, so there is nothing to evaluate.
        continue

    # features.csv holds the selected feature names in its "0" column.
    features = pd.read_csv(model_path + '/features.csv')
    X_test, y_test = prep_feature_df(test_df, user)
    X_test = X_test[features["0"]]

    # Record the user once in every table populated below. Without this the
    # "user_id" column stays empty while the model columns grow, and
    # DataFrame.from_dict raises because the columns have unequal lengths.
    for metric_table in (eval_hter, eval_far, eval_frr):
        metric_table["user_id"].append(user)

    for key in models:
        print(key)
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model files produced by this project.
        with open(model_path + "/{}.pkl".format(key), 'rb') as f:
            loaded_model = pickle.load(f)
        predicted_y = loaded_model.predict(X_test)
        # NOTE(review): unpacking four cells assumes both classes occur in
        # y_test/predicted_y; with a single class the matrix is 1x1.
        tn, fp, fn, tp = confusion_matrix(y_test, predicted_y).ravel()
        far = fp / (fp + tn)
        frr = fn / (fn + tp)
        hter = (far + frr) / 2

        # Disabled score-based metrics (accuracy, AUC, EER):
        # score = loaded_model.score(X_test, y_test)
        # auc = roc_auc_score(y_test, loaded_model.predict_proba(X_test)[:,1])
        # far, tpr, threshold = roc_curve(y_test, loaded_model.predict_proba(X_test)[:,1], pos_label=1)
        # frr = 1 - tpr
        # EER = far[np.nanargmin(np.absolute((frr - far)))]
        # eval_auc[key].append(auc)
        # eval_eer[key].append(EER)
        eval_hter[key].append(hter)
        eval_far[key].append(far)
        eval_frr[key].append(frr)

        # print("score: ", score)
        # print("auc: ", auc)
        print("far: ", far)
        print("frr: ", frr)
        print("hter: ", hter)

pd.DataFrame.from_dict(eval_hter).to_csv("Balabit_test_hter.csv")
# pd.DataFrame.from_dict(eval_auc).to_csv("BB-MAS_test_AUC.csv")
pd.DataFrame.from_dict(eval_far).to_csv("Balabit_test_far.csv")
pd.DataFrame.from_dict(eval_frr).to_csv("Balabit_test_frr.csv")
#pd.DataFrame.from_dict(eval_eer).to_csv("BB-MAS_test_eer.csv")
# eval_accuracy is not filled in by this cell (the score computation above
# is disabled), so this prints empty lists.
print(eval_accuracy)


In [None]:
# Print the collected evaluation metrics, attach the user ids to each metric
# table and export every table as CSV.
#
# NOTE(review): the output names say "BB-MAS" while the evaluation cell loads
# models from Models/Balabit — confirm which dataset these files describe.
print("accuracy: ", eval_accuracy)
print("auc: ", eval_auc)
print("eer: ", eval_eer)

# (metric table, destination file), in export order.
eval_outputs = [
    (eval_accuracy, "BB-MAS_test_accuracy.csv"),
    (eval_auc, "BB-MAS_test_AUC.csv"),
    (eval_far, "BB-MAS_test_far.csv"),
    (eval_frr, "BB-MAS_test_frr.csv"),
    (eval_eer, "BB-MAS_test_eer.csv"),
]

for metric_table, _ in eval_outputs:
    metric_table['user_id'] = sample_users
print(eval_accuracy)

for metric_table, csv_name in eval_outputs:
    # A table is only exportable when every column has the same number of
    # entries. Metrics that were never recorded (or not recorded for every
    # user) would make DataFrame.from_dict raise and abort the remaining
    # exports, so they are reported and skipped instead.
    column_lengths = {len(values) for values in metric_table.values()}
    if len(column_lengths) > 1:
        print("skipping {}: columns have unequal lengths".format(csv_name))
        continue
    pd.DataFrame.from_dict(metric_table).to_csv(csv_name)

In [None]:

# Show the recorded test accuracies of selected models, one list per line.
# (The export of eval_accuracy to "test_eval_accuracy.csv" is disabled here.)
for model_name in ('LR', 'GNB', 'Random Forest', 'Random Forest2'):
    print(eval_accuracy[model_name])

In [None]:
# Draw one bar chart of per-user test accuracy for each model column stored
# in test_eval_accuracy.csv.
features = pd.read_csv("test_eval_accuracy.csv")
# One bar per CSV row. The positions were previously hard-coded to 99, which
# makes plt.bar fail whenever the file has a different number of rows.
x = list(range(len(features)))

# Column names in the CSV, in figure order (figure 1 .. figure 9).
model_columns = [
    'Random Forest',
    'Random Forest2',
    'KNN-5',
    'KNN-3',
    'LR',
    'GNB',
    'SVM',
    'KNN-7',
    'KNN-9',
]

for figure_number, model_name in enumerate(model_columns, start=1):
    plt.figure(figure_number)
    y = features[model_name]
    plt.bar(x, y)
    # Axis labels must be set after the figure is selected; setting them once
    # up front only labels the first figure.
    plt.xlabel("User")
    plt.ylabel("Accuracy")
    plt.title('{} Test Accuracy'.format(model_name))


In [None]:
# Print how many entries each column of `accuracy` holds, one count per line.
for key, scores in accuracy.items():
  print(len(scores))

In [None]:
# Dump the raw `accuracy` dict to the cell output for inspection.
print(accuracy)

In [None]:
# Turn the `accuracy` dict into a table (one column per key) and print it.
acc_df = pd.DataFrame(accuracy)
print(acc_df)