In [1]:
from google.colab import drive

ROOT = "/content/drive"
print(ROOT)
drive.mount(ROOT)

/content/drive
Mounted at /content/drive


In [2]:
%cd drive/MyDrive/USyd/Lab/Projects/Grassmann-ADMM/

/content/drive/MyDrive/USyd/Lab/Projects/Grassmann-ADMM


## Test Federated PCA performance ##

In [3]:
import numpy as np
# V_k = np.load('Abnormaldetection_KDD_dim_9_std_client_20.npy')
# V_k = np.load('Abnormaldetection_KDD_dim_9_std_client_20_iter_100_learningrate_1e-05.npy')
V_k = np.load('Grassman_Abnormaldetection_KDD_dim_9_std_client_20_iter_1000_lr_0.0001_sub_0.1.npy')
V_k.shape

(33, 9)

In [4]:
# PCA transform with zero mean
def self_pca_transform_with_zero_mean(X_train, V_k):
  return (X_train).dot(V_k)
  
# PCA inverse transform with zero mean
def self_inverse_transform_with_zero_mean(X_pca, V_k):
  return (X_pca.dot(V_k.T))

In [5]:
'''
Import necessary libraries
'''

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
%matplotlib inline

In [6]:
'''
Get KDD dataset from original source and store as a csv file
'''
url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
df = pd.read_csv(url, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [7]:
'''
Put names on columns of KDD dataset because KDD dataset do not have names in advance
'''
# Assign names for columns
df.columns= [ 'duration','protocol_type', 'service', 'flag', 'src_bytes','dst_bytes','land','wrong_fragment','urgent','hot','num_failed_logins','logged_in',
'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
'is_host_login', 'is_guest_login','cnt','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate',
'diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate','outcome']

# Show the attributes for KDD dataset
print(df.describe())

            duration     src_bytes     dst_bytes           land  \
count  494021.000000  4.940210e+05  4.940210e+05  494021.000000   
mean       47.979302  3.025610e+03  8.685324e+02       0.000045   
std       707.746472  9.882181e+05  3.304000e+04       0.006673   
min         0.000000  0.000000e+00  0.000000e+00       0.000000   
25%         0.000000  4.500000e+01  0.000000e+00       0.000000   
50%         0.000000  5.200000e+02  0.000000e+00       0.000000   
75%         0.000000  1.032000e+03  0.000000e+00       0.000000   
max     58329.000000  6.933756e+08  5.155468e+06       1.000000   

       wrong_fragment         urgent            hot  num_failed_logins  \
count   494021.000000  494021.000000  494021.000000      494021.000000   
mean         0.006433       0.000014       0.034519           0.000152   
std          0.134805       0.005510       0.782103           0.015520   
min          0.000000       0.000000       0.000000           0.000000   
25%          0.000000     

In [8]:
# Choose categorical columns to remove because PCA show good performance with numeric data
columns_drop = ['num_outbound_cmds','is_host_login','protocol_type','service','flag','land', 'logged_in','is_guest_login']

In [9]:
# Check the number of feature before removing
df.columns.shape

(42,)

In [10]:
# Check the type of outcome. 'normal.' represents normal record, while the rest types illustrate attack methods
df['outcome'].unique()

array(['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.',
       'smurf.', 'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.',
       'ipsweep.', 'land.', 'ftp_write.', 'back.', 'imap.', 'satan.',
       'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
       'spy.', 'rootkit.'], dtype=object)

In [11]:
# Remove the categorical data
df.drop(columns_drop, axis=1, inplace=True)

In [12]:
# Check number of feature after removing
df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,181,5450,0,0,0,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,239,486,0,0,0,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,235,1337,0,0,0,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,219,1337,0,0,0,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,217,2032,0,0,0,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [13]:
'''
Get all normal data
'''
df_normal = df[df['outcome']=='normal.']
df_normal.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,181,5450,0,0,0,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,239,486,0,0,0,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,235,1337,0,0,0,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,219,1337,0,0,0,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,217,2032,0,0,0,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [14]:
'''
Get all abnormal data 
'''
df_abnormal = df[df['outcome']!='normal.']
df_abnormal.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
744,184,1511,2957,0,0,3,0,2,1,0,...,3,1.0,0.0,1.0,0.67,0.0,0.0,0.0,0.0,buffer_overflow.
745,305,1735,2766,0,0,3,0,2,1,0,...,4,1.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,buffer_overflow.
4049,79,281,1301,0,0,2,0,1,1,0,...,10,1.0,0.0,1.0,0.3,0.0,0.0,0.0,0.1,loadmodule.
4113,25,269,2333,0,0,0,0,0,1,0,...,2,0.03,0.06,0.01,0.0,0.0,0.0,0.0,0.0,perl.
7601,0,0,0,0,0,0,0,0,0,0,...,6,1.0,0.0,0.2,0.33,1.0,0.83,0.0,0.0,neptune.


In [15]:
'''
Data preprocessing 
'''

MIN_MAX = 1
STANDAR = 2
ROBUST = 3

# Choose the preprocessing methods:
def prep_data(dataX, prep_type=1):

  change_dataX = dataX.copy()
  featuresToScale = change_dataX.columns

  if prep_type == MIN_MAX:
    min_max = MinMaxScaler()
    change_dataX.loc[:,featuresToScale] = min_max.fit_transform(change_dataX[featuresToScale])
  elif prep_type == STANDAR:
    sX = StandardScaler(copy=True)
    change_dataX.loc[:,featuresToScale] = sX.fit_transform(change_dataX[featuresToScale])
  else:
    robScaler = RobustScaler()
    change_dataX.loc[:,featuresToScale] = robScaler.fit_transform(change_dataX[featuresToScale])
  
  return change_dataX

In [16]:
# Define the score function for abnormal detection
def anomalyScores(originalDF, reducedDF):
  loss = np.sum((np.array(originalDF) - np.array(reducedDF))**2, axis=1) 
  return loss

In [17]:
''' 
Test PCA on 2000 normal data 
'''
# Get the last 10000 normal data
normal_data_2000 = df_normal.iloc[80000:90000].copy()
# Remove the label
normal_data_2000 = normal_data_2000.drop('outcome', axis=1)
# Standardization 
normal_data_2000 = prep_data(normal_data_2000, prep_type=STANDAR)
# Transform data using Federated PCA components
normal_data_pca = self_pca_transform_with_zero_mean(normal_data_2000, V_k)
normal_data_inverse = self_inverse_transform_with_zero_mean(normal_data_pca, V_k)
# Min max normalization before testing to keep balance between features
normal_data_2000 = prep_data(normal_data_2000, prep_type=MIN_MAX)
normal_data_inverse = prep_data(normal_data_inverse, prep_type=MIN_MAX)
# Get anomalous score
abnormal_score = anomalyScores(normal_data_2000, normal_data_inverse)
print(abnormal_score.shape)
np.sum(abnormal_score)

(10000,)


47095.061615747436

In [18]:
index = abnormal_score > 6
len(abnormal_score[index])

33

In [19]:
abnormal_score.max()

13.16078508836941

In [20]:
''' 
Test PCA on 2000 smurf attack data 
'''
smurf_data_2000 = df_abnormal[df_abnormal['outcome']=='smurf.'].iloc[:2000].copy()
smurf_data_2000 = smurf_data_2000.drop('outcome', axis=1)
smurf_data_2000 = prep_data(smurf_data_2000, prep_type=STANDAR)
smurf_data_pca = self_pca_transform_with_zero_mean(smurf_data_2000, V_k)
smurf_data_inverse = self_inverse_transform_with_zero_mean(smurf_data_pca, V_k)
smurf_data_2000 = prep_data(smurf_data_2000, prep_type=MIN_MAX)
smurf_data_inverse = prep_data(smurf_data_inverse, prep_type=MIN_MAX)
abnormal_score = anomalyScores(smurf_data_2000, smurf_data_inverse)
np.sum(abnormal_score)

31993.712641488277

In [21]:
rand_smurf = df_abnormal[df_abnormal['outcome']=='smurf.'].sample(random_state=93).copy()
rand_smurf

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
96634,0,1032,0,0,0,0,0,0,0,0,...,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,smurf.


In [22]:
def get_abnormaly_scores_on_all_attack(V_k, prep_type=MIN_MAX):
  for name in df_abnormal['outcome'].unique():
    dataX = df_abnormal[df_abnormal['outcome']==name]
    if dataX.shape[0] == 0:
      continue
    if dataX.shape[0] > 2000:
      dataX = dataX.iloc[:2000]
      # dataX = dataX
    dataX = dataX.drop('outcome', axis=1)
    dataX = prep_data(dataX, prep_type=STANDAR)
    data_transform = self_pca_transform_with_zero_mean(dataX, V_k)
    data_inverse = self_inverse_transform_with_zero_mean(data_transform, V_k)
    dataX = prep_data(dataX, prep_type=MIN_MAX)
    data_inverse = prep_data(data_inverse, prep_type=MIN_MAX)
    abnormal_score = anomalyScores(dataX, data_inverse)
    # print(f"abnormal score on {name} with {dataX.shape[0]} samples: {np.sum(abnormal_score)} with min val: {abnormal_score.min()}")
    index = abnormal_score < 6
    # print(len(abnormal_score[index]))

In [23]:
num_samples = get_abnormaly_scores_on_all_attack(V_k, prep_type=STANDAR)
num_samples

In [24]:
'''
Function test on each type of attack
'''
def test_on_random_attack_samples(df_abnormal, V_k, fraction, thres_hold=6):
  num_samples = 0
  total_misclassified_samples = 0
  abnormal_indices_test = []
  for name in df_abnormal['outcome'].unique():
    # Randomly get a portion of attack dataset to test
    dataX = df_abnormal[df_abnormal['outcome']==name].sample(frac=fraction, random_state=1).copy()
    if dataX.shape[0] == 0:
      continue
    # Gather random samples for testing on other algorithms
    dataX_indices = dataX.index.to_list()
    # Concatenate 2 lists of indices
    abnormal_indices_test += dataX_indices
    # Remove the label
    dataX = dataX.drop('outcome', axis=1)
    # Standardization for testing data
    dataX = prep_data(dataX, prep_type=STANDAR)
    # Transform using Federated PCA components
    data_transform = self_pca_transform_with_zero_mean(dataX, V_k)
    data_inverse = self_inverse_transform_with_zero_mean(data_transform, V_k)
    # Min max normalization before testing to keep balance between features
    dataX = prep_data(dataX, prep_type=MIN_MAX)
    data_inverse = prep_data(data_inverse, prep_type=MIN_MAX)
    # Get anomalous score on data
    abnormal_score = anomalyScores(dataX, data_inverse)
    # print(f"abnormal score on {name} with {dataX.shape[0]} samples: {np.sum(abnormal_score)} with min val: {abnormal_score.min()}")
    index = abnormal_score < thres_hold
    # print(len(abnormal_score[index]))
    misclassified_sample_on_attack = len(abnormal_score[index])
    total_misclassified_samples += misclassified_sample_on_attack
    num_samples += dataX.shape[0]
  '''The following code section below should only run for the first time to create test data set'''
  # # Store test and train data for other algorithms
  # abnormal_test_data = df_abnormal.loc[abnormal_indices_test].copy()
  # abnormal_test_data.to_csv('abnormal_detection_data/test/abnormal_test.csv')
  # print(abnormal_test_data.shape)
  # abnormal_train_data = df_abnormal.copy()
  # abnormal_train_data = abnormal_train_data.drop(index=abnormal_indices_test)
  # abnormal_train_data.to_csv('abnormal_detection_data/test/abnormal_train.csv')
  # print(abnormal_train_data.shape)
  return  num_samples, total_misclassified_samples

In [25]:
num_samples, misclassified_samples_on_attack = test_on_random_attack_samples(df_abnormal, V_k, fraction=0.02, thres_hold=6)
num_samples, misclassified_samples_on_attack

(7934, 454)

In [26]:
'''
Function test on all types of attacks
'''
def get_abnormaly_scores_on_all_abnormal(V_k, thres_hold = 6,prep_type=MIN_MAX):
  dataX = pd.read_csv('abnormal_detection_data/test/abnormal_test.csv', index_col=False)
  dataX = dataX.drop(['outcome', 'Unnamed: 0'], axis=1)
  # print(dataX.head(10))
  dataX = prep_data(dataX, prep_type=STANDAR)
  data_transform = self_pca_transform_with_zero_mean(dataX, V_k)
  data_inverse = self_inverse_transform_with_zero_mean(data_transform, V_k)
  dataX = prep_data(dataX, prep_type=MIN_MAX)
  data_inverse = prep_data(data_inverse, prep_type=MIN_MAX)
  abnormal_score = anomalyScores(dataX, data_inverse)
  # print(f"abnormal score on abnormal data with {dataX.shape[0]} samples: {np.sum(abnormal_score)} with min val: {abnormal_score.min()}")
  index = abnormal_score < thres_hold
  # print(len(abnormal_score[index]))
  num_samples = dataX.shape[0]
  total_misclassified_samples = len(abnormal_score[index])
  return num_samples, total_misclassified_samples

In [27]:
num_samples, misclassified_samples_on_attack = get_abnormaly_scores_on_all_abnormal(V_k, thres_hold=6)
num_samples, misclassified_samples_on_attack

(7934, 454)

In [28]:
test_abnormal = df_abnormal.head().copy()
random_abnormal = test_abnormal.sample(n=3)
indices = random_abnormal.index
print(indices)
the_rest = test_abnormal.drop(index=indices)
the_rest_indices = the_rest.index.to_list()
print(the_rest_indices)
df_abnormal.shape[0] - 7934

Int64Index([4049, 744, 745], dtype='int64')
[4113, 7601]


388809

In [29]:
'''
Test on another 10000 sample of normal data
'''
def test_on_normal_data_samples(df_normal, V_k, thres_hold=6):
  # Get the last 10000 normal data
  normal_data_10000 = df_normal.iloc[80000:90000].copy()
  # Remove the label
  normal_data_10000 = normal_data_10000.drop('outcome', axis=1)
  # Standardization 
  normal_data_10000 = prep_data(normal_data_10000, prep_type=STANDAR)
  # Transform data using Federated PCA components
  normal_data_pca = self_pca_transform_with_zero_mean(normal_data_10000, V_k)
  normal_data_inverse = self_inverse_transform_with_zero_mean(normal_data_pca, V_k)
  # Min max normalization before testing to keep balance between features
  normal_data_10000 = prep_data(normal_data_10000, prep_type=MIN_MAX)
  normal_data_inverse = prep_data(normal_data_inverse, prep_type=MIN_MAX)
  # Get anomalous score
  abnormal_score = anomalyScores(normal_data_10000, normal_data_inverse)
  # Get the misclassified samples
  index = abnormal_score > thres_hold
  total_normal_samples = normal_data_10000.shape[0]
  return total_normal_samples, len(abnormal_score[index])

In [30]:
total_normal_samples, misclassified_samples = test_on_normal_data_samples(df_normal, V_k)
total_normal_samples, misclassified_samples

(10000, 33)

In [31]:
def kdd_test(df_normal, df_abnormal, V_k, fraction, thres_hold):
  # Get total samples and misclassified samples on normal data
  normal_total_samples, normal_mis_samples = test_on_normal_data_samples(df_normal, V_k, thres_hold=thres_hold)
  FN = normal_mis_samples
  TN = normal_total_samples - normal_mis_samples
  # Get total samples and misclassified samples on abnormal data
  abnormal_total_samples, abnormal_mis_samples = test_on_random_attack_samples(df_abnormal, V_k, fraction = 0.02, thres_hold=thres_hold)
  FP = abnormal_mis_samples
  TP = abnormal_total_samples - abnormal_mis_samples
  # Accuracy on test set
  acc = (1 - (normal_mis_samples + abnormal_mis_samples)*2/(normal_total_samples + abnormal_total_samples))*100
  # print(f"Accuracy on {normal_total_samples} normal samples and {abnormal_total_samples} abnormal samples : {acc}")
  precision_score = TP/(FP + TP)
  recall_score = TP/(FN + TP)
  accuracy_score = (TP + TN)/ (TP + FN + TN + FP)
  f1_score = 2*precision_score*recall_score/(precision_score + recall_score)
  print(f"Precision: {precision_score * 100.0}")
  print(f"Recall: {recall_score * 100.0}")
  print(f"Accuracy score: {accuracy_score * 100.0}")
  print(f"F1 score: {f1_score * 100.0}")

In [32]:
kdd_test(df_normal, df_abnormal, V_k, fraction=0.02, thres_hold=6)

Precision: 94.27779178220318
Recall: 99.56076134699853
Accuracy score: 97.284487565518
F1 score: 96.84728426231631


## Test centralized PCA performance ##

In [33]:
# Define the PCA
from pandas.core.common import random_state
from sklearn.decomposition import PCA

def perform_pca(dataX):
  pca = PCA(0.99)

  X_train_PCA = pca.fit_transform(dataX)
  X_train_PCA = pd.DataFrame(data=X_train_PCA, index=dataX.index)

  X_train_PCA_inverse = pca.inverse_transform(X_train_PCA)
  X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse, index=dataX.index)

  anomalyScoresPCA = anomalyScores(dataX, X_train_PCA_inverse)
  return pca, anomalyScoresPCA

# PCA transform
def self_pca_transform(X_train, pca):
  return (X_train).dot(pca.components_.T)
  
# PCA inverse transform
def self_inverse_transform(X_pca, pca):
  return (X_pca.dot(pca.components_))

In [34]:
# Get data and labels
dataX = df_normal.drop(['outcome'], axis=1).copy()
dataY = df_normal['outcome'].copy()

In [35]:
'''
Train pca with normal data
'''
train_dataX = prep_data(dataX, prep_type=STANDAR)
pca, abnormalyScore = perform_pca(train_dataX[:80000])

In [36]:
''' 
Test PCA on 2000 normal data 
'''
normal_data_2000 = df_normal.iloc[80000:90000].copy()
normal_data_2000 = normal_data_2000.drop('outcome', axis=1)
normal_data_2000 = prep_data(normal_data_2000, prep_type=STANDAR)
normal_data_pca = self_pca_transform(normal_data_2000, pca)
normal_data_inverse = self_inverse_transform(normal_data_pca, pca)
normal_data_2000 = prep_data(normal_data_2000, prep_type=MIN_MAX)
normal_data_inverse = prep_data(normal_data_inverse, prep_type=MIN_MAX)
abnormal_score = anomalyScores(normal_data_2000, normal_data_inverse)
# np.sum(abnormal_score)

In [37]:
centralized_V_k = pca.components_.T

In [38]:
kdd_test(df_normal, df_abnormal, centralized_V_k, fraction=0.02, thres_hold=5.5)

Precision: 95.58860599949584
Recall: 100.0
Accuracy score: 98.04839968774395
F1 score: 97.74455471065859


## Test Standalone PCA performance ##

In [39]:
# Get data and labels
dataX = df_normal.drop(['outcome'], axis=1).copy()
dataY = df_normal['outcome'].copy()

In [40]:
'''
Train pca with normal data
'''
train_dataX = prep_data(dataX, prep_type=STANDAR)
pca, abnormalyScore = perform_pca(train_dataX[:4000])

In [41]:
''' 
Test PCA on 2000 normal data 
'''
normal_data_2000 = df_normal.iloc[80000:90000].copy()
normal_data_2000 = normal_data_2000.drop('outcome', axis=1)
normal_data_2000 = prep_data(normal_data_2000, prep_type=STANDAR)
normal_data_pca = self_pca_transform(normal_data_2000, pca)
normal_data_inverse = self_inverse_transform(normal_data_pca, pca)
normal_data_2000 = prep_data(normal_data_2000, prep_type=MIN_MAX)
normal_data_inverse = prep_data(normal_data_inverse, prep_type=MIN_MAX)
abnormal_score = anomalyScores(normal_data_2000, normal_data_inverse)
# np.sum(abnormal_score)

In [42]:
standalone_V_k = pca.components_.T

In [44]:
kdd_test(df_normal, df_abnormal, standalone_V_k, fraction=0.02, thres_hold=5)

Precision: 77.53970254600453
Recall: 99.98374776531773
Accuracy score: 90.05799040927846
F1 score: 87.34294029956698
