# CP_APR Testing

We're going to play with a toy example and then try to produce our own example to test with.

In [20]:
from pyCP_APR import CP_APR

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import os
import os.path
import gzip
import shutil
import datetime
import networkx as nx
import pickle
import pyclustering
from scipy import stats
from scipy import sparse
import seaborn as sns
import bz2
import json
import random
random.seed(1134)

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor

from IPython.display import clear_output

In [4]:
# creates a list of train and test co-ordinates, along with train and test counts
# the co-ordinates correspond to a sparse matrix which is 0 in most places apart from the locations where we have co-ordinates which correspond to a value

train_coords = list()
train_count = list()

test_coords = list()
test_count = list()

for ii in range(1000):
    train_coords.append(list(np.random.randint(10, size=4)))
    train_count.append(random.randint(1, 20))
    
for ii in range(100):
    test_coords.append(list(np.random.randint(10, size=4)))
    test_count.append(random.randint(1, 20))

In [5]:
# we save a dictionary with our co-ordinates and values in

tensor = dict()
tensor['train_coords'] = train_coords
tensor['train_count'] = train_count
tensor['test_coords'] = test_coords
tensor['test_count'] = test_count

In [6]:
np.savez_compressed('TOY',**tensor)
data = np.load("TOY" + str(".npz"), allow_pickle=True)

In [7]:
# Training set
coords_train = data['train_coords']
nnz_train = data['train_count']

# Test set
coords_test = data['test_coords']
nnz_test = data['test_count']

In [8]:
# our train co-ordinates - 4D matrix co-ordinates
coords_train[:10]

array([[6, 3, 7, 4],
       [6, 9, 2, 6],
       [7, 4, 3, 7],
       [7, 2, 5, 4],
       [1, 7, 5, 1],
       [4, 0, 9, 5],
       [8, 0, 9, 2],
       [6, 3, 8, 2],
       [4, 2, 6, 4],
       [8, 6, 1, 3]])

In [9]:
# non zero entries of our 4D matrix
nnz_train[:10]

array([ 4,  1,  9,  8,  8,  5,  4, 18,  3, 19])

In [28]:
coords_test[:10]

array([[1, 4, 1, 1],
       [5, 9, 8, 3],
       [5, 6, 5, 7],
       [3, 0, 6, 2],
       [5, 1, 8, 0],
       [5, 5, 4, 4],
       [0, 9, 4, 1],
       [6, 2, 3, 2],
       [2, 3, 4, 0],
       [3, 7, 9, 6],
       [8, 4, 9, 3],
       [7, 8, 6, 6],
       [4, 3, 9, 0],
       [9, 9, 6, 0],
       [0, 4, 8, 5],
       [2, 2, 6, 5],
       [8, 5, 4, 9],
       [6, 0, 6, 2],
       [8, 5, 8, 4],
       [6, 3, 6, 0],
       [6, 7, 4, 1],
       [4, 5, 1, 5],
       [5, 0, 0, 9],
       [2, 9, 9, 2],
       [3, 1, 8, 9],
       [6, 6, 5, 2],
       [5, 7, 7, 8],
       [0, 0, 2, 2],
       [9, 7, 9, 0],
       [1, 1, 2, 5],
       [0, 1, 0, 4],
       [6, 7, 9, 0],
       [6, 4, 9, 0],
       [2, 6, 9, 0],
       [6, 4, 0, 0],
       [0, 8, 2, 4],
       [9, 8, 5, 8],
       [7, 7, 0, 8],
       [5, 8, 2, 2],
       [5, 8, 6, 2],
       [2, 1, 8, 2],
       [4, 8, 7, 1],
       [2, 1, 7, 5],
       [5, 8, 7, 3],
       [3, 8, 0, 1],
       [1, 5, 3, 9],
       [2, 2, 2, 5],
       [7, 6,

In [11]:
nnz_test[:10]

array([10, 10, 19, 19, 16,  5, 15, 18, 16, 12])

In [12]:
# here we create our model

cp_apr = CP_APR(n_iters=10, random_state=42, verbose=200, 
                method='numpy',
                return_type='numpy', 
               )

In [13]:
# this is fitting our model - we pass the non-zero co-ordinates and values
factors = cp_apr.fit(coords=coords_train, values=nnz_train, rank=[1,5])
factors

CP-APR (MU):
Iter=1, Inner Iter=8, KKT Violation=0.000000, obj=-9748.910692, nViolations=0
Exiting because all subproblems reached KKT tol.
 Final log-likelihood = -9748.910692
 Final least squares fit = 0.041186
 Final KKT violation = 0.000000
 Total inner iterations = 12
 Total execution time = 0.0030 seconds
Converting the latent factors to Numpy arrays.
CP-APR (MU):
Iter=1, Inner Iter=48, KKT Violation=0.361464, obj=-8970.856452, nViolations=0
 Final log-likelihood = -8013.265798
 Final least squares fit = 0.056443
 Final KKT violation = 0.125601
 Total inner iterations = 412
 Total execution time = 0.0464 seconds
Converting the latent factors to Numpy arrays.


[{'Factors': {'0': array([0.10385419, 0.10347446, 0.10005696, 0.09711411, 0.09407632,
          0.09379153, 0.11363205, 0.08800076, 0.11638504, 0.08961458]),
   '1': array([0.09056389, 0.10632238, 0.0995823 , 0.10091133, 0.11334726,
          0.09588001, 0.09720904, 0.11704955, 0.10015189, 0.07898234]),
   '2': array([0.10992975, 0.10613252, 0.09758876, 0.11866338, 0.09796848,
          0.09625973, 0.07898234, 0.09350674, 0.10176571, 0.09920258]),
   '3': array([0.10992975, 0.07442567, 0.11714448, 0.09103854, 0.11334726,
          0.09815834, 0.09711411, 0.10461363, 0.10727169, 0.08695652])},
  'Weights': array([10534.])},
 {'Factors': {'0': array([[7.71488774e-02, 2.54918156e-01, 6.88068733e-02, 3.29047079e-03,
           2.51335302e-11],
          [8.34881700e-02, 4.88943243e-02, 1.98652728e-01, 1.86950067e-01,
           1.61173697e-06],
          [9.09859045e-02, 2.54992936e-02, 1.16819069e-01, 2.60339839e-01,
           1.00834862e-01],
          [1.06771051e-01, 4.56575814e-02, 1

In [14]:
# here we make our predictions
y_score = cp_apr.predict_scores(coords=coords_test, values=nnz_test)
y_score

array([1.31886094e-07, 2.26864033e-06, 0.00000000e+00, 0.00000000e+00,
       5.64723823e-11, 1.05136494e-04, 6.40598685e-14, 6.68931577e-12,
       5.55111512e-16, 1.67432734e-12, 1.59183907e-08, 0.00000000e+00,
       5.19806420e-13, 7.02117253e-11, 2.76715317e-11, 4.28549482e-05,
       3.84667144e-05, 6.53639648e-06, 2.78443935e-13, 3.33066907e-16,
       1.80025854e-05, 2.00896078e-10, 2.37112144e-01, 5.12156983e-12,
       0.00000000e+00, 3.84396781e-09, 9.24726962e-12, 2.12946753e-02,
       2.02282635e-13, 9.37565072e-02, 2.36495301e-03, 2.31732034e-09,
       5.29565281e-12, 5.28403743e-09, 1.74232083e-01, 6.17284002e-14,
       6.40856340e-03, 6.66133815e-16, 3.97459843e-14, 7.65001936e-01,
       9.70373790e-04, 2.65343303e-14, 1.24084621e-01, 2.47718786e-02,
       0.00000000e+00, 1.05250429e-07, 3.90421029e-12, 0.00000000e+00,
       3.86737309e-11, 1.20131874e-01, 7.87037102e-13, 0.00000000e+00,
       4.12089599e-08, 1.38916878e-10, 1.48880908e-13, 0.00000000e+00,
      

### Our Example

Here we create sparse matrices of our data, representing the authentication types over each hour of each day.

In [17]:
try:
    print('Attempting to read entire data set.')
    authentication_data = pd.read_csv('../Data/Authentication data.gz', compression='gzip', index_col = 0)
    process_data = pd.read_csv('../Data/Process data.gz', compression='gzip', index_col = 0)
except:
    clear_output()
    print('Unable to read entire data set, reading from original files.')
    rootdir = 'C:/Users/corri/OneDrive/Documents/Uni/Postgraduate/Final Project/LANL/ATI Data/Summaries/wls'
    unzippeddir = 'C:/Users/corri/OneDrive/Documents/Uni/Postgraduate/Final Project/LANL/ATI Data/Summaries/wls/Unzipped'
    frames = []

    count = 0

    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if file[-3:] == '.gz':
                filedir = rootdir + '/' + file
                with gzip.open(filedir) as f:
                    df = pd.read_csv(filedir, header=None)
                    frames.append(df)
                if 'authentications' in str(file):
                    count = count + len(df)

    df = pd.concat(frames)

    authentication_data = df[:count]
    authentication_data.columns = ['UserName', 'SrcDevice','DstDevice', 'Authent Type', 'Failure', 'DailyCount']

    process_data = df[count:]
    process_data = process_data[[0,1,2,3,4]]
    process_data.columns = ['UserName', 'Device', 'ProcessName', 'ParentProcessName', 'DailyCount']

    authentication_data.to_csv('../Data/Authentication data.gz', header=True, compression='gzip')
    process_data.to_csv('../Data/Process data.gz', header=True, compression='gzip')

Attempting to read entire data set.


  mask |= (ar1 == a)


In [18]:
index_list = process_data.index.tolist()
proc_start_days = [i for i, e in enumerate(index_list) if e == 0]
proc_start_days.append(len(process_data))

auth_index_list = authentication_data.index.tolist()
auth_start_days = [i for i, e in enumerate(auth_index_list) if e == 0]
auth_start_days.append(len(authentication_data))

In [19]:
# splits a dataframe into n chunks
def split_dataframe(df,n): 
    chunks = list()
    chunk_size = int(np.round(df.shape[0]/n))
    num_chunks = n
    for i in range(num_chunks):
        if i != num_chunks-1:
            chunks.append(df[i*chunk_size:(i+1)*chunk_size])
        else:
            chunks.append(df[i*chunk_size:])
    return chunks

In [21]:
def auth_type_un_sparse(user,n,e):
    auth_type_df = pd.DataFrame(index = list(authentication_data['Authent Type'].unique()))
    n = n
    auth_type_dict = {}

    for i in range(len(auth_start_days)-1):
        chunks = split_dataframe(authentication_data[auth_start_days[i]:auth_start_days[i+1]],n)
        for j in range(n):
                data = chunks[j]
                auth_type_data = data[data['UserName'] == user].groupby('Authent Type').size()
                auth_type_dict[i*n + j] = auth_type_df.index.to_series().map(auth_type_data.to_dict())
    
    auth_type_df = pd.DataFrame(data=auth_type_dict,index = list(authentication_data['Authent Type'].unique()))
    auth_type_df = auth_type_df.transpose()
    auth_type_df = auth_type_df.fillna(0)
    
    s = sparse.coo_matrix(auth_type_df)
    
    co = []
    vals = s.data
    for i in range(len(s.row)):
        co.append([s.row[i],s.col[i],e])

    
    return vals, co

In [22]:
usernames = list(authentication_data['UserName'].unique())

In [58]:
len(usernames)

28815

In [24]:
s_time = datetime.datetime.now()

vals, co = auth_type_un_sparse(usernames[0],24,1)

e_time = datetime.datetime.now()

print(e_time-s_time)

0:00:05.173146


In [53]:
s_time = datetime.datetime.now()

train_coords, train_vals = [], []
test_coords, test_vals = [], []

for e, un in enumerate(usernames):
    if e <= 199:
        vals, co = auth_type_un_sparse(un,24,e)
        train_coords.append(co)
        train_vals.append(vals)
    elif e <= 219:
        vals, co = auth_type_un_sparse(un,24,e-200)
        test_coords.append(co)
        test_vals.append(vals)

        
train_coords = np.array([item for sublist in train_coords for item in sublist])
test_coords = np.array([item for sublist in test_coords for item in sublist])

train_vals = np.array([item for sublist in train_vals for item in sublist])
test_vals = np.array([item for sublist in test_vals for item in sublist])

e_time = datetime.datetime.now()

print(e_time-s_time)

0:14:52.999143


In [59]:
len(train_coords)

198375

In [54]:
cp_apr_t = CP_APR(n_iters=10, random_state=42, verbose=200, method='numpy', return_type='numpy')

In [55]:
factors_t = cp_apr.fit(coords=train_coords, values=train_vals)
factors_t

CP-APR (MU):
Iter=1, Inner Iter=138, KKT Violation=0.476140, obj=2055564.083200, nViolations=0
 Final log-likelihood = 2147125.718336
 Final least squares fit = 0.898651
 Final KKT violation = 0.210652
 Total inner iterations = 1221
 Total execution time = 2.6504 seconds
Converting the latent factors to Numpy arrays.


{'Factors': {'0': array([[4.92893590e-04, 1.54202644e-03],
         [4.42246858e-04, 5.93090362e-04],
         [4.76830540e-04, 6.07807661e-04],
         ...,
         [3.10619357e-05, 2.31081633e-04],
         [2.75302492e-05, 1.69470410e-04],
         [3.38502079e-05, 2.42015511e-04]]),
  '1': array([[7.27497572e-036, 5.37763732e-001],
         [9.99761337e-001, 1.76637256e-001],
         [2.33596426e-005, 1.26345355e-001],
         [4.98627215e-108, 3.85736137e-002],
         [1.97116483e-108, 4.96965530e-002],
         [1.85695518e-110, 1.38283763e-002],
         [2.15303717e-004, 2.42407250e-002],
         [2.03712544e-109, 1.41242835e-002],
         [8.20167096e-075, 1.08376002e-002],
         [5.43691904e-075, 2.42538201e-003],
         [2.21770512e-110, 1.42669530e-003],
         [5.30500102e-097, 4.10042798e-003]]),
  '2': array([[1.56445343e-004, 5.13086990e-003],
         [6.96462679e-005, 2.57051905e-003],
         [2.45194908e-004, 7.97120022e-003],
         [1.39873911e-0

In [56]:
p_values = cp_apr.predict_scores(coords=test_coords, values=test_vals)
p_values

array([0.27490744, 0.2748005 , 0.14184473, ..., 0.03414956, 0.06444011,
       0.05569363])

In [61]:
len(np.where(p_values < 0.05)[0])

5418

In [62]:
len(test_vals)

19644

### Returning the original data point

In [68]:
def auth_type_un_df(user,n):
    auth_type_df = pd.DataFrame(index = list(authentication_data['Authent Type'].unique()))
    n = n
    auth_type_dict = {}
    
    for i in range(len(auth_start_days)-1):
        chunks = split_dataframe(authentication_data[auth_start_days[i]:auth_start_days[i+1]],n)
        for j in range(n):
                data = chunks[j]
                auth_type_data = data[data['UserName'] == user].groupby('Authent Type').size()
                auth_type_dict[i*n + j] = auth_type_df.index.to_series().map(auth_type_data.to_dict())
    
    auth_type_df = pd.DataFrame(data=auth_type_dict,index = list(authentication_data['Authent Type'].unique()))
    auth_type_df = auth_type_df.transpose()
    auth_type_df = auth_type_df.fillna(0)
    
    return auth_type_df

In [93]:
np.where(p_values < 0.05)[0][1982]

8223

In [113]:
a_t = list(authentication_data['Authent Type'].unique())
AT_dict = { i : a_t[i] for i in range(0, len(a_t) ) }

In [116]:
AT_dict[0]

'TGS'

In [124]:
# needs chunks function, usernames list, authentication data, authentication_start_days list, test-coords, authentication_dictionary

def orig_finder(entry_val,n):
    
    # gets the co-ordinates of the entry where we have the erro
    orig_co = test_coords[entry_val]
    
    # gets the authentication type
    authent = AT_dict[orig_co[1]]
    
    # gets the username of the individual who the anomaly occured with
    username = usernames[200+orig_co[2]]
    
    # gets the day the anomaly occured (n is the number of hours we split the data frame into)
    day = int(orig_co[0]/n)
    
    # gets the hour the anomaly occured in
    hour = orig_co[0] - n * day
    
    # gets the n hour chunks for that day
    chunks = split_dataframe(authentication_data[auth_start_days[day]:auth_start_days[day+1]],n)
    
    # gets the hour
    data = chunks[hour]
    
    # finds the anomaly
    anom = data[(data['UserName'] == username) & (data['Authent Type'] == authent)]
    
    return anom

In [172]:
frames = []
for i in range(2):
    df = orig_finder(8223+i,24)
    frames.append(df)
    
pd.concat(frames)

Unnamed: 0,UserName,SrcDevice,DstDevice,Authent Type,Failure,DailyCount
154489,User148020,Comp691505,,ScreensaverDismissed,0,6.0
155821,User148020,Comp691505,Comp370444,TGS,0,8.0
