# Anomaly Detection - Distance based Methods 

## 1. k-Nearest Neighbor-based Anomaly Detection 

1. 다양한 dist function에 따른 거리 값을 산출한 n x n Matrix를 만든다. 

2. 각 dist에 의거한 K-Nearest Neighbor을 구한다. 

3. k-Nearest의 반지름이 클수록 Abnormal data이다. 

**구현해야하는 것**
- $d_{max}^k$ 
- $d_{avg}^k$
- $d_{mean}^k$
- $d_{c-hull}^k$ 
- $d_{hybrid}^k$ 

**필요한 것**
- dist_matrix
- k_matrix
- w_matrix
- X 
- k 
- dist_type 

##### 기존 알고리즘 구현에서 필요한 구조를 따와서 구현


In [1]:
import numpy as np
import pandas as pd
import random as rand

from sklearn.datasets import load_iris
X = load_iris()['data']

import matplotlib.pyplot as plt
import scipy as sc
from scipy.stats import norm
from sys import maxsize

In [75]:
class K_Nearest(): 
    def __init__(self,X, k, dist_type) : 
        self.X = np.array(X)
        self.n = np.shape(X)[0]
        self.m = np.shape(X)[1] 
        
        self.k = k
        self.dist_type = dist_type
        self.dist = self.dist_matrix()
        self.k_dist, self.k_value, self.nkp_list = self.cal_k_dist()
        self.w = self.cal_w()
        self.dist_matrix = self.cal_dist() 
    
    
    def cal_w(self) : 
        C = self.dist **2 
        inv_C = np.reciprocal(C)
        for i in range(self.n) : 
            inv_C[i,i] = 0 

        w = np.zeros(shape = (self.n, self.n))
        for i in range(self.n) : 
            for j in range(self.n) : 
                up, down = 0,0
                for index in self.nkp_list[i] : 
                    if j in self.nkp_list[i] : up += inv_C[j, index] 
                    down += np.sum(inv_C[index, self.nkp_list[i]])
                w[i,j] = up / down 
        return w
    
    def dist_matrix(self) :
        dist = []
        for i in range(self.n) : 
            vector = [np.linalg.norm(self.X[i] - self.X[j]) for j in range(self.n)]
            dist.append(vector)
        return np.array(dist)

    def cal_k_dist(self) : 
        k_dist_lst = [] 
        k_value = [] 
        nkp_list = []
        for i in range(self.n) : 
            index = np.argsort(self.dist[i])
            k_dist = self.dist[i][index][self.k]
            vector = [self.dist[i,j] if self.dist[i,j] <= k_dist else 0 for j in range(self.n)]
            nkp_vector = [] 
            for j in range(self.n) : 
                if self.dist[i,j] <= k_dist : nkp_vector.append(j)
            
            k_dist_lst.append(vector)
            k_value.append(k_dist)
            nkp_list.append(nkp_vector)
        return np.array(k_dist_lst), np.array(k_value), np.array(nkp_list)
    
    def cal_dist(self) : 
        if self.dist_type == "max" : 
            d_value = [np.max(self.k_dist[i]) for i in range(self.n)]
            
        elif self.dist_type == "avg" : 
            d_value = [np.sum(self.k_dist[i]) / len(self.nkp_list[i]) for i in range(self.n)]
        elif self.dist_type =="mean" : 
            d_value = [np.linalg.norm(self.X[i] - np.mean(self.X[self.nkp_list[i]], axis=0)) for i in range(self.n)]
            
        elif self.dist_type == "hybrid" : 
            avg_value = [np.sum(self.k_dist[i]) / len(self.nkp_list[i]) for i in range(self.n)]
            hull_value = [np.linalg.norm(self.X[i] - np.sum(self.X[self.nkp_list[i]] * self.w[i][self.nkp_list[i]].reshape(-1,1), axis=0)) for i in range(self.n)]
            d_value = np.array(avg_value) * np.array(2/(1+np.exp(-np.array(hull_value))))
        
        return d_value

    def check_abnormal(self, x): 
        index = np.where(self.X == x)[0][0]
        return print("k-distance is ", self.dist_matrix[index])
            
    
        
    
            
            

In [77]:
test = K_Nearest(X,3, "hybrid")
test.dist_matrix

  return np.array(k_dist_lst), np.array(k_value), np.array(nkp_list)
  inv_C = np.reciprocal(C)
  w[i,j] = up / down


array([0.09755638, 0.10895206, 0.20801877, 0.1407447 , 0.11831695,
       0.29805702, 0.21343315, 0.10685024, 0.20198277, 0.11577844,
       0.17692329, 0.19197878, 0.13482632, 0.25323699, 0.41160508,
       0.50671475, 0.31440539, 0.10723416, 0.35200615, 0.13566085,
       0.26508603, 0.16611552, 0.43669985, 0.22605467, 0.30554465,
       0.16058929, 0.16525021, 0.11065594, 0.11065594, 0.13709038,
       0.11860496, 0.24466515, 0.30100621, 0.28675063, 0.09755638,
       0.22564683, 0.25822064, 0.17518621, 0.14824344, 0.10776921,
       0.12893012, 0.68855345, 0.19551468, 0.21512154, 0.32399182,
       0.14337482, 0.16747121, 0.1283803 , 0.14342294, 0.14035053,
       0.28249258, 0.25522214, 0.22706042, 0.22334392, 0.25395191,
       0.24681469, 0.28786933, 0.25898637, 0.21973055, 0.41977401,
       0.43740464, 0.27719001, 0.47267166, 0.15829436, 0.3861381 ,
       0.2047033 , 0.2340468 , 0.23739286, 0.35585884, 0.1812976 ,
       0.24391076, 0.29612684, 0.31813249, 0.25438436, 0.21652

## Clustering-based Approach 

#### 강의 속 수도코드 
1. Select K points as the initial centroid 

2. repeat
- Form K clusters by assigning all points to the closest centroid.
- Recompute the centroid of each cluster 
- until the centroids don't change 

**구현해야하는 것**
- 최초 k point 선정
- 각자 데이터가 가장 가까운 k point에 대해 속하도록 설정 
- 각 point 별 cluster 들의 Centroid 계산 

**필요한 값**
- X 
- k 

**구현해야하는 함수**
- def __init__(self,X,k) : 

- def ini_k_point(self) : 

- def cluster(self) : 

- def find_centroid(self) : 



In [105]:
from collections import defaultdict

class Cluster() :  
    def __init__(self, X,k) : 
        self.X = X
        self.n = np.shape(X)[0]
        self.m = np.shape(X)[1] 
        
        self.k = k 
        self.k_point = self.ini_k_point()
        self.cluster = self.form_cluster() 
        
    def ini_k_point(self) : 
        #랜덤하게 뽑은 점들이 최대한 퍼져있는 것이 좋다고 판단. 
        x_min = np.min(self.X, axis=0)
        x_max = np.max(self.X, axis=0)
        
        k_point = np.zeros((self.k, self.m))
        for i in range(self.k) : 
            k_point[i] = x_min + i * (x_max-x_min) / self.k
        
        return k_point
    
    def form_cluster(self) :
        cluster = defaultdict(list)
        for i in range(self.n) : 
            vector = np.zeros(self.k)
            for j in range(self.k) :
                vector[j] = np.linalg.norm(self.X[i] - self.k_point[j])
            cluster_num = vector.argmin()
            cluster[cluster_num].append(i) 
        
        return cluster
    
    def find_centroid(self) : 
        centroid = [] 
        for i in range(self.k) : 
            vector = np.mean(self.X[self.cluster[i]], axis=0)
            centroid.append(vector)
        return centroid
    
    def define_centroid(self, epsilon = 1e-20) :
        pre = self.k_point
        self.k_point = self.find_centroid() 
        while np.linalg.norm(np.array(pre) - np.array(self.k_point)) > epsilon : 
            pre = self.k_point
            self.cluster = self.form_cluster()
            self.k_point = self.find_centroid() 
        return self.k_point 
    
    def check_abnormal(self,x_new) : 
        dist = []
        for i in range(self.k) : 
            dist.append(np.linalg.norm(x_new - self.k_point[i])) 
        cluster_num = np.argmin(dist)
        
        abs_dist = dist[cluster_num] 
        re_dist = np.max([np.linalg.norm(self.k_point[cluster_num] - self.X[self.cluster[cluster_num]]) for i in range(self.n)])
        return print("KMC is ", abs_dist/re_dist)
        

In [106]:
test = Cluster(X,3) 
test.check_abnormal([1,2,3,4])

KMC is  0.7128733534306096


## PCA - based Anomaly detection 

#####  기존 PCA 코드 사용. Error 함수 추가 정의 


In [10]:
# 최종본 

class PCA_Detection() : 
    def __init__(self, X, num_eigen) : 
        self.X = X
        self.n = np.shape(X)[0] 
        self.m = np.shape(X)[1]
        
        self.num_eigen = num_eigen
    
    def normarization(self) : 
        mean_X = np.mean(self.X, axis=0)
        return np.array(self.X)-mean_X
    
    def cov(self,metrix): 
        return np.dot(np.array(metrix).T, np.array(metrix))
        
    def eigen(self, metrix):
        eigenvalue, eigenvector = np.linalg.eig(metrix)
        eigenvector_lst = []
        index = np.argsort(eigenvalue)[::-1]
        for i, num in enumerate(index) : 
            eigenvector_lst.append(eigenvector[i, :])
        return np.take(eigenvalue, index), np.array(eigenvector_lst)
    
    def select_eigen(self, num) : 
        if num > self.m : 
            return print("Please lower num under", self.m)
            
        norm_metrix = self.normarization()
        cov_metrix = self.cov(norm_metrix)
        eigenvalue, eigenvector = self.eigen(cov_metrix)
        return eigenvector[:num, :], sum(eigenvalue[:num])/sum(eigenvalue) 
    
    def error(self, x) : 
        w,var = self.select_eigen(self.num_eigen)
        
        error = np.linalg.norm(x)**2 - np.sum([np.dot(x, w[i])**2 for i in range(self.num_eigen)])   
        return error
    


In [11]:
test = PCA_Detection(X,3) 
test.error([1,2,3,4])

26.639794307210636