In [1]:
import sklearn.metrics
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from collections import OrderedDict
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import  matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import numpy as np
import pandas as pd



In [2]:
def bootstrap_limit(stat, alpha=0.05, bootstrap=100):
    '''
        @Description
            Bootstrap sampling을 활용한 Control Limit 산출 기법

        @Parameter
            stat : 통계량 (정상상태의 데이터 입력)
            alpha : Control Limit을 정하기 위한 유의수준 (0~1)
            bootstrap : 샘플링 횟수
        @Return
            limit : 임계값 (CL : Control Limit)
    '''
    alpha = alpha*100
    alpha = 100 - alpha
    samsize = max(100, len(stat))
    
    stat = stat.reshape(len(stat)) # 2차원 array를 1차원 array로 변환
    
    # bootstrap 수 만큼 다음 작업을 반복 : samsize(최소 10000번)만큼 정상상태 데이터를 유의수준 만큼 복원 추출 후 평균 값 사용 
    limit = np.mean(list(map(lambda x:np.percentile(np.random.choice(stat,samsize,replace=True),alpha), range(0,bootstrap))))
    
    return limit


In [73]:
# 오토인코더 모듈 정의
class Autoencoder(nn.Module):
    def __init__(self,X):
        super(Autoencoder, self).__init__()
        data_size = X.size(1)
        self.tr_mu = None
        self.tr_cov = None
        self.cl = None
        
        
        self.encoder = nn.Sequential( 
            nn.Linear(data_size, 32), 
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),   
        )
        self.decoder = nn.Sequential(
            nn.Linear(4, 8), 
            nn.ReLU(),
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, data_size),       
        )

    def get_Score (self, trdat, alpha = 0.05) :
        """

        Parameters
        ----------
        trdat : array
            Train data
        alpha : int, 0~1
            Bootstrap Limit value. The default is 0.05.

        Returns
        -------
        trScore : array
            Train Score, 이상치 점수를 의미함. 클수록 정상패턴에서 벗어남을 의미
        CL : float
            trScore Control Limit

        """
        if isinstance(trdat,(np.ndarray)):
            trdat = pd.DataFrame(trdat)
            
        Score = np.zeros((trdat.shape[0], 1))
        self.tr_mu = trdat.mean(axis = 0)
        self.tr_cov = trdat.cov()
        for i in range(len(trdat)):
            Score[i] = (trdat.values[i] - self.tr_mu) @ np.linalg.pinv(self.tr_cov) @ (trdat.values[i] - self.tr_mu).transpose()
        
        self.cl = bootstrap_limit(Score, alpha=0.05, bootstrap=100)
       
        return {'Score' : Score}
    
    
    
    
    def fit (self, epochs, loader, criterion, optimizer):        
        
        for epoch in range(epochs):
            for step ,(x) in enumerate(loader):
                x = x[0]
                y = x
                encoded =  self.encoder(x)
                decoded =  self.decoder(encoded)
                loss = criterion(decoded , y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
    
    def CL_printor(self) :
        """
        
        Returns
        -------
        CL: float
            Control Limit,
            
        """
        
        return {'CL' : self.cl}
    
    
    def predict(self, x):
        encoded = self.encoder(x) 
        decoded = self.decoder(encoded)  
            
        return  decoded 
                

In [74]:
def AE(trdat, tsdat, epochs):
    '''
    
        @Input 
            X_train : type : array
            y_train : type : array
            X_test : type : array
            
            epochs : type : int
            
        @Output
            red_X_test : type : array
            
    '''
    # 데이터를 파이토치 텐서로 변경
    X_train = torch.Tensor(trdat)
    X_test= torch.Tensor(tsdat)
    
    # 학습을 위한 데이터 전처리 
    dataset = TensorDataset(X_train)
    loader= DataLoader(dataset, batch_size=1, shuffle = True)
    
    #CL을 선정하기위한 신경망 생성하기 
    model = Autoencoder(X_train)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.005)
    criterion = nn.MSELoss()
    model.fit(epochs, loader, criterion, optimizer)
    
    #잔차 구하기
    pred_X_train = model.predict(X_train)
    red_X_train = pred_X_train - X_train
    red_X_train = red_X_train.detach().numpy()
    
    #정상 데이터를 이용한 trScore와 CL 계산
    trScore = model.get_Score(red_X_train, alpha = 0.05)
    CL = model.CL_printor()
    
    #학습되 모델 저장
    torch.save(model, "AE.pt")
    
    # 학습을 위한 데이터 전처리 
    test_dataset = TensorDataset(X_test)
    test_loader= DataLoader(test_dataset, batch_size=1, shuffle = True)
    
    #모델 신경망 생성하기 
    test_model = Autoencoder(X_test)
    test_optimizer = torch.optim.Adam(test_model.parameters(), lr = 0.005)
    test_criterion = nn.MSELoss()
    model.fit( epochs, test_loader, test_criterion, test_optimizer)
    
    
    
    #잔차 구하기
    pred_X_test = model.predict(X_test)
    red_X_test = pred_X_test - X_test
    red_X_test = red_X_test.detach().numpy()
    
    #정상 데이터를 이용한 trScore와 CL 계산
    tsScore = test_model.get_Score(red_X_test, alpha = 0.05)
    
    
    return {'trScore':trScore['Score'], 'tsScore':tsScore['Score'], 'CL': CL['CL']}


In [75]:
# Autoencoder\ Model load
def AE_model_loader(pickleFile, tsdat) :
    """
    저장한 모델을 로드한 후, 로드한 모델과 데이터를 활용해 분석 결과 리턴

    Parameters
    ----------
    model : ?
        로드한 모델
    tsdat : array
        예측 데이터

    Returns
    -------
    모델 리턴과 동일

    """
    model = torch.load(pickleFile)
    CL = model.CL_printor()
    tsdat= torch.Tensor(tsdat)
    pred = model.predict(tsdat)
    red = pred - tsdat
    red = red.detach().numpy()
    Score = model.get_Score (red, alpha = 0.05)
        
    return {'Score' : Score['Score'], 'CL' : CL['CL']}

In [76]:
df = pd.read_csv('test_data.csv', encoding='euc-kr')
trdat = df.values[0:600,:]
tsdat = df.values[600:1000, :]


In [77]:
red_X_test = AE(trdat, tsdat, 10)

In [78]:
xxx = AE_model_loader("AE.pt",tsdat)

In [79]:
xxx

{'Score': array([[ 31.68841658],
        [ 18.44954985],
        [ 24.08155884],
        [ 21.14833166],
        [ 20.16774337],
        [ 16.97279612],
        [ 28.52523303],
        [ 15.91412421],
        [ 32.53723571],
        [256.6704979 ],
        [398.1889721 ],
        [204.88360741],
        [175.08548004],
        [ 22.19771495],
        [ 67.2780101 ],
        [ 14.78656493],
        [ 22.83291302],
        [ 21.73447019],
        [ 22.37762662],
        [ 16.65410539],
        [154.8948975 ],
        [ 21.36197181],
        [ 19.00018979],
        [ 24.13301429],
        [ 21.90089191],
        [ 21.59163554],
        [ 17.43270837],
        [398.23360143],
        [ 21.80551694],
        [ 23.9088382 ],
        [ 25.35463328],
        [150.34472326],
        [ 10.44939202],
        [ 10.81793601],
        [398.14604233],
        [122.95635918],
        [ 14.0850182 ],
        [ 40.8174974 ],
        [ 23.63652109],
        [ 41.30318864],
        [ 18.19053503],
       

In [45]:
red_X_test

{'trScore': array([[ 33.09383454],
        [ 22.57339531],
        [ 88.03852476],
        [135.9567892 ],
        [158.09247114],
        [ 13.88024579],
        [ 11.22479682],
        [ 15.02265344],
        [ 58.76302429],
        [ 28.36235483],
        [ 75.26830825],
        [ 20.51926972],
        [ 18.44010162],
        [ 51.69123014],
        [ 27.97952261],
        [ 41.09479704],
        [ 19.06506878],
        [ 12.53056467],
        [ 78.92442286],
        [ 18.40228351],
        [  9.93426365],
        [505.88591238],
        [ 20.90626497],
        [ 47.62610611],
        [ 14.78926709],
        [ 20.23173187],
        [ 21.21895416],
        [ 17.04344923],
        [304.09780489],
        [ 16.02257832],
        [ 20.22132546],
        [ 35.11635757],
        [  5.6145395 ],
        [ 25.25507897],
        [ 25.29509809],
        [ 16.98218388],
        [ 16.0306991 ],
        [ 57.33081754],
        [ 70.92552335],
        [ 13.0634804 ],
        [ 93.87560366],
     