# In [65]:
"""
Created on Wed Jul  6 14:29:39 2022
@author: suhong

Revised on Thur Jul  7 16:20:39 2022
@revised : Junhyun

"""

import numpy as np
import pandas as pd
import joblib

def bootstrap_limit(stat, alpha=0.05, bootstrap=100):
    '''
    Estimate a control limit from in-control statistics by bootstrap resampling.

    In each of ``bootstrap`` rounds the statistics are resampled with
    replacement and the ``100 - alpha * 100``-th percentile of the resample
    is taken. The control limit is the mean of those percentiles.

    Parameters
    ----------
    stat : array-like
        Statistics computed on normal (in-control) data. Any shape is
        accepted; the values are flattened to one dimension.
    alpha : float, 0~1
        Significance level used to place the control limit.
    bootstrap : int
        Number of bootstrap rounds.

    Returns
    -------
    limit : float
        Control limit (CL).

    Raises
    ------
    ValueError
        If ``alpha`` is outside [0, 1] or ``stat`` contains no values.
    '''
    if not 0 <= alpha <= 1:
        raise ValueError('alpha must be between 0 and 1')

    # Flatten whatever was passed (list, (n, 1) array, Series, ...) to 1-D.
    values = np.asarray(stat).ravel()
    if values.size == 0:
        raise ValueError('stat must contain at least one value')

    percentile = 100 - alpha * 100
    # Each resample holds at least 100 draws, or as many as there are values.
    sample_size = max(100, values.size)

    # One independent resample (with replacement) per bootstrap round.
    round_limits = [
        np.percentile(
            np.random.choice(values, sample_size, replace=True), percentile
        )
        for _ in range(bootstrap)
    ]

    return np.mean(round_limits)

# import Hotellings_tsquare

class Hotellings_tsquare():

    """
    Hotelling's T-square anomaly detector.

    ``fit`` estimates the column means and covariance matrix of the training
    data and derives a control limit for the training scores. ``predict``
    scores new observations against those estimates.
    """

    def __init__(self):

        self.tr_mu = None   # column means of the training data (set by fit)
        self.tr_cov = None  # covariance matrix of the training data (set by fit)
        self.cl = None      # control limit of the training scores (set by fit)

    def _score(self, dat):
        """Return the T-square score of every row of ``dat`` as an (n, 1) array."""
        if self.tr_mu is None or self.tr_cov is None:
            raise RuntimeError('Hotellings_tsquare must be fitted before scoring')

        # The pseudo-inverse is the same for every row, so compute it once
        # per call instead of once per observation.
        cov_inv = np.linalg.pinv(np.asarray(self.tr_cov, dtype=float))
        centered = np.asarray(dat, dtype=float) - np.asarray(self.tr_mu, dtype=float)

        # Row-wise quadratic form (x - mu) @ cov_inv @ (x - mu).T
        scores = np.sum((centered @ cov_inv) * centered, axis=1)
        return scores.reshape(-1, 1)

    def fit(self, trdat, alpha=0.05):
        """
        Estimate mean, covariance and control limit from training data.

        Parameters
        ----------
        trdat : array-like or DataFrame, shape (n_samples, n_features)
            Train data
        alpha : float, 0~1
            Significance level passed to ``bootstrap_limit``.
            The default is 0.05.

        Returns
        -------
        dict with keys
            trScore : array, shape (n_samples, 1)
                Train score, i.e. the anomaly score. The larger the value,
                the further the row deviates from the normal pattern.
            CL : float
                Control limit of trScore

        Raises
        ------
        ValueError
            If fewer than two training rows are given (the covariance
            cannot be estimated).
        """
        if not isinstance(trdat, pd.DataFrame):
            trdat = pd.DataFrame(trdat)

        if trdat.shape[0] < 2:
            raise ValueError('trdat needs at least two rows to estimate the covariance')

        self.tr_mu = trdat.mean(axis=0)
        self.tr_cov = trdat.cov()
        trScore = self._score(trdat)

        self.cl = bootstrap_limit(trScore, alpha=alpha, bootstrap=100)

        return {'trScore': trScore, 'CL': self.cl}

    def predict(self, tsdat):
        """
        Score test data against the fitted mean and covariance.

        Parameters
        ----------
        tsdat : array-like or DataFrame, shape (n_samples, n_features)
            Test data, i.e. the observations to score. Columns are matched
            to the training columns by position.

        Returns
        -------
        dict with key
            tsScore : array, shape (n_samples, 1)
                Anomaly score of each test row

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if not isinstance(tsdat, pd.DataFrame):
            tsdat = pd.DataFrame(tsdat)

        return {'tsScore': self._score(tsdat)}

# In [71]:
def tsquare(trdat, tsdat, alpha=0.05, model_path='t2.pkl'):
    """
    Fit a Hotelling's T-square model, score both data sets and save the model.

    Parameters
    ----------
    trdat : array
        Train data, used to fit the model.
    tsdat : array
        Test data, scored with the fitted model.
    alpha : float, 0~1
        Significance level for the bootstrap control limit.
        The default is 0.05.
    model_path : str or None
        File the fitted model is written to with ``joblib.dump``.
        The default is 't2.pkl'. Pass None to skip saving.

    Returns
    -------
    dict with keys
        trScore : array
            Anomaly scores of the train data
        tsScore : array
            Anomaly scores of the test data
        CL : float
            Control Limit

    """
    model = Hotellings_tsquare()
    fit = model.fit(trdat, alpha=alpha)
    pred = model.predict(tsdat)

    # Persist the fitted model so it can be reloaded with joblib.load.
    if model_path is not None:
        joblib.dump(model, model_path)

    return {'trScore': fit['trScore'], 'tsScore': pred['tsScore'], 'CL': fit['CL']}
        

# In [72]:
# Load the sample data set; the CSV file is EUC-KR encoded.
df = pd.read_csv('E:\\연구실\\연구과제\\엑센솔루션\\test_data.csv', encoding='euc-kr')

# The first 600 rows train the model; rows 600-999 are scored as test data.
trdat, tsdat = df.values[:600], df.values[600:1000]

# Fit, score both splits and save the fitted model.
tsq = tsquare(trdat, tsdat, alpha=0.05)

# In [73]:
# Reload the fitted model from 't2.pkl', the file name tsquare() passes to
# joblib.dump.
# NOTE(review): joblib.load unpickles its input, so only load files that come
# from a trusted source.
t2 = joblib.load('t2.pkl') 
# Score the test rows again with the reloaded model.
t2.predict(tsdat)

{'tsScore': array([[2.45164896e+01],
        [1.06513132e+01],
        [6.55939599e+00],
        [6.46153251e+00],
        [5.98061603e+00],
        [1.00428378e+01],
        [1.72279354e+01],
        [6.95602639e+00],
        [2.10220284e+01],
        [5.27900867e+02],
        [1.00241881e+03],
        [1.98938567e+02],
        [1.67881365e+02],
        [1.01443180e+01],
        [5.75803467e+01],
        [8.11852784e+00],
        [8.24774917e+00],
        [8.15645656e+00],
        [5.91536482e+00],
        [6.39909739e+00],
        [1.17584801e+02],
        [9.23335323e+00],
        [6.48421594e+00],
        [5.39296017e+00],
        [6.64175384e+00],
        [9.71087954e+00],
        [8.54933230e+00],
        [2.51588239e+04],
        [1.53810362e+01],
        [1.22180533e+01],
        [8.36726173e+00],
        [6.26675056e+02],
        [7.61897724e+00],
        [4.48896737e+00],
        [1.80323920e+04],
        [1.41961093e+02],
        [5.54152778e+00],
        [2.17999207e+01],
 