# 多维异常检测算法仿真（算法实践）

## 1、数据采集

> 读取 CPU、磁盘读、磁盘写、网络出口、网络入口、内存等监控指标，数据来源：广西科大数据（2017.1-2017.2）

In [24]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc('figure', figsize=(15, 6))
from dateutil.parser import parse
from sklearn.metrics import r2_score

In [25]:
def _read_metric_workbook(path):
    """Load one exported metric workbook.

    The u'时间' column is run through ``dateutil.parser.parse`` while reading.
    """
    return pd.read_excel(path, converters={u'时间': parse})


# One workbook per monitored metric.
vCpuUsage = _read_metric_workbook('../ECUST data/Guangxi university data 20170228/CPU_20170228171221.xlsx')
vDiskRead = _read_metric_workbook('../ECUST data/Guangxi university data 20170228/DiskRead_20170228171404.xlsx')
vDiskWrite = _read_metric_workbook('../ECUST data/Guangxi university data 20170228/DiskWrite_20170228171432.xlsx')
vNwEgress = _read_metric_workbook('../ECUST data/Guangxi university data 20170228/NwEgress_20170228171526.xlsx')
vNwIngress = _read_metric_workbook('../ECUST data/Guangxi university data 20170228/NwIngress_20170228171623.xlsx')
vMemUsage = _read_metric_workbook('../ECUST data/Guangxi university data 20170228/Memory_20170228171333.xlsx')

## 2、数据预处理

In [26]:
import time
# Wall-clock reference for the run-time report printed at the end.
start = time.time()

### 2.1 Rename the columns and set the index
# Map the Chinese column headers of the exported workbooks to short ASCII names.
_COLUMN_ALIASES = {
    u'资源': 'vres',
    u'类型': 'vtype',
    u'时间': 'vtime',
    u'最大值': 'vmax',
    u'最小值': 'vmin',
    u'平均值': 'vavg',
    u'单位': 'vunit',
}
for metric_frame in (vCpuUsage, vDiskRead, vDiskWrite, vNwEgress, vNwIngress, vMemUsage):
    metric_frame.rename(columns=_COLUMN_ALIASES, inplace=True)
    # Only index by time when the column is actually present.
    if 'vtime' in metric_frame.columns:
        metric_frame.set_index('vtime', inplace=True)

### 2.2 Assemble the multivariate matrix X to be analysed
# One column per metric: the 'vavg' series of each frame, re-indexed to
# minute-frequency periods and placed side by side.
X = pd.concat(
    [
        metric_frame.to_period('Min')['vavg']
        for metric_frame in (vCpuUsage, vDiskRead, vDiskWrite,
                             vNwEgress, vNwIngress, vMemUsage)
    ],
    axis=1,
    keys=['vCpuUsage', 'vDiskRead', 'vDiskWrite', 'vNwEgress', 'vNwIngress', 'vMemUsage'],
)

### 2.3 Interpolate missing data
# Seed every NaN in the first row with 0 so the interpolation has a starting
# value, then fill the remaining NaNs with time-weighted interpolation.
# `.iloc` is used for the positional first row because `DataFrame.ix` is no
# longer available in current pandas.
X.iloc[0] = X.iloc[0].fillna(0)
X.interpolate(method='time', inplace=True)

### 2.4 Fill zero CPU readings
# A CPU reading of exactly 0 is replaced by the reading just before it, so a
# run of zeros takes the last non-zero value. The first sample has no
# predecessor and is left untouched. Done as one vectorized column assignment
# instead of element-wise chained assignment, which pandas does not guarantee
# to write through to X.
# NOTE(review): assumes the column holds no NaN at this point (the
# interpolation step above should have filled them) -- confirm.
_cpu_usage = X['vCpuUsage']
_zero_reading = _cpu_usage.eq(0)
_zero_reading.iloc[:1] = False  # never replace the very first sample
X['vCpuUsage'] = _cpu_usage.mask(_zero_reading).ffill()

# Keep a reference to the unscaled values. This is the same object as X,
# not a copy.
X_Original = X

### 2.5、无量纲化
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column, keeping the original index and column labels.
X = DataFrame(
    data=MinMaxScaler().fit_transform(X_Original),
    index=X_Original.index,
    columns=X_Original.columns,
)

# After scaling, centre each column at zero. Some algorithms handle the mean
# themselves (e.g. PCA's transform); centring up front keeps that from
# confusing the interpretation of the results.
X_mean, X_std = X.mean(), X.std()
X = X.sub(X_mean, axis='columns')

## 3、PCA降维分析

In [27]:
from sklearn.decomposition import PCA
import numpy.linalg as nlg

### 3.1 PCA data exploration
# Fit with all components kept.
X_pca = PCA()
X_pca.fit(X)

### 3.2 Choose the number of principal components
# Refit with 0.85 as the minimum share of variance the kept components
# must explain together.
X_pca = PCA(n_components=0.85)
X_pca.fit(X)

## 4、ICA独立元分析方法的异常检测应用

In [28]:
### 4.1、ICA
from sklearn.decomposition import FastICA

# Fit FastICA with as many components as PCA retained, then project X onto
# the independent components.
X_ica = FastICA(n_components=X_pca.n_components_)
X_ica.fit(X)
X_S_ = X_ica.transform(X)

# Expose the mixing matrix and mean under uniform names so the later
# computations look the same for every algorithm.
X_ica_mixing_ = X_ica.mixing_
X_ica_mean_ = X_ica.mean_

# Map the sources back to the original feature space.
X_ica_recover = DataFrame(
    X_S_.dot(X_ica_mixing_.T) + X_ica_mean_,
    index=X.index,
    columns=X.columns,
)

### 4.2 Build the T2 and SPE statistics
# T2 statistic: per-sample sum of squared source values.
X_ica_T2 = Series((X_S_ ** 2).sum(axis=1), index=X.index)

# SPE statistic: per-sample sum of squared reconstruction residuals.
X_ica_SPE = Series((X - X_ica_recover).pow(2).sum(axis=1), index=X.index)

### 4.3、采用KDE方法，利用置信度确定阈值
from scipy import stats

def my_kde_bandwidth(obj, fac=1./2):
    """Return a KDE bandwidth factor: Scott's rule scaled by ``fac``.

    ``obj`` must expose ``n`` and ``d`` attributes; the result is
    ``n ** (-1 / (d + 4)) * fac``.
    """
    scott_exponent = -1. / (obj.d + 4)
    scott_factor = np.power(obj.n, scott_exponent)
    return scott_factor * fac

# Gaussian kernel density estimates of both statistics, sharing the custom
# bandwidth rule.
X_ica_T2_scipy_kde, X_ica_SPE_scipy_kde = (
    stats.gaussian_kde(statistic, bw_method=my_kde_bandwidth)
    for statistic in (X_ica_T2, X_ica_SPE)
)

def get_threshold_of_scipy_kde(kde,start,step=1,confidence=0.997):
    """Return the first grid point whose cumulative KDE mass reaches ``confidence``.

    The probability mass below ``start`` is taken as the initial value. The
    grid point then advances in increments of ``step``, adding the mass of
    each traversed interval, until the running total is at least
    ``confidence``.

    Parameters:
        kde: object with an ``integrate_box_1d(low, high)`` method, such as a
            one-dimensional ``scipy.stats.gaussian_kde``.
        start: value at which the search begins.
        step: positive width of each integration interval.
        confidence: target cumulative probability; must be below 1.

    Returns:
        ``start + k * step`` for the smallest ``k >= 0`` at which the
        accumulated mass is at least ``confidence``.

    Raises:
        ValueError: if ``step`` is not positive or ``confidence`` is not
            below 1. Either would keep the search from terminating.
    """
    # `not (x > 0)` rather than `x <= 0` so that NaN is rejected as well.
    if not step > 0:
        raise ValueError("step must be positive, got %r" % (step,))
    if not confidence < 1:
        raise ValueError("confidence must be below 1, got %r" % (confidence,))

    i = start
    cumsum = kde.integrate_box_1d(-np.inf, start)
    while cumsum < confidence:
        cumsum = cumsum + kde.integrate_box_1d(i, i + step)
        i = i + step

    return i

# Step used when accumulating probability from the density: the distance from
# the median to the maximum is covered in 100 steps.
# T2 statistic
X_ica_T2_pdf_step = (X_ica_T2.max() - X_ica_T2.median()) / 100
X_ica_T2_threshold = get_threshold_of_scipy_kde(
    X_ica_T2_scipy_kde,
    X_ica_T2.min(),
    step=X_ica_T2_pdf_step,
    confidence=0.997,
)

# SPE statistic
X_ica_SPE_pdf_step = (X_ica_SPE.max() - X_ica_SPE.median()) / 100
X_ica_SPE_threshold = get_threshold_of_scipy_kde(
    X_ica_SPE_scipy_kde,
    X_ica_SPE.min(),
    step=X_ica_SPE_pdf_step,
    confidence=0.997,
)

### 4.4 Detected anomaly periods

def _group_anomaly_periods(anomaly_index, max_gap_minutes=10):
    """Merge anomalous time stamps into runs and return their boundaries.

    Consecutive entries stay in the same run unless they are more than
    ``max_gap_minutes`` apart; a larger gap starts a new run.

    Parameters:
        anomaly_index: chronologically ordered pandas ``PeriodIndex`` of the
            anomalous samples.
        max_gap_minutes: largest gap, in whole minutes, still bridged inside
            one run.

    Returns:
        ``(starts, ends)``: two equally long lists holding the first and the
        last entry of every run. Both are empty when ``anomaly_index`` is
        empty.
    """
    if len(anomaly_index) == 0:
        return [], []
    # Compare real time stamps: subtracting Period objects does not yield a
    # plain integer on every pandas version.
    stamps = anomaly_index.to_timestamp().values
    gap_exceeded = np.diff(stamps) > np.timedelta64(max_gap_minutes, 'm')
    # The first entry always opens a run and the last entry always closes one.
    opens_run = np.concatenate(([True], gap_exceeded))
    closes_run = np.concatenate((gap_exceeded, [True]))
    return anomaly_index[opens_run].tolist(), anomaly_index[closes_run].tolist()


#### 4.4.1 T2 detection
X_ica_T2_anomaly = X_ica_T2[X_ica_T2 > X_ica_T2_threshold].index
# Aggregate anomalies that lie within 10 minutes of each other.
X_ica_T2_anomaly_start, X_ica_T2_anomaly_end = _group_anomaly_periods(X_ica_T2_anomaly)

print('anomal periods detected by T2 metric are: ')
for each in zip(X_ica_T2_anomaly_start, X_ica_T2_anomaly_end):
    print(each)

#### 4.4.2 SPE detection
X_ica_SPE_anomaly = X_ica_SPE[X_ica_SPE > X_ica_SPE_threshold].index
X_ica_SPE_anomaly_start, X_ica_SPE_anomaly_end = _group_anomaly_periods(X_ica_SPE_anomaly)

print('anomal periods detected by SPE metric are: ')
for each in zip(X_ica_SPE_anomaly_start, X_ica_SPE_anomaly_end):
    print(each)

# Elapsed wall-clock time since `start` was recorded.
print('算法用时：',time.time()-start,'s')

anomal periods detected by T2 metric are: 
(Period('2017-01-12 15:44', 'T'), Period('2017-01-12 15:44', 'T'))
(Period('2017-01-12 16:05', 'T'), Period('2017-01-12 16:05', 'T'))
(Period('2017-01-14 16:13', 'T'), Period('2017-01-14 16:53', 'T'))
(Period('2017-02-15 15:45', 'T'), Period('2017-02-15 17:48', 'T'))
(Period('2017-02-20 10:31', 'T'), Period('2017-02-20 10:31', 'T'))
(Period('2017-02-20 11:01', 'T'), Period('2017-02-20 11:16', 'T'))
(Period('2017-02-20 11:30', 'T'), Period('2017-02-20 11:38', 'T'))
(Period('2017-02-20 11:49', 'T'), Period('2017-02-20 12:03', 'T'))
(Period('2017-02-20 16:25', 'T'), Period('2017-02-20 17:21', 'T'))
(Period('2017-02-25 11:05', 'T'), Period('2017-02-25 11:05', 'T'))
(Period('2017-02-28 11:37', 'T'), Period('2017-02-28 11:41', 'T'))
anomal periods detected by SPE metric are: 
(Period('2017-01-16 10:22', 'T'), Period('2017-01-16 10:32', 'T'))
(Period('2017-01-18 09:50', 'T'), Period('2017-01-18 10:09', 'T'))
(Period('2017-01-18 10:44', 'T'), Period('