In [1]:
# import modules
import pandas
from sklearn import svm
import tensorflow as tf
import numpy as np
from sklearn.decomposition import PCA
from pylab import *
import struct
import keras as ks
import logging
from keras.layers import Dense, Activation, Flatten, Convolution2D
from keras.utils import np_utils
from keras.models import model_from_json
from keras import backend as K

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

import matplotlib.pyplot as plt
from skimage import io
import numpy as np
from PIL import Image 
from scipy import misc
import os
# ...

Using TensorFlow backend.


## 数据读取和预处理

In [3]:
# Combined table: 3888 rows * 51 columns, 4 of which are labels
host_csv_paths = [
    "data/host10280-labeled.csv",
    "data/host10274-labeled.csv",
    "data/host10283-labeled.csv",
]
data = None
for csv_path in host_csv_paths:
    data_raw = pandas.read_csv(csv_path)
    host_rows = np.array(data_raw)
    # Stack each host's rows beneath the rows collected so far.
    data = host_rows if data is None else np.concatenate((data, host_rows), axis=0)

# Column layout of each row: 51 = 2 ids + 45 features + 4 labels
#   ids    : host, clock
#   labels : normal, cpu, mem, io (one column each)
feature_columns = slice(2, 47)
label_columns = slice(47, 51)
data_features = data[:, feature_columns]
data_labels = data[:, label_columns]

# Collapse every 4-column label row into one class id (0 ~ 3) by weighting
# each label column with its position.
class_ids = [0, 1, 2, 3]
data_class = np.array(
    [np.dot(class_ids, label_row) for label_row in data_labels],
    dtype="int",
)
# Samples per class recorded for this data set:
#   cpu   : 180
#   mem   : 180
#   io    : 180
#   normal: 3348

# Down-sample the "normal" class (class 0) to at most 180 rows so that it is
# balanced against the cpu / mem / io classes. The rows are picked at an even
# stride through the normal samples (not randomly), so the selection is the
# same on every run.
n_normal_keep = 180
data_tmp = data_features[data_class == 0]
# Use an integer stride: with true division the step is a float, np.arange
# then returns floats, and NumPy rejects float arrays as indices.
# max(..., 1) keeps the step valid when fewer rows than requested exist.
normal_stride = max(data_tmp.shape[0] // n_normal_keep, 1)
index_tmp = np.arange(0, data_tmp.shape[0], normal_stride)[0:n_normal_keep]
data_normal_tmp = data_tmp[index_tmp]
# Number of normal rows actually kept; drives the label/class padding below.
n_normal = data_normal_tmp.shape[0]

# Rebuild the data set as [kept normal rows] followed by [all non-normal rows].
# With the recorded class counts this gives:
#   features : 720(180*4) * 45
#   labels   : 720(180*4) * 4
#   class    : 720(180*4) * 1
is_abnormal = data_class != 0
data_features = np.concatenate((data_normal_tmp, data_features[is_abnormal]), axis=0)
data_labels = np.concatenate((np.array([[1, 0, 0, 0]] * n_normal), data_labels[is_abnormal]), axis=0)
data_class = np.concatenate((np.array([0] * n_normal, dtype="int"), data_class[is_abnormal]), axis=0)

# Feature scaling: min-max normalise every feature column to the range [0, 1].
# Work on an explicit float array: in-place true division raises on integer
# arrays, and on object arrays the result depends on the element types.
# NOTE(review): the min/max are taken over all samples, i.e. before the
# train/test split performed in the next cell.
data_features = np.array(data_features, dtype="float")
col_min = data_features.min(axis=0)
col_max = data_features.max(axis=0)
is_constant = col_min == col_max
# Constant columns get a dummy span so the division never divides by zero;
# their values are overwritten with 1 right after.
col_span = col_max - col_min
col_span[is_constant] = 1
data_features = (data_features - col_min) / col_span
data_features[:, is_constant] = 1

## 划分训练集和测试集

In [4]:
# Hold out every 5th sample for testing; all remaining samples form the
# training set. The sample count is read from the data instead of being
# hard-coded, and the training indices are produced in ascending order
# rather than relying on the iteration order of a set.
n_samples = data_features.shape[0]
index_all = np.arange(0, n_samples, 1)
index_test = index_all[::5]
index_train = index_all[index_all % 5 != 0]

data_train = data_features[index_train]
labels_train = data_labels[index_train]
class_train = data_class[index_train]
data_test = data_features[index_test]
labels_test = data_labels[index_test]
class_test = data_class[index_test]

# Sanity check: print the shape of every split.
for split_array in (data_train, labels_train, class_train,
                    data_test, labels_test, class_test):
    print(split_array.shape)

(576, 45)
(576, 4)
(576,)
(144, 45)
(144, 4)
(144,)


## SVM

In [5]:
# Baseline: an SVC with scikit-learn's default settings, fitted on the
# training split of the scaled features.
clf = svm.SVC()
clf.fit(data_train, class_train)
# Mean accuracy on the training split first, then on the held-out test split.
for split_features, split_classes in ((data_train, class_train), (data_test, class_test)):
    print(clf.score(split_features, split_classes))

0.954861111111
0.916666666667


## 神经网络
45 => 128 => 64 => 4

In [6]:
# Fully connected classifier: input features => 128 => 64 => 4 class probabilities.
# Dense and Activation are kept as separate layers because the "NN + SVM" cell
# addresses layers by position (model.layers[2] is the 64-unit Dense layer).
model = ks.models.Sequential([
    Dense(128, input_dim=data_features.shape[1]),
    Activation('relu'),
    Dense(64),
    Activation('relu'),
    Dense(4),
    Activation('softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
# `epochs` is the Keras 2 name of the former `nb_epoch` argument.
# The test split is passed as validation data, so the validation metrics
# reported during training are computed on the test set.
# Keeping the returned History object avoids echoing its repr as cell output.
history = model.fit(
    x=data_train,
    y=labels_train,
    batch_size=100,
    epochs=50,
    verbose=1,
    validation_data=(data_test, labels_test),
)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead




Train on 576 samples, validate on 144 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1c23f42a90>

## 神经网络 + SVM
45 => 128 => 64 => SVM

In [22]:
# Use the trained network as a feature extractor and classify its intermediate
# representation with an SVM.
# model.layers[2] is the 64-unit Dense layer, so the extracted features are
# that layer's outputs before the ReLU activation that follows it.
get_feature = K.function([model.layers[0].input], [model.layers[2].output])

train_outputs = get_feature([data_train])
test_outputs = get_feature([data_test])
mid_features_train = train_outputs[0]
mid_features_test = test_outputs[0]

clf = svm.SVC()
clf.fit(mid_features_train, class_train)
# Mean accuracy on the training features first, then on the test features.
for split_features, split_classes in ((mid_features_train, class_train),
                                      (mid_features_test, class_test)):
    print(clf.score(split_features, split_classes))

0.972222222222
0.9375


## 其他机器学习算法
- KNN
- 线性SVM
- 基于核函数的SVM (RBF SVM), Radial basis function kernel
- 决策树
- 随机森林

In [21]:
# Compare several classical classifiers on the same train/test split.
# The tree-based models are randomised; a fixed random_state makes their
# scores reproducible from run to run.
random_state = 0
named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", svm.SVC(kernel="linear", C=0.025)),
    ("RBF SVM", svm.SVC()),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=random_state)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                                             random_state=random_state)),
]
# Parallel lists kept for any code that still refers to them.
names = [pair[0] for pair in named_classifiers]
classifiers = [pair[1] for pair in named_classifiers]

for name, clf in named_classifiers:
    clf.fit(data_train, class_train)
    print(name)
    print("train: {}".format(clf.score(data_train, class_train)))
    print("test: {}".format(clf.score(data_test, class_test)))

Nearest Neighbors
train: 0.991319444444
test: 0.9375
Linear SVM
train: 0.949652777778
test: 0.916666666667
RBF SVM
train: 0.954861111111
test: 0.916666666667
Decision Tree
train: 0.996527777778
test: 0.9375
Random Forest
train: 0.993055555556
test: 0.9375


In [10]:
# Show the test samples the network is unsure about: rows of predicted class
# probabilities in which no class reaches the confidence threshold.
confidence_threshold = 0.9
predictions = model.predict(data_test)
is_uncertain = predictions.max(axis=1) < confidence_threshold
for res in predictions[is_uncertain]:
    print(res)

[ 0.89499915  0.06281427  0.03863219  0.00355432]
[ 0.89310551  0.05416692  0.04868099  0.00404656]
[ 0.89843506  0.05573963  0.04200708  0.00381828]
[ 0.89838797  0.05358611  0.04370584  0.00432   ]
[ 0.10541886  0.00658347  0.0701194   0.81787819]
[ 0.8179487   0.04564025  0.1304263   0.00598465]
[ 0.8330397   0.0987699   0.04428348  0.02390696]
[ 0.23016171  0.75411671  0.01306443  0.00265717]
[ 0.44072691  0.04287855  0.51088846  0.00550609]
[ 0.28720668  0.69722545  0.00956561  0.0060023 ]
[ 0.10918946  0.00907711  0.01240001  0.86933339]
[ 0.35860217  0.62206674  0.00860362  0.0107275 ]
[ 0.08568739  0.01609712  0.88738692  0.01082858]
