In [None]:
import itertools
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.svm as svm
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
### Load data
df = pd.read_csv('./process_data/train.csv')
keys = np.array(df.columns)
# Column name -> raw value array, kept so later code can look columns up by name.
dic = {colname: df.loc[:, colname].values for colname in df}

# Build the feature matrix and the label vector.
# Columns 1 .. n-2 are the features and the last column is the label; column 0
# is skipped (presumably an id/index column -- confirm against train.csv).
data_set = np.column_stack([dic[name] for name in keys[1:-1]])   # features
label_set = dic[keys[-1]]                                        # labels

# Sort the samples by the first feature in descending order.
# The SAME permutation must be applied to the labels: reordering only the
# feature rows would pair every sample with another sample's label.
order = np.argsort(-data_set[:, 0], kind='stable')
data_set = data_set[order]
label_set = label_set[order]

# Dataset split.
# shuffle=False keeps the sorted order, so the held-out parts are the tail of
# the sorted data. NOTE(review): confirm this ordered split is intentional.
# Train / test split at a 7:3 ratio.
x_train, x_test, y_train, y_test = train_test_split(
    data_set, label_set, test_size=0.3, shuffle=False)
# One fifth of the training set becomes the validation set, the same
# proportion as in 5-fold cross-validation.
xx_train, xx_val, yy_train, yy_val = train_test_split(
    x_train, y_train, test_size=0.2, shuffle=False)

In [None]:
# Data preprocessing
## Standardize every split with statistics learned from training data only.
## Held-out data (validation / test) is transformed with the scaler fitted on
## the corresponding training data; fitting a scaler on held-out data would
## scale it with its own mean/variance and make it inconsistent with what the
## model was trained on.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)         # full training set
x_test = scaler.transform(x_test)               # test set, scaled with full-training statistics

val_scaler = StandardScaler()
xx_train = val_scaler.fit_transform(xx_train)   # training set without the validation part
xx_val = val_scaler.transform(xx_val)           # validation set, scaled with xx_train statistics

In [None]:
# Feature selection based on the Fisher criterion
feature_num = [5, 10, 20, 30, 50, xx_train.shape[1]]   # candidate feature counts
Model = svm.SVC(kernel='rbf', C=1, gamma=0.01)

# Split the training samples by class with boolean masks
# (np.mat was removed in NumPy 2.0, and plain 2-D arrays are all that is needed).
labels = np.asarray(yy_train).ravel()
X_train_0 = xx_train[labels == 0]
X_train_1 = xx_train[labels == 1]

# Per-feature Fisher score:
#     J = (mu_0 - mu_1)^2 / (var_0 + var_1)
# np.var already returns the variance (sigma^2), so it must not be squared again.
mu_0 = X_train_0.mean(axis=0)
mu_1 = X_train_1.mean(axis=0)
var_0 = X_train_0.var(axis=0)
var_1 = X_train_1.var(axis=0)

between = (mu_0 - mu_1) ** 2
within = var_0 + var_1
# A feature with zero within-class variance would give 0/0 = NaN (or x/0 = inf);
# np.argsort places NaN last, i.e. it would be ranked as the best feature.
# Such features get a score of 0 instead.
J_fisher = np.divide(
    between, within,
    out=np.zeros_like(between, dtype=float),
    where=within > 0,
).tolist()

In [None]:
# For each target feature count k, train on the k features with the highest
# Fisher score and report the accuracy.
# The ranking does not depend on k, so it is computed once (ascending order:
# the best features are at the end).
J_f = np.argsort(J_fisher)

for i in feature_num:
    feature_f = J_f[-i:]

    # Accuracy with k features. X_test_fselected holds the VALIDATION split,
    # not the test set; the name is kept as in the original notebook.
    X_train_fselected = xx_train[:, feature_f]
    X_test_fselected = xx_val[:, feature_f]
    Model.fit(X_train_fselected, yy_train)
    Y_pred_f_v = Model.predict(X_test_fselected)
    Y_pred_f_t = Model.predict(X_train_fselected)
    # scikit-learn metrics take (y_true, y_pred) in this order.
    acc_f_v = accuracy_score(yy_val, Y_pred_f_v)
    acc_f_t = accuracy_score(yy_train, Y_pred_f_t)

    print(str(i),"features, train acc：", acc_f_t)
    print(str(i),"features, val acc：", acc_f_v)
    print(str(i),"features index",feature_f)

In [None]:
# Choose the features according to the validation results.
# The ranking is recomputed from J_fisher so this cell does not depend on a
# variable left over from the loop in the previous cell.
N_SELECTED_FEATURES = 50   # feature count picked from the validation sweep
feature = np.argsort(J_fisher)[-N_SELECTED_FEATURES:]

# NOTE: this overwrites x_train / x_test, so the cell is not idempotent --
# re-run from the standardization cell before executing it a second time.
x_train = x_train[:, feature]
x_test = x_test[:, feature]

In [None]:
# Model training and grid search over the hyper-parameters

# Stratified k-fold split
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=233)

# Parameter grid for the 'svm' step of the pipeline.
# The original gamma list contained 0.005 twice; the progression
# 0.001, 0.005, 0.01, ?, 0.1, 1 indicates the fourth value was meant to be 0.05.
params = {
    'svm__gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 1],
    'svm__C': [1, 3, 5, 10, 20],
}

# The scaler is part of the pipeline, so inside cross-validation it is fitted
# on the training folds only.
clf = Pipeline([('ss', StandardScaler()), ('svm', svm.SVC(kernel='rbf'))])

# Single-metric search: the metric is stated explicitly via `scoring`, and the
# best configuration is refitted on the whole training set.
gs = GridSearchCV(clf, params, scoring='accuracy', refit=True, verbose=1, cv=skf)
gs.fit(x_train, y_train)
print('best params: ',gs.best_params_)
print('best score: ', gs.best_score_)

In [None]:
# Save the grid-search results.
# np.save does not create missing directories, so make sure the target exists.
Path('./result').mkdir(parents=True, exist_ok=True)
# cv_results_ is a dict, so it is stored as a pickled object array; read it back
# with np.load('./result/svm_cv_results.npy', allow_pickle=True).item().
np.save('./result/svm_cv_results.npy', gs.cv_results_)

In [None]:
## Compare the results of the five evaluation metrics one by one
idx = dict()             # empty mapping, filled in by later code
res = gs.cv_results_     # shorthand for the grid-search result table

In [None]:
### Model prediction
## Parameter set: {'svm__C': 1, 'svm__gamma': 0.01}
# fit() returns the estimator itself, so construction and training are chained.
model1 = svm.SVC(kernel='rbf', C=1, gamma=0.01).fit(x_train, y_train)
y_pred1 = model1.predict(x_test)