In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import itertools
from sklearn.pipeline import Pipeline

In [None]:
# Load the training table. Column 0 is not used as a feature below
# (presumably an id/index column -- TODO confirm); the last column is the target.
df = pd.read_csv('./process_data/train.csv')
keys = np.array(df.columns)
dic = {colname: df.loc[:, colname].values for colname in df}
# Binarise the target: True where the raw value exceeds 1400.
dic[keys[-1]] = dic[keys[-1]] > 1400

# Build the feature matrix (columns 1 .. n-2, one column per feature) and the
# label vector. A single column_stack replaces the repeated np.append calls,
# which copied the whole matrix once per column.
data_set = np.column_stack([dic[key] for key in keys[1:-1]])
label_set = dic[keys[-1]]

# Sort the samples by the first feature in descending order. The SAME
# permutation has to be applied to the labels; reordering only the features
# would pair every row with another sample's label.
order = np.lexsort(-data_set.T[0, None])
data_set = data_set[order]
label_set = label_set[order]

# Sequential (unshuffled) splits: 70/30 train/test, then 80/20 train/validation.
x_train, x_test, y_train, y_test = train_test_split(data_set, label_set, test_size=0.3, shuffle=False)
xx_train, xx_val, yy_train, yy_val = train_test_split(x_train, y_train, test_size=0.2, shuffle=False)

# Fit the scaler on the training part only and reuse its statistics for the
# validation part, so both live in the same feature space and no validation
# information leaks into preprocessing.
scaler = StandardScaler()
xx_train = scaler.fit_transform(xx_train)
xx_val = scaler.transform(xx_val)

In [None]:
# Feature selection based on the Fisher criterion.
# Candidate numbers of features to keep; the last entry means "all features".
feature_num = [5, 10, 20, 30, 40, 50, xx_train.shape[1]]
Model = MLPClassifier(solver= 'adam', activation= 'relu',
        batch_size= 64, learning_rate= 'adaptive', shuffle= True,
        alpha= 0.1, hidden_layer_sizes= 50, learning_rate_init= 1e-4, max_iter=1000)

# Split the training samples by class. Flat index arrays keep the selected
# blocks as ordinary 2-D ndarrays (samples x features); no np.mat is needed.
is_positive = np.asarray(yy_train, dtype=bool)
idx_0 = np.flatnonzero(~is_positive)
idx_1 = np.flatnonzero(is_positive)
X_train_0 = np.asarray(xx_train)[idx_0]
X_train_1 = np.asarray(xx_train)[idx_1]

# Criterion based on between-class / within-class distance (Fisher score),
# computed for every feature at once:
#     J = (mu_0 - mu_1)^2 / (var_0 + var_1)
# np.var already returns the variance (sigma^2), so it must not be squared
# again. (No maximal-information-coefficient criterion is computed here.)
mu_0 = X_train_0.mean(axis=0)
mu_1 = X_train_1.mean(axis=0)
var_0 = X_train_0.var(axis=0)
var_1 = X_train_1.var(axis=0)
with np.errstate(divide='ignore', invalid='ignore'):
    J = (mu_0 - mu_1) ** 2 / (var_0 + var_1)
# 0/0 (a feature that is constant and identical in both classes) carries no
# information: score it 0 instead of NaN, which argsort would rank last,
# i.e. as the "best" feature. x/0 with x > 0 stays +inf (perfect separation).
J = np.where(np.isnan(J), 0.0, J)
J_fisher = J.tolist()

In [None]:
# Evaluate the MLP on the top-k Fisher-ranked features for every candidate k.
# argsort is ascending, so the best-scoring features sit at the END of J_f.
# The ranking does not depend on k, so it is computed once outside the loop.
J_f = np.argsort(J_fisher)
for i in feature_num:
    # Indices of the i highest-scoring features. If i exceeds the number of
    # features, the slice simply returns all of them.
    feature_f = J_f[-i:]

    # Fit on the training split and score on both training and validation
    # splits (X_test_fselected holds the VALIDATION features, not the test set).
    X_train_fselected = xx_train[:,feature_f]
    X_test_fselected = xx_val[:,feature_f]
    Model.fit(X_train_fselected, yy_train)
    Y_pred_f_v = Model.predict(X_test_fselected)
    Y_pred_f_t = Model.predict(X_train_fselected)
    # Conventional (y_true, y_pred) order; accuracy is symmetric, so the
    # reported numbers are unchanged.
    acc_f_v = accuracy_score(yy_val, Y_pred_f_v)
    acc_f_t = accuracy_score(yy_train, Y_pred_f_t)

    print(str(i),"features, train acc：", acc_f_t)
    print(str(i),"features, val acc：", acc_f_v)
    print(str(i),"features index",feature_f)

In [None]:
# Keep the 40 features with the highest Fisher score (the tail of the
# ascending ranking J_f) in both the training and the test matrices.
# NOTE: this overwrites x_train / x_test, so re-running the cell would index
# into the already-reduced matrices.
feature = J_f[-40:]
x_train = np.take(x_train, feature, axis=1)
x_test = np.take(x_test, feature, axis=1)

In [None]:
# Model training

# Stratified 5-fold cross-validation with a fixed shuffle seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=233)

# Hyper-parameter grid for the MLP step of the pipeline (the "mlp__" prefix
# routes each entry to the step named 'mlp').
params = {
    'mlp__alpha': [1e-3, 1e-2, 1e-1, 1],
    'mlp__hidden_layer_sizes': [10, 20, 30, 40, 50],
    'mlp__learning_rate_init': [1e-4, 1e-3, 1e-2],
}

# Standardisation followed by the MLP classifier; being inside the pipeline,
# the scaler is re-fitted on the training part of every CV fold.
mlp_step = MLPClassifier(
    solver='adam',
    activation='relu',
    batch_size=64,
    learning_rate='adaptive',
    shuffle=True,
    max_iter=1000,
)
clf = Pipeline(steps=[('ss', StandardScaler()), ('mlp', mlp_step)])

# All metrics are recorded; the final refit uses the best 'accuracy' setting.
scores = ('accuracy', 'precision', 'f1', 'recall', 'roc_auc')
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=scores,
    refit='accuracy',
    cv=skf,
    verbose=1,
)
gs.fit(x_train, y_train)
print('best params: ', gs.best_params_)
print('best score: ', gs.best_score_)

In [None]:
# Persist the grid-search results object to disk with np.save.
# NOTE(review): cv_results_ is presumably a dict, which np.save would store as
# a pickled object array (np.load then needs allow_pickle=True) -- confirm.
np.save(file='./result/mlp_cv_results.npy', arr=gs.cv_results_)

In [None]:
# Final model: fit on the full training set and predict the held-out test set.
# The scaler is fitted on the training data only; the test data is transformed
# with those SAME statistics. Re-fitting on the test set would standardise it
# with its own mean/variance, putting it in a different feature space from the
# one the model was trained on.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
model1 = MLPClassifier(solver= 'adam', activation= 'relu', batch_size= 64, learning_rate= 'adaptive', shuffle= True, max_iter= 1000,
                alpha= 0.1, hidden_layer_sizes= 50, learning_rate_init= 1e-4)
model1.fit(x_train, y_train)
y_pred1 = model1.predict(x_test)