In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import sklearn.svm as svm
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [2]:
# 导入数据
df = pd.read_csv('./OnlineNewsPopularity/OnlineNewsPopularity.csv')
keys = np.array(df.columns)
dic = {}
for colname in df:
    arr = df.loc[:,colname].values
    dic[colname] = arr
dic[keys[-1]] = dic[keys[-1]] > 1400

In [3]:
### 数据集划分

# 构建数据集与标签集
data_set = dic[keys[1]].reshape(-1,1)
for i in range(2,len(keys)-1):
    data_set = np.append(data_set, dic[keys[i]].reshape(-1,1), axis = 1)
label_set = dic[keys[-1]]
data_set = data_set[np.lexsort(-data_set.T[0, None])]

# 数据集划分
x_train, x_test, y_train, y_test = train_test_split(data_set, label_set, test_size=0.3, shuffle= False)

In [5]:
### 模型训练

#k折分层抽样
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=233)

#实例化svm模型并进行网格化参数搜索
params = {'mlp__alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], 'mlp__hidden_layer_sizes': [10, 20, 50, 100], 'mlp__activation': ['tanh', 'relu'], 'mlp__solver': ['adam', 'lbfgs', 'sgd']}
clf = Pipeline([('ss', StandardScaler()), ('mlp', MLPClassifier(batch_size= 64, learning_rate= 'adaptive', shuffle= True))])
scores = ('accuracy', 'precision', 'f1', 'recall', 'roc_auc')
gs = GridSearchCV(clf, params, refit= 'accuracy', verbose= 1, cv= skf, scoring= scores)
gs.fit(x_train, y_train)
print('best params: ',gs.best_params_)
print('best score: ', gs.best_score_)
print(gs.cv_results_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

best params:  {'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'sgd'}
best score:  0.6525045045045045
{'mean_fit_time': array([10.80542221,  1.83624573, 10.38318391, 12.95938048,  3.11578979,
       11.31318355, 21.51647806,  7.70801687, 18.85109105, 29.30165524,
       14.33424063, 22.95714965, 11.40391502,  1.65682344, 10.37058053,
       12.95297041,  2.92102261, 11.3348279 , 20.47785802,  6.97348485,
       17.17954416, 27.09633451, 13.42466021, 22.61596718,  9.96793876,
        1.65817542, 10.11603065, 12.8900806 ,  2.9406466 , 11.0442008 ,
       20.99703541,  7.10052767, 17.6589417 , 27.68962483, 13.85109286,
       23.17241249,  9.1537159 ,  1.67849879, 10.56467166, 10.71937218,
        2.9411222 , 11.13545866, 19.51032405,  6.94461694, 17.33006806,
       27.79931993, 14.02309365, 23.38255072,  5.84389997,  1.87908506,
       10.43780284,  8.1781476 ,  3.13515501, 11.84136982, 12.86209764,
        7.37752109, 18.44581342, 17.46805673



In [6]:
np.save('./result/mlp_cv_results_1219_120.npy', gs.cv_results_)

In [7]:
## 依次比较五项评价指标结果
res = gs.cv_results_
idx = {}

In [19]:
# 比较结果，顺次打印当前指标下每项指标的最优匹配参数的结果
key = 'test_accuracy'
#print(res["mean_" + key])
idx[key] = np.argmax(res["mean_" + key])
for k in idx:
    print(k, res["mean_" + key][idx[k]])
    print(gs.cv_results_['params'][idx[k]])

test_accuracy 0.6525045045045045
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'sgd'}
test_precision 0.6484324324324324
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'adam'}
test_f1 0.6461261261261262
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_recall 0.6461261261261262
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_roc_auc 0.6488288288288288
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 10, 'mlp__solver': 'adam'}


In [18]:
# 比较结果
key = 'test_precision'
#print(res["mean_" + key])
idx[key] = np.argmax(res["mean_" + key])
for k in idx:
    print(k, res["mean_" + key][idx[k]])
    print(gs.cv_results_['params'][idx[k]])

test_accuracy 0.6606701593412213
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'sgd'}
test_precision 0.6632516727505953
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'adam'}
test_f1 0.6480590726364474
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_recall 0.6480590726364474
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_roc_auc 0.6568253283564863
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 10, 'mlp__solver': 'adam'}


In [17]:
# 比较结果
key = 'test_f1'
#print(res["mean_" + key])
idx[key] = np.argmax(res["mean_" + key])
for k in idx:
    print(k, res["mean_" + key][idx[k]])
    print(gs.cv_results_['params'][idx[k]])

test_accuracy 0.6572770694413121
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'sgd'}
test_precision 0.6460568652130256
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'adam'}
test_f1 0.6581527631838345
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_recall 0.6581527631838345
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_roc_auc 0.6540419727667239
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 10, 'mlp__solver': 'adam'}


In [16]:
# 比较结果
key = 'test_recall'
#print(res["mean_" + key])
idx[key] = np.argmax(res["mean_" + key])
for k in idx:
    print(k, res["mean_" + key][idx[k]])
    print(gs.cv_results_['params'][idx[k]])

test_accuracy 0.654031117397454
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'sgd'}
test_precision 0.6299858557284299
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'adam'}
test_f1 0.6685997171145687
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_recall 0.6685997171145687
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_roc_auc 0.6514851485148515
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 10, 'mlp__solver': 'adam'}


In [15]:
# 比较结果
key = 'test_roc_auc'
#print(res["mean_" + key])
idx[key] = np.argmax(res["mean_" + key])
for k in idx:
    print(k, res["mean_" + key][idx[k]])
    print(gs.cv_results_['params'][idx[k]])

test_accuracy 0.7082563158173695
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'sgd'}
test_precision 0.7057545153066792
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 20, 'mlp__solver': 'adam'}
test_f1 0.6992716635471672
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_recall 0.6992716635471672
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 100, 'mlp__solver': 'adam'}
test_roc_auc 0.7082985096032433
{'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__hidden_layer_sizes': 10, 'mlp__solver': 'adam'}
