In [1]:
import numpy as np
import pandas as pd
from data import read_data, removing_duplicates
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from itertools import product
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
from sklearn.exceptions import ConvergenceWarning

# 忽略警告消息
warnings.filterwarnings("ignore", category=ConvergenceWarning)


### read data

In [2]:
f = "./data/1-s2.0-S1364032122009844-mmc1.xlsx"
df = read_data(f)
df = removing_duplicates(df)
df

Unnamed: 0,Brine,%wt of salt,CaBr2,CaCl2,K2CO3,KBr,KCl,KHCOO,MgBr2,MgCl2,...,Ca⁺⁺,Mg⁺⁺,Cl⁻,Br⁻,I⁻,HCOO⁻,CO₃⁻⁻,SO₄⁻⁻,P (MPa),T (K)
0,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,2.69,268.30
1,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,3.53,271.05
2,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,4.50,273.25
3,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,5.29,274.75
4,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,5.98,275.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,ZnBr2,15,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.00000,0.027843,0.0,0.0,0.0,0.0,6.14,278.80
983,ZnBr2,15,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.00000,0.027843,0.0,0.0,0.0,0.0,6.88,279.80
984,ZnBr2,15,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.00000,0.027843,0.0,0.0,0.0,0.0,7.80,280.90
985,ZnBr2,15,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.00000,0.027843,0.0,0.0,0.0,0.0,8.27,281.50


In [3]:
subset_columns = df.columns[:30]
subset_columns

Index(['Brine', '%wt of salt', 'CaBr2', 'CaCl2', 'K2CO3', 'KBr', 'KCl',
       'KHCOO', 'MgBr2', 'MgCl2', 'Na2SO4', 'NaBr', 'NaCl', 'NaHCOO', 'NaI',
       'NH4Cl', 'ZnBr2', 'Na⁺', 'K⁺', 'NH₄⁺', 'Zn⁺', 'Ca⁺⁺', 'Mg⁺⁺', 'Cl⁻',
       'Br⁻', 'I⁻', 'HCOO⁻', 'CO₃⁻⁻', 'SO₄⁻⁻', 'P (MPa)'],
      dtype='object')

In [4]:
duplicate_mask = df.duplicated(subset=subset_columns, keep='first')
df = df[~duplicate_mask]
df

Unnamed: 0,Brine,%wt of salt,CaBr2,CaCl2,K2CO3,KBr,KCl,KHCOO,MgBr2,MgCl2,...,Ca⁺⁺,Mg⁺⁺,Cl⁻,Br⁻,I⁻,HCOO⁻,CO₃⁻⁻,SO₄⁻⁻,P (MPa),T (K)
0,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,2.69,268.30
1,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,3.53,271.05
2,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,4.50,273.25
3,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,5.29,274.75
4,NaCl,11.731824,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.03936,0.000000,0.0,0.0,0.0,0.0,5.98,275.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,ZnBr2,15,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.00000,0.027843,0.0,0.0,0.0,0.0,6.14,278.80
983,ZnBr2,15,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.00000,0.027843,0.0,0.0,0.0,0.0,6.88,279.80
984,ZnBr2,15,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.00000,0.027843,0.0,0.0,0.0,0.0,7.80,280.90
985,ZnBr2,15,0.0,0.0,0,0,0.0,0,0.0,0.0,...,0.0,0.0,0.00000,0.027843,0.0,0.0,0.0,0.0,8.27,281.50


### spliting additional testing data

In [5]:
df_new = df[df.iloc[:, 0] != 'MgBr2']
df_mgbr2 = df[df.iloc[:, 0] == 'MgBr2']

### features and target selection

In [6]:
select_columns = ['Na⁺','K⁺', 'NH₄⁺', 'Zn⁺', 'Ca⁺⁺', 
             'Mg⁺⁺', 'Cl⁻', 'Br⁻', 'I⁻', 'HCOO⁻',
             'CO₃⁻⁻', 'SO₄⁻⁻', 'P (MPa)','T (K)']
df_new = df_new[select_columns]
df_mgbr2 = df_mgbr2[select_columns]

X = df_new[select_columns[:-1]]
y = df_new[select_columns[-1]]

X_mgbr2 = df_mgbr2[select_columns[:-1]]
y_mgbr2 = df_mgbr2[select_columns[-1]]
X,y

(         Na⁺   K⁺  NH₄⁺       Zn⁺  Ca⁺⁺  Mg⁺⁺      Cl⁻       Br⁻   I⁻  HCOO⁻  \
 0    0.03936  0.0   0.0  0.000000   0.0   0.0  0.03936  0.000000  0.0    0.0   
 1    0.03936  0.0   0.0  0.000000   0.0   0.0  0.03936  0.000000  0.0    0.0   
 2    0.03936  0.0   0.0  0.000000   0.0   0.0  0.03936  0.000000  0.0    0.0   
 3    0.03936  0.0   0.0  0.000000   0.0   0.0  0.03936  0.000000  0.0    0.0   
 4    0.03936  0.0   0.0  0.000000   0.0   0.0  0.03936  0.000000  0.0    0.0   
 ..       ...  ...   ...       ...   ...   ...      ...       ...  ...    ...   
 982  0.00000  0.0   0.0  0.013921   0.0   0.0  0.00000  0.027843  0.0    0.0   
 983  0.00000  0.0   0.0  0.013921   0.0   0.0  0.00000  0.027843  0.0    0.0   
 984  0.00000  0.0   0.0  0.013921   0.0   0.0  0.00000  0.027843  0.0    0.0   
 985  0.00000  0.0   0.0  0.013921   0.0   0.0  0.00000  0.027843  0.0    0.0   
 986  0.00000  0.0   0.0  0.013921   0.0   0.0  0.00000  0.027843  0.0    0.0   
 
      CO₃⁻⁻  SO₄⁻⁻  P (MPa

### Data splitting

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=22)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((749, 13), (749,), (188, 13), (188,))

### A 5-fold Cross-Validation (CV) 

In [8]:
# import warnings
# from sklearn.exceptions import ConvergenceWarning

# # 忽略警告消息
# warnings.filterwarnings("ignore", category=ConvergenceWarning)

# # 定义MLP回归器
# mlp = MLPRegressor()
# # 定义参数网格
# param_grid = {
#     'hidden_layer_sizes': [(n,) for n in range(1, 11)] + [(n, m) for n in range(1, 11) for m in range(1, 11)],
#     # 'activation': ['relu', 'tanh', 'logistic'],
#     # 'solver': ['adam', 'sgd'],
#     # 'learning_rate_init': [0.001, 0.01, 0.1],  # 初始学习率
# }

# # 创建GridSearchCV对象
# grid_search = GridSearchCV(mlp, param_grid, cv=5,scoring='neg_root_mean_squared_error')
# grid_search.fit(X, y)
# # 打印最佳参数
# print("Best parameters found:",grid_search.best_params_)
# print("Best parameters score:",grid_search.best_score_)

# # # 创建RandomizedSearchCV对象
# # random_search = RandomizedSearchCV(mlp, param_grid, n_iter=20,cv=5,scoring='neg_root_mean_squared_error')
# # random_search.fit(X, y)
# # # 打印最佳参数
# # print("Best parameters found:",random_search.best_params_)
# # print("Best parameters score:",random_search.best_score_)

# # # 创建RandomizedSearchCV对象
# # bayes_search = BayesSearchCV(mlp, param_grid, n_iter=20,cv=5,scoring='neg_root_mean_squared_error')
# # bayes_search.fit(X, y)
# # # 打印最佳参数
# # print("Best parameters found:",bayes_search.best_params_)
# # print("Best parameters score:",bayes_search.best_score_)



### 缩放数据

In [9]:
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.fit_transform(x_test)
x_train

array([[-0.66999636, -0.34838334, -0.15562204, ..., -0.09107537,
        -0.11632626, -0.39502518],
       [-0.66999636,  3.49911017, -0.15562204, ..., -0.09107537,
        -0.11632626,  0.54420475],
       [-0.66999636, -0.34838334, -0.15562204, ..., -0.09107537,
        -0.11632626, -0.42731389],
       ...,
       [-0.29724053, -0.34838334, -0.15562204, ..., -0.09107537,
        -0.11632626, -0.53932427],
       [ 0.63715809, -0.34838334, -0.15562204, ..., -0.09107537,
        -0.11632626, -0.24186812],
       [-0.66999636, -0.34838334, -0.15562204, ..., -0.09107537,
        -0.11632626, -0.42102759]])

### 训练

In [None]:
# 定义MLP回归器
mlp = MLPRegressor()

# 进行20次试验
num_trials = 20
rmse_scores = []

for i in range(num_trials):
    # 拟合模型
    mlp.fit(X, y)
    # 使用5折交叉验证计算RMSE
    scores = cross_val_score(mlp, X, y, cv=5, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-scores.mean())
    print(f"{i}----{rmse}")
    rmse_scores.append(np.sqrt(-scores.mean()))

# 打印每次试验的RMSE
print("RMSE from 20 trials:")
for i, rmse in enumerate(rmse_scores, 1):
    print(f"Trial {i}: {rmse}")

# 计算平均RMSE
avg_rmse = np.mean(rmse_scores)
print(f"\nAverage RMSE across 20 trials: {avg_rmse}")

0----269.594497991081
1----272.2410724710554
