# In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

def load_data(data_dir='C:/Users/Admin/Desktop/NN_Assignment2/data'):
    """Load the train/test feature and target tables from CSV files.

    Args:
        data_dir: Directory that contains ``X_train.csv``, ``X_test.csv``,
            ``y_train.csv`` and ``y_test.csv``. Defaults to the location that
            was previously hard-coded, so existing ``load_data()`` calls keep
            working unchanged.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` where each element is
        the DataFrame produced by ``pd.read_csv`` for the matching file.
    """
    # Imported locally so the function stays self-contained and does not
    # depend on a file-level import that this cell does not have.
    from pathlib import Path

    base_dir = Path(data_dir)
    # Order matters: callers unpack the result positionally.
    split_names = ('X_train', 'X_test', 'y_train', 'y_test')
    return tuple(pd.read_csv(base_dir / f'{name}.csv') for name in split_names)

# Read the four CSV splits, then stack train and test row-wise (train rows
# first) into one combined feature table X and one combined target table y.
X_train, X_test, y_train, y_test = load_data()
X, y = (
    pd.concat(split_pair, axis=0)
    for split_pair in ([X_train, X_test], [y_train, y_test])
)

#LOGISTIC REGRESSION
# param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
# log_reg = LogisticRegression(class_weight='balanced')
# grid = GridSearchCV(log_reg, param_grid, cv=5)
# grid.fit(X_train, y_train)
# print(grid.best_params_)


#NEURAL NETWORK
# param_grid = {
#     'hidden_layer_sizes': [(50,), (100,), (100, 50)],
#     'learning_rate_init': [0.01, 0.001, 0.0001],
# }
# mlp = MLPRegressor(max_iter=2000)
# grid_search = GridSearchCV(mlp, param_grid, cv=5)
# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)


# FUSION: stack several regressors and tune their hyperparameters jointly.
# The tuple names ("rf", "svr", "mlp", ...) are the prefixes used for the
# corresponding keys in param_grid below.
base_models = [
    ("lr", LinearRegression()),
    ("rf", RandomForestRegressor(random_state=42)),
    ("svr", SVR()),
    ("mlp", MLPRegressor(random_state=42)),
]

# Define the stacking model: a random forest combines the base models'
# predictions into the final prediction.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=RandomForestRegressor(random_state=42),
)

# Parameter grid for tuning the base models inside the stack.
# 3 * 3 * 3 * 2 * 2 * 2 = 216 candidate combinations.
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [10, 20, None],
    'svr__C': [0.1, 1, 10],
    'svr__epsilon': [0.1, 0.2],
    'mlp__hidden_layer_sizes': [(100,), (100, 50)],
    'mlp__learning_rate_init': [0.001, 0.01],
}

# The targets come from read_csv as single-column DataFrames. Passing that
# column vector straight to fit() makes scikit-learn emit the
# "y = column_or_1d(y, warn=True)" conversion warning on every fit, so
# flatten them to 1-D arrays once up front.
y_train_1d = y_train.to_numpy().ravel()
y_test_1d = y_test.to_numpy().ravel()

# Tune hyperparameters with GridSearchCV (5-fold CV, all available cores).
grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train_1d)

# Print the best parameter combination.
print(f"Best parameters found: {grid_search.best_params_}")

# Evaluate the model refit with the best parameters on the held-out test set
# (score() is the regressor's default metric, R^2).
best_model = grid_search.best_estimator_
score = best_model.score(X_test, y_test_1d)
print(f"Test set score: {score:.4f}")


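# Output noted by the author (NOTE: rf__max_depth here is 10, which differs
# from the captured cell output below, where it is None):
# Best parameters found: {'mlp__hidden_layer_sizes': (100,), 'mlp__learning_rate_init': 0.001, 'rf__max_depth': 10, 'rf__n_estimators': 200, 'svr__C': 10, 'svr__epsilon': 0.1}
#
# Captured cell output:
# Fitting 5 folds for each of 216 candidates, totalling 1080 fits
#
# Warning fragment emitted while fitting (y was passed as a column vector):
#   y = column_or_1d(y, warn=True)
#
# Best parameters found: {'mlp__hidden_layer_sizes': (100,), 'mlp__learning_rate_init': 0.001, 'rf__max_depth': None, 'rf__n_estimators': 200, 'svr__C': 10, 'svr__epsilon': 0.1}
# Test set score: 0.5043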
