In [7]:
import pandas as pd

# Positional index of the first feature column in every spreadsheet.
# NOTE(review): presumably the columns before it hold identifiers and the
# D / P / H targets -- confirm against the Excel files.
FEATURE_START_COL = 6

# Names of the three regression targets read from each spreadsheet.
TARGET_COLS = ("D", "P", "H")


def _split_features_targets(frame):
    """Split one dataset frame into its feature matrix and target columns.

    Parameters:
    - frame: DataFrame read from one of the dataset spreadsheets.

    Returns:
    - A 4-tuple ``(X, y_D, y_P, y_H)`` where ``X`` is every column from
      position ``FEATURE_START_COL`` onwards and the ``y_*`` entries are the
      columns named in ``TARGET_COLS``, in that order.
    """
    features = frame.iloc[:, FEATURE_START_COL:]
    return (features, *(frame[name] for name in TARGET_COLS))


# Raw datasets: training split, held-out test split, and a third "out" set.
train = pd.read_excel("../../Dataset/train_data.xlsx", index_col=False)
test = pd.read_excel("../../Dataset/test_data.xlsx", index_col=False)
out_test = pd.read_excel("../../Dataset/data_out_feats.xlsx", index_col=False)

# The same split is applied to all three frames so that positional feature
# indices selected on the training set line up in the other two.
X_train, y_train_D, y_train_P, y_train_H = _split_features_targets(train)
X_test, y_test_D, y_test_P, y_test_H = _split_features_targets(test)
X_out, y_out_D, y_out_P, y_out_H = _split_features_targets(out_test)

In [8]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


def evaluate_model(y_true, y_pred):
    """Score predictions against ground truth.

    Parameters:
    - y_true: observed target values
    - y_pred: predicted target values

    Returns:
    - (r2, mae, rmse): coefficient of determination, mean absolute error and
      root mean squared error, in that order.
    """
    # RMSE is derived from the MSE rather than requested from sklearn directly.
    root_mse = mean_squared_error(y_true, y_pred) ** 0.5
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        root_mse,
    )

In [9]:
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

import warnings

warnings.filterwarnings("ignore")


def l1_feature_selection_with_cv(X_train, y_train, X_test, y_test, X_out, y_out, cv=5):
    """
    Select features with L1-regularised (Lasso) regression and score a refit model.

    Pipeline:
    1. ``LassoCV`` with ``cv`` folds chooses the regularisation strength.
    2. A ``Lasso`` at that alpha is fit on all features; ``SelectFromModel``
       with ``threshold="mean"`` picks the features to keep from it.
    3. A second ``Lasso`` with the same alpha is fit on the selected columns
       only and used to predict the train, test and out sets.
    The best alpha, selected indices, their coefficients and the R2 / MAE /
    RMSE of each set are printed along the way.

    Parameters:
    - X_train: training features (DataFrame; columns are selected with ``.iloc``)
    - y_train: training target values
    - X_test: test features, same column layout as ``X_train``
    - y_test: test target values
    - X_out: features of the additional "out" evaluation set, same column layout
    - y_out: target values of the "out" set
    - cv: number of cross-validation folds passed to ``LassoCV``

    Returns a 6-tuple:
    - selected_features: positional indices of the selected feature columns
    - selected_coefficients: coefficients of those features taken from the
      all-features model of step 2 (i.e. before the refit of step 3)
    - best_alpha: regularisation strength chosen by cross-validation
    - train_results: DataFrame with columns ``y_train`` / ``y_train_pred``
    - test_results: DataFrame with columns ``y_test`` / ``y_test_pred``
    - out_results: DataFrame with columns ``y_out`` / ``y_out_pred``
    """
    # Step 1: cross-validated search for the best alpha.
    alpha_search = LassoCV(cv=cv, random_state=42)
    alpha_search.fit(X_train, y_train)

    best_alpha = alpha_search.alpha_
    print(f"Best alpha: {best_alpha}")

    # Step 2: fit on every feature at the chosen alpha; this model is used
    # only for feature selection and coefficient reporting.
    selection_model = Lasso(alpha=best_alpha)
    selection_model.fit(X_train, y_train)

    selector = SelectFromModel(selection_model, prefit=True, threshold="mean")
    selected_features = selector.get_support(indices=True)
    print(f"Selected Features: {selected_features}")

    selected_coefficients = selection_model.coef_[selected_features]
    print(f"Coefficients of Selected Features: {selected_coefficients}")

    # Step 3: refit on the reduced feature set with a separate estimator so
    # the model wrapped by the prefit selector is not overwritten.
    X_train_selected = X_train.iloc[:, selected_features]
    X_test_selected = X_test.iloc[:, selected_features]
    X_out_selected = X_out.iloc[:, selected_features]

    final_model = Lasso(alpha=best_alpha)
    final_model.fit(X_train_selected, y_train)

    train_y_pred = final_model.predict(X_train_selected)
    test_y_pred = final_model.predict(X_test_selected)
    out_y_pred = final_model.predict(X_out_selected)

    # Score the refit model on all three sets.
    train_r2, train_mae, train_rmse = evaluate_model(y_train, train_y_pred)
    test_r2, test_mae, test_rmse = evaluate_model(y_test, test_y_pred)
    out_r2, out_mae, out_rmse = evaluate_model(y_out, out_y_pred)

    # Observed vs. predicted tables, one per set, for export by the caller.
    train_results = pd.DataFrame({"y_train": y_train, "y_train_pred": train_y_pred})
    test_results = pd.DataFrame({"y_test": y_test, "y_test_pred": test_y_pred})
    out_results = pd.DataFrame({"y_out": y_out, "y_out_pred": out_y_pred})

    print(f"train R2: {train_r2}, train MAE: {train_mae}, train RMSE: {train_rmse}")
    print(f"test R2: {test_r2}, test MAE: {test_mae}, test RMSE: {test_rmse}")
    print(f"out R2: {out_r2}, out MAE: {out_mae}, out RMSE: {out_rmse}")

    return (
        selected_features,
        selected_coefficients,
        best_alpha,
        train_results,
        test_results,
        out_results,
    )

In [None]:
# Run the Lasso selection pipeline for target D.
(
    selected_features,
    selected_coefficients,
    best_alpha,
    train_results,
    test_results,
    out_results,
) = l1_feature_selection_with_cv(X_train, y_train_D, X_test, y_test_D, X_out, y_out_D)

# Export observed-vs-predicted tables for each data set.
train_results.to_excel("./results/Lasso/train_results_D.xlsx", index=False, header=True)
test_results.to_excel("./results/Lasso/test_results_D.xlsx", index=False, header=True)
out_results.to_excel("./results/Lasso/out_results_D.xlsx", index=False, header=True)

# Pair each selected column name with its reported coefficient and export.
feature_coefficients = pd.DataFrame(
    {"Feature": X_train.columns[selected_features], "Coefficient": selected_coefficients}
)
feature_coefficients.to_excel(
    "./results/Lasso/selected_features_coefficients_D.xlsx", index=False, header=True
)

Best alpha: 0.0012204903411801365
Selected Features: [ 6 12 13 17 19 20 25 26 28 32 33 45 46 48 50 54 55 58 60 62 71 72 73 77
 78 79 81 83 85 86 89 92 93]
Coefficients of Selected Features: [-0.31611262 -0.44218375  0.34854309 -0.4022368   0.40412775 -0.31305272
 -1.64044945  0.39752852 -0.41984881  1.10571709 -0.4692407  -0.27037679
  0.6490468   0.24841222  0.55792424  0.67695763  0.31765116  0.29548998
  0.32261685  0.22779793  0.37981869 -0.2512866  -0.37326023 -0.56415672
 -0.46805227 -0.57132835 -0.2441545   1.00011683  0.58648943  0.65228033
 -1.18457396  0.63975824 -0.61247576]
train R2: 0.8078441754753761, train MAE: 0.6011670328633961, train RMSE: 0.8151494204693296
test R2: 0.7534225222243127, test MAE: 0.6086407417363023, test RMSE: 0.8878491619717246
out R2: 0.3386652098357966, out MAE: 1.0049193138389303, out RMSE: 1.1477683650133428


In [None]:
# Run the Lasso selection pipeline for target P.
(
    selected_features,
    selected_coefficients,
    best_alpha,
    train_results,
    test_results,
    out_results,
) = l1_feature_selection_with_cv(X_train, y_train_P, X_test, y_test_P, X_out, y_out_P)

# Export observed-vs-predicted tables for each data set.
train_results.to_excel("./results/Lasso/train_results_P.xlsx", index=False, header=True)
test_results.to_excel("./results/Lasso/test_results_P.xlsx", index=False, header=True)
out_results.to_excel("./results/Lasso/out_results_P.xlsx", index=False, header=True)

# Pair each selected column name with its reported coefficient and export.
feature_coefficients = pd.DataFrame(
    {"Feature": X_train.columns[selected_features], "Coefficient": selected_coefficients}
)
feature_coefficients.to_excel(
    "./results/Lasso/selected_features_coefficients_P.xlsx", index=False, header=True
)

Best alpha: 0.05142123287409986
Selected Features: [ 0 11 12 18 19 21 30 34 35 36 38 39 41 43 44 45 49 60 62 67 74 79 80 85
 92 96]
Coefficients of Selected Features: [ 1.20698754  0.2466172   0.78864278  0.16123692 -0.45825951  0.39159757
 -0.43255517  0.23714512  0.22656345 -0.53290348  0.26454191  0.49268141
 -0.28966727  0.4648294   0.45631478 -0.9838408   0.35236097  1.65276215
 -0.52450722  0.17284749 -0.16350044 -1.03906181 -0.7257738  -0.25349884
  0.20767455 -0.21025016]
train R2: 0.5780033085485334, train MAE: 2.1287409782415745, train RMSE: 2.816726570596336
test R2: 0.5097697012343806, test MAE: 2.0704019996576295, test RMSE: 2.9079514587407247
out R2: 0.5808903100142653, out MAE: 2.378884532945774, out RMSE: 3.404547699169001


In [None]:
# Run the Lasso selection pipeline for target H.
(
    selected_features,
    selected_coefficients,
    best_alpha,
    train_results,
    test_results,
    out_results,
) = l1_feature_selection_with_cv(X_train, y_train_H, X_test, y_test_H, X_out, y_out_H)

# Export observed-vs-predicted tables for each data set.
train_results.to_excel("./results/Lasso/train_results_H.xlsx", index=False, header=True)
test_results.to_excel("./results/Lasso/test_results_H.xlsx", index=False, header=True)
out_results.to_excel("./results/Lasso/out_results_H.xlsx", index=False, header=True)

# Pair each selected column name with its reported coefficient and export.
feature_coefficients = pd.DataFrame(
    {"Feature": X_train.columns[selected_features], "Coefficient": selected_coefficients}
)
# Written next to the other Lasso outputs; this cell previously targeted
# "../../Results/Lasso/", unlike every other export in the notebook.
feature_coefficients.to_excel(
    "./results/Lasso/selected_features_coefficients_H.xlsx", index=False, header=True
)

Best alpha: 0.060558698179302164
Selected Features: [ 0  8  9 10 15 19 22 24 30 42 47 52 57 61 62 79 87 88 90 93 98]
Coefficients of Selected Features: [ 0.38438717 -0.84506879 -1.23240372  0.20898222  0.26521803 -0.62872808
 -0.24262586  0.59168151 -1.90032622 -0.19473785  0.15916315 -0.35762769
 -0.25659667 -0.28489396 -0.54666604 -0.70064359  0.94339126  3.33376038
  0.46541222 -0.45147241  0.6190274 ]
train R2: 0.7687902906673045, train MAE: 1.9272656751257544, train RMSE: 2.799799331612
test R2: 0.7477631605715674, test MAE: 1.6721973959971608, test RMSE: 2.235307372487668
out R2: 0.833532233208437, out MAE: 2.272596645277878, out RMSE: 3.0060354081007676
