In [1]:
# 引用用到的文件
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.cluster import KMeans

print("引用文件成功")

引用文件成功


In [2]:
# 加载数据
data = pd.read_excel('./data/cleaned_data.xlsx', sheet_name='Sheet1')

In [3]:
# 加载数据和标签
X = data.drop(columns=["label"])
Y = data["label"]

In [4]:
# 将标签转化为整型
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(Y)

In [5]:
# 将数据变为标准差
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

In [6]:
# 划分数据集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, stratify=y)

In [7]:
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel

# 特征选择（基于 XGBoost 特征重要性）
xgb_for_feature_selection = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    max_depth=6,
    learning_rate=0.1,
    n_estimators=800
)
selector = SelectFromModel(xgb_for_feature_selection, threshold='median')
X_train_selected = selector.fit_transform(X_train, y_train)

# 数据增强（处理类别不平衡）
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_selected, y_train)

# 标准化特征
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)

# 定义优化后的基模型
base_learners = [
    ('xgb', XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        max_depth=5,  # 降低深度
        learning_rate=0.05,  # 降低学习率
        n_estimators=1000,  # 增加树的数量
        min_child_weight=3,
        subsample=0.7,
        colsample_bytree=0.7,
        gamma=0.2,  # 增加正则化
        alpha=0.1,  # 增加 L1 正则化
        reg_lambda=2  # 增加 L2 正则化
    )),
    ('rf', RandomForestClassifier(
        n_estimators=300,  # 增加树的数量
        max_depth=8,  # 限制树的深度
        min_samples_split=5,  # 调整最小样本数
        min_samples_leaf=3,
        max_features='log2',  # 调整特征选择
        class_weight='balanced'
    ))
]

# 定义元学习器（使用逻辑回归）
meta_learner = LogisticRegression()

# 使用 StackingClassifier 构建模型
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

# 交叉验证
scores = cross_val_score(stacking_model, X_train_scaled, y_train_resampled, cv=5, scoring='accuracy')
print(f"交叉验证准确率: {scores.mean():.4f} ± {scores.std():.4f}")



交叉验证准确率: 0.8921 ± 0.0569


In [8]:
# 训练模型
eval_set = [(X_test, y_test)]
stacking_model.fit(X_train, y_train)


In [9]:
# 获取基模型的特征重要性
base_models = stacking_model.estimators_
feature_importances = []

for model in base_models:
    if hasattr(model, 'feature_importances_'):
        feature_importances.append(model.feature_importances_)
    else:
        print(f"Model does not have feature_importances_")

# 如果有多个基模型，计算平均特征重要性
if feature_importances:
    avg_feature_importances = np.mean(feature_importances, axis=0)
else:
    print("No feature_importances_ available from the base models.")
    avg_feature_importances = None

# 获取特征名称
feature_names = X.columns

# 获取top10并进行打印
if avg_feature_importances is not None:
    top_features_idx = avg_feature_importances.argsort()[-10:][::-1]
    top_feature_name = [feature_names[i] for i in top_features_idx]
    print("Top 10 Features:", top_feature_name)
else:
    print("Cannot compute top features due to missing feature_importances_.")


Top 10 Features: ['Gender', '583.1725__499.78', '654.3022__36.15', '697.0139__495.82', '514.3122__240.75', '654.3022__39.86', '171.0068__211.2', '533.0414__35.16', '659.5677__495.82', '721.0651__495.08']


In [10]:
# 预测训练集和测试集
y_train_pred = stacking_model.predict(X_train)
y_test_pred = stacking_model.predict(X_test)

# 计算准确率
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"训练集准确率: {train_accuracy:.4f}")
print(f"测试集准确率: {test_accuracy:.4f}")

训练集准确率: 1.0000
测试集准确率: 0.8936
