# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from xgboost import XGBClassifier

# Load the dataset
data = pd.read_excel('hechuan ube.xlsx')

# Split the data into features (X) and target variable (Y)
X = data.drop(['ID', 'los', 'elos'], axis=1)
Y = data['elos']

# Convert Y to a 0/1 target, but only apply the median split when it is
# actually necessary (more than two distinct values). A median split on a
# column that is already two-valued sets every label to 0 whenever the median
# equals the larger value, so in that case the larger value is mapped to 1.
if Y.nunique() > 2:
    Y = (Y > Y.median()).astype(int)
else:
    Y = (Y == Y.max()).astype(int)

# Fail early with a clear message instead of a confusing error further down
# when the target ends up with a single class.
if Y.nunique() < 2:
    raise ValueError("Target 'elos' contains a single class after binarisation")

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

# Feature standardisation.
# The scaler is fitted on the real training rows only and applied before
# oversampling: SMOTE chooses neighbours by distance, so unscaled features
# with large magnitudes would otherwise dominate the neighbour search, and
# synthetic rows would influence the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Data augmentation (SMOTE), applied to the training split only so the test
# split contains no synthetic samples.
smote = SMOTE(random_state=42)
X_train_res, Y_train_res = smote.fit_resample(X_train_scaled, Y_train)

# Function to plot ROC curve
def plot_roc_curve(fpr, tpr, roc_auc, label):
    """Draw one ROC curve on the current matplotlib figure.

    Args:
        fpr: False-positive rates (x values of the curve).
        tpr: True-positive rates (y values of the curve).
        roc_auc: Area under the curve; shown in the legend with two decimals.
        label: Model name used as the legend prefix.
    """
    legend_entry = f'{label} (AUC = {roc_auc:.2f})'
    plt.plot(fpr, tpr, label=legend_entry)

# Performance metrics storage: model label -> dict of metric name -> score
metrics = {}

# Initialize the figure that the ROC curves are drawn onto
roc_figure_size = (10, 8)
plt.figure(figsize=roc_figure_size)

# Function to calculate and store metrics
def calculate_metrics(y_true, y_pred, label):
    """Score hard class predictions and record the result in ``metrics``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels (already thresholded).
        label: Key under which the scores are stored in the module-level
            ``metrics`` dict; an existing entry with that key is overwritten.
    """
    # (display name, scoring function) pairs, evaluated in this order.
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    metrics[label] = {name: scorer(y_true, y_pred) for name, scorer in scorers}

# 1. XGBoost (abbreviation: XGB)
xgb_params = {
    'random_state': 42,
    'n_estimators': 180,
    'max_depth': 2,            # shallower trees to reduce overfitting
    'reg_alpha': 8,            # stronger regularisation
    'reg_lambda': 8,           # stronger regularisation
    'min_child_weight': 3,     # higher minimum child weight to reduce overfitting
    'gamma': 1.0,              # higher gamma to reduce overfitting
    'subsample': 0.8,          # lower row sampling rate to add randomness
    'colsample_bytree': 0.8,   # lower feature sampling rate to add randomness
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_res, Y_train_res)

# Probability of the positive class (column 1) for every test row
Y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Adjusted classification threshold: positive only when probability > 0.8
Y_pred_xgb = (Y_pred_proba_xgb > 0.8).astype(int)
calculate_metrics(Y_test, Y_pred_xgb, 'XGB')

# The ROC curve is built from the raw probabilities, not the thresholded labels
fpr_xgb, tpr_xgb, _ = roc_curve(Y_test, Y_pred_proba_xgb)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)
plot_roc_curve(fpr_xgb, tpr_xgb, roc_auc_xgb, 'XGB')

# 2. Gradient Boosting Machine (abbreviation: GBM)
# ``fit`` returns the estimator itself, so construction and training are chained.
gbm_model = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.1,
).fit(X_train_res, Y_train_res)

# Positive-class probabilities, then hard labels using the 0.8 cut-off
Y_pred_proba_gbm = gbm_model.predict_proba(X_test)[:, 1]
Y_pred_gbm = (Y_pred_proba_gbm > 0.8).astype(int)
calculate_metrics(Y_test, Y_pred_gbm, 'GBM')

# ROC curve and AUC from the raw probabilities
fpr_gbm, tpr_gbm, _ = roc_curve(Y_test, Y_pred_proba_gbm)
roc_auc_gbm = auc(fpr_gbm, tpr_gbm)
plot_roc_curve(fpr_gbm, tpr_gbm, roc_auc_gbm, 'GBM')

# 3. Random Forest (abbreviation: RF)
rf_params = {
    'random_state': 42,
    'n_estimators': 50,
    'max_depth': 5,
    'min_samples_split': 10,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_res, Y_train_res)

# Positive-class probabilities, then hard labels using the 0.8 cut-off
Y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]
Y_pred_rf = (Y_pred_proba_rf > 0.8).astype(int)
calculate_metrics(Y_test, Y_pred_rf, 'RF')

# ROC curve and AUC from the raw probabilities
fpr_rf, tpr_rf, _ = roc_curve(Y_test, Y_pred_proba_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)
plot_roc_curve(fpr_rf, tpr_rf, roc_auc_rf, 'RF')

# 4. Logistic Regression (abbreviation: LR)
# L2-penalised logistic regression; ``fit`` returns the fitted estimator.
lr_model = LogisticRegression(
    random_state=42,
    penalty='l2',
    C=0.1,
).fit(X_train_res, Y_train_res)

# Positive-class probabilities, then hard labels using the 0.5 cut-off
Y_pred_proba_lr = lr_model.predict_proba(X_test)[:, 1]
Y_pred_lr = (Y_pred_proba_lr > 0.5).astype(int)
calculate_metrics(Y_test, Y_pred_lr, 'LR')

# ROC curve and AUC from the raw probabilities
fpr_lr, tpr_lr, _ = roc_curve(Y_test, Y_pred_proba_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)
plot_roc_curve(fpr_lr, tpr_lr, roc_auc_lr, 'LR')

# 5. Lasso Logistic Regression (abbreviation: Lasso LR)
# L1-penalised logistic regression fitted with the liblinear solver.
lasso_lr_params = {
    'penalty': 'l1',
    'solver': 'liblinear',
    'C': 0.1,
    'random_state': 42,
}
lasso_lr_model = LogisticRegression(**lasso_lr_params)
lasso_lr_model.fit(X_train_res, Y_train_res)

# Positive-class probabilities, then hard labels using the 0.5 cut-off
Y_pred_proba_lasso_lr = lasso_lr_model.predict_proba(X_test)[:, 1]
Y_pred_lasso_lr = (Y_pred_proba_lasso_lr > 0.5).astype(int)
calculate_metrics(Y_test, Y_pred_lasso_lr, 'Lasso LR')

# ROC curve and AUC from the raw probabilities
fpr_lasso_lr, tpr_lasso_lr, _ = roc_curve(Y_test, Y_pred_proba_lasso_lr)
roc_auc_lasso_lr = auc(fpr_lasso_lr, tpr_lasso_lr)
plot_roc_curve(fpr_lasso_lr, tpr_lasso_lr, roc_auc_lasso_lr, 'Lasso LR')

# 6. Convolutional Neural Network (abbreviation: CNN)
# Seed TensorFlow before building the network. Every other model in this
# script is pinned with random_state=42; without a seed the CNN's metrics and
# ROC curve change from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.random.set_seed(42)

# Reshape data for CNN (assuming 1D convolution): each sample becomes a
# sequence of length n_features with a single channel.
X_train_cnn = X_train_res.reshape((X_train_res.shape[0], X_train_res.shape[1], 1))
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

cnn_model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_res.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single sigmoid unit -> positive-class probability
])
cnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_cnn, Y_train_res, epochs=10, batch_size=32, verbose=0)

# predict() returns one column per sample; flatten it to a 1-D probability vector
Y_pred_proba_cnn = cnn_model.predict(X_test_cnn).flatten()
Y_pred_cnn = (Y_pred_proba_cnn > 0.5).astype(int)
calculate_metrics(Y_test, Y_pred_cnn, 'CNN')

# ROC curve and AUC from the raw probabilities
fpr_cnn, tpr_cnn, _ = roc_curve(Y_test, Y_pred_proba_cnn)
roc_auc_cnn = auc(fpr_cnn, tpr_cnn)
plot_roc_curve(fpr_cnn, tpr_cnn, roc_auc_cnn, 'CNN')

# 7. Deep Neural Network (abbreviation: DNN)
# Seed TensorFlow before building the network. Every other model in this
# script is pinned with random_state=42; without a seed the DNN's metrics and
# ROC curve change from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.random.set_seed(42)

dnn_model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_res.shape[1],), kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single sigmoid unit -> positive-class probability
])
dnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
dnn_model.fit(X_train_res, Y_train_res, epochs=10, batch_size=32, verbose=0)

# predict() returns one column per sample; flatten it to a 1-D probability vector
Y_pred_proba_dnn = dnn_model.predict(X_test).flatten()
Y_pred_dnn = (Y_pred_proba_dnn > 0.5).astype(int)
calculate_metrics(Y_test, Y_pred_dnn, 'DNN')

# ROC curve and AUC from the raw probabilities
fpr_dnn, tpr_dnn, _ = roc_curve(Y_test, Y_pred_proba_dnn)
roc_auc_dnn = auc(fpr_dnn, tpr_dnn)
plot_roc_curve(fpr_dnn, tpr_dnn, roc_auc_dnn, 'DNN')

# Plot the ROC curves
# Work on the current axes (the ones the per-model curves were drawn on).
roc_axes = plt.gca()

# Diagonal reference line for a classifier that guesses at random
roc_axes.plot([0, 1], [0, 1], 'k--', label='Random Guess')

roc_axes.set_xlabel('False Positive Rate')
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_title('ROC Curve Comparison')
roc_axes.legend(loc='lower right')

plt.show()

# Print metrics for each model: a header line, then one indented line per
# score formatted to four decimals.
for model in metrics:
    metric = metrics[model]
    print(f"{model} Metrics:")
    for key in metric:
        value = metric[key]
        print(f"  {key}: {value:.4f}")
