# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Read the raw records from the Excel workbook
data = pd.read_excel('hechuan ube pre 3.xlsx')

# Features: everything except the 'ID', 'los' and 'elos' columns
X = data.drop(columns=['ID', 'los', 'elos'])

# Target: 1 where 'elos' is strictly above its median, 0 otherwise
Y = data['elos']
Y = Y.gt(Y.median()).astype(int)

# Data augmentation: oversample with SMOTE
# NOTE(review): X_res / X_res_scaled are resampled and scaled using ALL rows;
# evaluating on splits of these arrays leaks information between train and test.
smote = SMOTE(random_state=42)
X_res, Y_res = smote.fit_resample(X, Y)

# Feature standardization (fit and apply on the resampled features)
scaler = StandardScaler()
X_res_scaled = scaler.fit(X_res).transform(X_res)

# Registry of classical estimators, keyed by the short name used in reports.
# Insertion order is the order in which the models are later trained/printed.
# NOTE(review): several settings (e.g. C=0.00001, 5 trees of depth 1) are
# extremely restrictive -- confirm they are intentional.
models = {}

models['XGB'] = XGBClassifier(
    n_estimators=10,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.5,
    colsample_bytree=0.5,
    reg_alpha=1.0,
    reg_lambda=1.0,
    random_state=42,
)

models['GBM'] = GradientBoostingClassifier(
    n_estimators=5,
    learning_rate=0.001,
    max_depth=1,
    random_state=42,
)

models['RF'] = RandomForestClassifier(
    n_estimators=5,
    max_depth=1,
    max_features='sqrt',
    random_state=42,
)

# L2-penalised logistic regression
models['LR'] = LogisticRegression(
    penalty='l2',
    C=0.00001,
    random_state=42,
)

# L1-penalised logistic regression (liblinear solver)
models['Lasso LR'] = LogisticRegression(
    penalty='l1',
    C=0.1,
    solver='liblinear',
    random_state=42,
)

# Define CNN and DNN models
def create_cnn_model(n_features=None):
    """Build and compile a small 1-D convolutional binary classifier.

    Args:
        n_features: Number of input features, i.e. the length of the 1-D
            sequence fed to the convolution. When omitted, the column count
            of the module-level ``X_res`` is used, so existing zero-argument
            calls behave exactly as before.

    Returns:
        A freshly initialised, compiled Keras ``Sequential`` model declared
        with ``input_shape=(n_features, 1)`` and a single sigmoid output.
    """
    if n_features is None:
        # Backward-compatible default: size the network from the global data.
        n_features = X_res.shape[1]

    model = Sequential([
        # Each sample is treated as a length-n_features sequence with 1 channel.
        Conv1D(filters=16, kernel_size=3, activation='relu', input_shape=(n_features, 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_dnn_model(n_features=None):
    """Build and compile a one-hidden-layer dense binary classifier.

    Args:
        n_features: Number of input features per sample. When omitted, the
            column count of the module-level ``X_res`` is used, so existing
            zero-argument calls behave exactly as before.

    Returns:
        A freshly initialised, compiled Keras ``Sequential`` model declared
        with ``input_shape=(n_features,)`` and a single sigmoid output.
    """
    if n_features is None:
        # Backward-compatible default: size the network from the global data.
        n_features = X_res.shape[1]

    model = Sequential([
        Dense(16, activation='relu', input_shape=(n_features,), kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Register the two neural networks alongside the classical estimators
# (CNN first, then DNN, so the reporting order is unchanged).
for net_name, build_net in (('CNN', create_cnn_model), ('DNN', create_dnn_model)):
    models[net_name] = build_net()

# Perform 10-fold stratified cross-validation.
#
# The folds are drawn from the ORIGINAL samples (X, Y). SMOTE and the scaler
# are fitted inside each fold on the training portion only, so no synthetic
# sample derived from a test row -- and no test-set statistic -- can reach the
# model being evaluated. Resampling/scaling before splitting would leak.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# metrics[model_name][metric_name] -> list with one score per fold
metrics = {
    model_name: {
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1 Score': [],
        'AUC': []
    } for model_name in models
}

# Keras `fit` continues from the current weights, so a network reused across
# folds would already have trained on the rows it is later tested on.
# Rebuild the networks from these factories at the start of every fold.
keras_builders = {'CNN': create_cnn_model, 'DNN': create_dnn_model}

for train_index, test_index in skf.split(X, Y):
    # Positional indexing: X is a DataFrame and Y a Series.
    X_train_raw, X_test_raw = X.iloc[train_index], X.iloc[test_index]
    Y_train_raw, Y_test_fold = Y.iloc[train_index], Y.iloc[test_index]

    # Oversample the training portion only; the test fold keeps real samples.
    X_train_bal, Y_train_fold = SMOTE(random_state=42).fit_resample(X_train_raw, Y_train_raw)

    # Standardise with statistics learned from the training portion only.
    fold_scaler = StandardScaler()
    X_train_fold = fold_scaler.fit_transform(X_train_bal)
    X_test_fold = fold_scaler.transform(X_test_raw)

    # Train and score every model on this fold
    for model_name in list(models):
        if model_name in keras_builders:
            # Fresh, untrained network for this fold; keep the registry
            # pointing at the most recently fitted instance.
            model = keras_builders[model_name]()
            models[model_name] = model

            if model_name == 'CNN':
                # Conv1D is declared with input_shape (n_features, 1):
                # add the trailing channel axis.
                X_train_net = X_train_fold.reshape((X_train_fold.shape[0], X_train_fold.shape[1], 1))
                X_test_net = X_test_fold.reshape((X_test_fold.shape[0], X_test_fold.shape[1], 1))
            else:
                # The dense network is declared with input_shape (n_features,):
                # feed the 2-D matrix as-is.
                X_train_net, X_test_net = X_train_fold, X_test_fold

            model.fit(X_train_net, Y_train_fold, epochs=5, batch_size=16, verbose=0)
            Y_pred = model.predict(X_test_net).flatten()
            # Sigmoid output -> hard label at the 0.5 threshold
            Y_pred_binary = (Y_pred > 0.5).astype(int)
        else:
            model = models[model_name]
            model.fit(X_train_fold, Y_train_fold)
            # Probability of the positive class, used for the ROC curve
            Y_pred = model.predict_proba(X_test_fold)[:, 1]
            Y_pred_binary = model.predict(X_test_fold)

        # Threshold-free AUC from the scores; the rest from the hard labels
        fpr, tpr, _ = roc_curve(Y_test_fold, Y_pred)
        fold_scores = {
            'Accuracy': accuracy_score(Y_test_fold, Y_pred_binary),
            'Precision': precision_score(Y_test_fold, Y_pred_binary),
            'Recall': recall_score(Y_test_fold, Y_pred_binary),
            'F1 Score': f1_score(Y_test_fold, Y_pred_binary),
            'AUC': auc(fpr, tpr),
        }

        # Store metrics
        for metric_name, score in fold_scores.items():
            metrics[model_name][metric_name].append(score)

# Collapse the per-fold score lists into one mean per (model, metric)
average_metrics = {}
for model_name, model_metrics in metrics.items():
    fold_means = {}
    for metric, values in model_metrics.items():
        fold_means[metric] = np.mean(values)
    average_metrics[model_name] = fold_means

# Report the averaged scores, one indented line per metric under each model
for model_name in average_metrics:
    print(f"{model_name} Metrics:")
    for metric, value in average_metrics[model_name].items():
        print(f"  {metric}: {value:.4f}")

# ...

# Plot ROC curves for each model on a stratified hold-out split.
#
# Fitting and scoring on the same rows yields a training-set ROC that says
# nothing about generalisation, so 20% of the ORIGINAL samples are held out.
# SMOTE and the scaler are fitted on the remaining 80% only.
X_fit_raw, X_holdout_raw, Y_fit_raw, Y_holdout = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)
X_fit_bal, Y_fit = SMOTE(random_state=42).fit_resample(X_fit_raw, Y_fit_raw)
roc_scaler = StandardScaler()
X_fit = roc_scaler.fit_transform(X_fit_bal)
X_holdout = roc_scaler.transform(X_holdout_raw)

# Keras `fit` resumes from existing weights; rebuild the networks so they have
# not already trained on the hold-out rows.
roc_builders = {'CNN': create_cnn_model, 'DNN': create_dnn_model}

# Hold-out AUC per model. Kept separate from `metrics` so the per-fold
# cross-validation lists are not polluted with a differently-computed value.
holdout_auc = {}

plt.figure(figsize=(10, 8))
for model_name in list(models):
    if model_name in roc_builders:
        model = roc_builders[model_name]()
        models[model_name] = model  # registry keeps the fitted instance

        if model_name == 'CNN':
            # Conv1D is declared with input_shape (n_features, 1)
            X_fit_net = X_fit.reshape((X_fit.shape[0], X_fit.shape[1], 1))
            X_holdout_net = X_holdout.reshape((X_holdout.shape[0], X_holdout.shape[1], 1))
        else:
            # The dense network is declared with input_shape (n_features,)
            X_fit_net, X_holdout_net = X_fit, X_holdout

        model.fit(X_fit_net, Y_fit, epochs=5, batch_size=16, verbose=0)
        Y_pred = model.predict(X_holdout_net).flatten()
    else:
        model = models[model_name]
        model.fit(X_fit, Y_fit)
        Y_pred = model.predict_proba(X_holdout)[:, 1]

    fpr, tpr, _ = roc_curve(Y_holdout, Y_pred)
    roc_auc = auc(fpr, tpr)
    holdout_auc[model_name] = roc_auc

    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend(loc='lower right')
plt.show()

# Print the hold-out AUC of each model for comparison with the CV averages.
# (Label changed from "Last Iteration AUC": the value never came from a
# cross-validation iteration.)
for model_name, roc_auc in holdout_auc.items():
    print(f"{model_name} Hold-out AUC: {roc_auc:.4f}")

