In [None]:
!pip install optuna

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import joblib

# Read data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Prepare training data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Prepare testing data
X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Define the objective function for BaggingClassifier with GradientBoostingClassifier
def bagging_gb_objective(trial):
    # Define hyperparameters for GradientBoostingClassifier
    gb_n_estimators = trial.suggest_int('gb_n_estimators', 50, 200)
    gb_max_depth = trial.suggest_int('gb_max_depth', 1, 10)

    # Define hyperparameters for BaggingClassifier
    bagging_n_estimators = trial.suggest_int('bagging_n_estimators', 10, 100)
    bagging_max_samples = trial.suggest_float('bagging_max_samples', 0.1, 1.0)

    # Create the base classifier with suggested hyperparameters
    base_classifier = GradientBoostingClassifier(
        n_estimators=gb_n_estimators,
        max_depth=gb_max_depth,
        random_state=0
    )

    # Create the bagging classifier with the base classifier
    bagging_classifier = BaggingClassifier(
        base_classifier,
        n_estimators=bagging_n_estimators,
        max_samples=bagging_max_samples,
        random_state=0
    )

    # Define cross-validation strategy (StratifiedKFold for classification)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Perform cross-validation and get mean AUC
    auc_scores = cross_val_score(bagging_classifier, X_train, Y_train, cv=cv, scoring='roc_auc')
    mean_auc = auc_scores.mean()

    return mean_auc

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(bagging_gb_objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f'Best Hyperparameters for BaggingClassifier + GradientBoostingClassifier: {best_params}')

# Create the base classifier with the best hyperparameters for GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier(
    n_estimators=best_params['gb_n_estimators'],
    max_depth=best_params['gb_max_depth'],
    random_state=0
)

# Create the bagging classifier with the best hyperparameters
bagging_classifier = BaggingClassifier(
    gb_classifier,
    n_estimators=best_params['bagging_n_estimators'],
    max_samples=best_params['bagging_max_samples'],
    random_state=0
)

# Train the bagging classifier
bagging_classifier.fit(X_train, Y_train)

# Predictions on training set
y_pred_train = bagging_classifier.predict(X_train)

# Calculate AUC for training set
auc_train = roc_auc_score(Y_train, y_pred_train)
print(f'AUC for Training Set: {auc_train:.4f}')

# Predictions on testing set
y_pred_test = bagging_classifier.predict(X_test)

# Calculate AUC for testing set
auc_test = roc_auc_score(Y_test, y_pred_test)
print(f'AUC for Testing Set: {auc_test:.4f}')

# Save the trained model to a file
joblib.dump(bagging_classifier, 'bagging_gb_model.joblib')

# Load model and generate probability CSV
loaded_model = joblib.load('bagging_gb_model.joblib')
prob = loaded_model.predict_proba(X)[:, 1]
prob_df = pd.DataFrame(prob, columns=['Probability'])
prob_df.index = prob_df.index + 1
prob_df.index.name = 'Id'
prob_df.to_csv('intento.csv', index=True)


[I 2023-12-05 15:39:25,390] A new study created in memory with name: no-name-90610055-4ea8-4ca4-8451-ee20ff08798c
[I 2023-12-05 15:44:43,424] Trial 0 finished with value: 0.7902285902503294 and parameters: {'gb_n_estimators': 171, 'gb_max_depth': 9, 'bagging_n_estimators': 17, 'bagging_max_samples': 0.6682231118470534}. Best is trial 0 with value: 0.7902285902503294.
[I 2023-12-05 15:46:08,914] Trial 1 finished with value: 0.8338201581027669 and parameters: {'gb_n_estimators': 187, 'gb_max_depth': 3, 'bagging_n_estimators': 12, 'bagging_max_samples': 0.4989312535225958}. Best is trial 1 with value: 0.8338201581027669.
[I 2023-12-05 15:52:07,993] Trial 2 finished with value: 0.8273776899429073 and parameters: {'gb_n_estimators': 185, 'gb_max_depth': 4, 'bagging_n_estimators': 35, 'bagging_max_samples': 0.608598170164457}. Best is trial 1 with value: 0.8338201581027669.
[I 2023-12-05 16:13:09,231] Trial 3 finished with value: 0.8287110232762405 and parameters: {'gb_n_estimators': 141, 'g

Best Hyperparameters for BaggingClassifier + GradientBoostingClassifier: {'gb_n_estimators': 62, 'gb_max_depth': 1, 'bagging_n_estimators': 43, 'bagging_max_samples': 0.2447437189300218}
AUC for Training Set: 0.6176
AUC for Testing Set: 0.5440
