In [None]:
!pip install optuna
!pip install boruta

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import ADASYN
from boruta import BorutaPy
import optuna
import joblib

# Read data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Split data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)

# Apply ADASYN
adasyn = ADASYN(random_state=0)
X_resampled, Y_resampled = adasyn.fit_resample(X, Y)

# Apply Boruta for feature selection
rf = RandomForestClassifier(n_estimators=100, max_depth=5, class_weight='balanced', random_state=0)
boruta_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=0)
boruta_selector.fit(X_resampled.values, Y_resampled.values)

# Get selected features
selected_features = X_resampled.columns[boruta_selector.support_]

# Use only the selected features
X_resampled_selected = X_resampled[selected_features]
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled_selected, Y_resampled, test_size=0.2, random_state=0)

X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Define the objective function for StackingClassifier
def stacking_objective(trial):
    # Define hyperparameters for RandomForestClassifier
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 10, 500)
    rf_max_depth = trial.suggest_int('rf_max_depth', 1, 50)

    # Define hyperparameters for GradientBoostingClassifier
    gb_n_estimators = trial.suggest_int('gb_n_estimators', 10, 500)
    gb_learning_rate = trial.suggest_float('gb_learning_rate', 0.001, 0.1)
    gb_max_depth = trial.suggest_int('gb_max_depth', 1, 50)

    # Create the base classifiers with suggested hyperparameters
    rf_classifier = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, random_state=0)
    gb_classifier = GradientBoostingClassifier(n_estimators=gb_n_estimators, learning_rate=gb_learning_rate, max_depth=gb_max_depth, random_state=0)

    # Create the stacking classifier with the base classifiers
    stacking_classifier = StackingClassifier(
        estimators=[('random_forest', rf_classifier), ('gradient_boosting', gb_classifier)],
        final_estimator=RandomForestClassifier(random_state=0)  # You can choose a different final estimator if needed
    )

    # Define cross-validation strategy (StratifiedKFold for classification)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Perform cross-validation and get mean AUC
    auc_scores = cross_val_score(stacking_classifier, X_train, Y_train, cv=cv, scoring='roc_auc')
    mean_auc = auc_scores.mean()

    return mean_auc

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(stacking_objective, n_trials=5)

# Get the best hyperparameters
best_params = study.best_params
print(f'Best Hyperparameters for StackingClassifier: {best_params}')

# Create the stacking classifier with the best hyperparameters
rf_classifier = RandomForestClassifier(n_estimators=best_params['rf_n_estimators'], max_depth=best_params['rf_max_depth'], random_state=0)
gb_classifier = GradientBoostingClassifier(n_estimators=best_params['gb_n_estimators'], learning_rate=best_params['gb_learning_rate'], max_depth=best_params['gb_max_depth'], random_state=0)

stacking_classifier = StackingClassifier(
    estimators=[('random_forest', rf_classifier), ('gradient_boosting', gb_classifier)],
    final_estimator=RandomForestClassifier(random_state=0)  # You can choose a different final estimator if needed
)

# Train the stacking classifier
stacking_classifier.fit(X_train, Y_train)

# Predictions on training set
y_pred_train = stacking_classifier.predict(X_train)

# Calculate AUC for training set
auc_train = roc_auc_score(Y_train, y_pred_train)
print(f'AUC for Training Set: {auc_train:.4f}')

# Predictions on testing set
y_pred_test = stacking_classifier.predict(X_test)

# Calculate AUC for testing set
auc_test = roc_auc_score(Y_test, y_pred_test)
print(f'AUC for Testing Set: {auc_test:.4f}')

# Save the trained model to a file
joblib.dump(stacking_classifier, 'stacking_model.joblib')

# Load model and generate probability CSV
loaded_model = joblib.load('stacking_model.joblib')
prob = loaded_model.predict_proba(X)[:, 1]
prob_df = pd.DataFrame(prob, columns=['Probability'])
prob_df.index = prob_df.index + 1
prob_df.index.name = 'Id'
prob_df.to_csv('intento.csv', index=True)


Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0


[I 2023-12-05 17:04:45,056] A new study created in memory with name: no-name-7023aa3a-6466-4f37-a06d-084f1336f2fa


Iteration: 	8 / 100
Confirmed: 	40
Tentative: 	0
Rejected: 	0


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	40
Tentative: 	0
Rejected: 	0


[I 2023-12-05 17:12:15,026] Trial 0 finished with value: 0.9916897051832597 and parameters: {'rf_n_estimators': 29, 'rf_max_depth': 26, 'gb_n_estimators': 438, 'gb_learning_rate': 0.09517065145887017, 'gb_max_depth': 37}. Best is trial 0 with value: 0.9916897051832597.
[I 2023-12-05 17:22:58,409] Trial 1 finished with value: 0.9830172480861148 and parameters: {'rf_n_estimators': 473, 'rf_max_depth': 9, 'gb_n_estimators': 409, 'gb_learning_rate': 0.08252905954878653, 'gb_max_depth': 49}. Best is trial 0 with value: 0.9916897051832597.
[I 2023-12-05 17:38:07,251] Trial 2 finished with value: 0.9899224584397247 and parameters: {'rf_n_estimators': 492, 'rf_max_depth': 19, 'gb_n_estimators': 317, 'gb_learning_rate': 0.030938403652109026, 'gb_max_depth': 48}. Best is trial 0 with value: 0.9916897051832597.
[I 2023-12-05 17:54:11,379] Trial 3 finished with value: 0.9940575579280294 and parameters: {'rf_n_estimators': 454, 'rf_max_depth': 30, 'gb_n_estimators': 457, 'gb_learning_rate': 0.05030

Best Hyperparameters for StackingClassifier: {'rf_n_estimators': 454, 'rf_max_depth': 30, 'gb_n_estimators': 457, 'gb_learning_rate': 0.050300271823392376, 'gb_max_depth': 25}
AUC for Training Set: 1.0000
AUC for Testing Set: 0.9698
