In [1]:
!pip install imbalanced-learn boruta
!pip install optuna

Collecting boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: boruta
Successfully installed boruta-0.3
Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.0-py3-none-any.whl (230 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing co

In [4]:
# Import necessary libraries and functions
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from boruta import BorutaPy
import joblib

# Read data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Prepare training data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)

# Undersample the majority class
rus = RandomUnderSampler(random_state=0)
X_resampled, Y_resampled = rus.fit_resample(X, Y)

X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=0)

# Prepare testing data
X_real = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Define the objective function for RandomForestClassifier with Boruta feature selection
def random_forest_objective(trial):
    # Define hyperparameters for RandomForestClassifier
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 10, 1000)
    rf_max_depth = trial.suggest_int('rf_max_depth', 1, 100)

    # Create the Random Forest classifier with suggested hyperparameters
    rf_classifier = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, random_state=0)

    # Feature selection using Boruta
    boruta_selector = BorutaPy(rf_classifier, n_estimators='auto', verbose=0, random_state=0)
    boruta_selector.fit(X_train.values, Y_train.values)

    # Get selected features
    selected_features = X_train.columns[boruta_selector.support_]

    # Check if any features are selected
    if len(selected_features) == 0:
        print("No features selected. Skipping further processing.")
        return 0

    # Use only selected features for training and testing
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    X_real_selected = X_real[selected_features]

    # Define cross-validation strategy (StratifiedKFold for classification)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Perform cross-validation and get mean AUC
    auc_scores = cross_val_score(rf_classifier, X_train_selected, Y_train, cv=cv, scoring='roc_auc')
    mean_auc = auc_scores.mean()

    return mean_auc

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(random_forest_objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f'Best Hyperparameters for RandomForestClassifier: {best_params}')

# Create the Random Forest classifier with the best hyperparameters
rf_classifier = RandomForestClassifier(n_estimators=best_params['rf_n_estimators'], max_depth=best_params['rf_max_depth'], random_state=0)

# Feature selection using Boruta with the best hyperparameters
boruta_selector = BorutaPy(rf_classifier, n_estimators='auto', verbose=2, random_state=0)
boruta_selector.fit(X_train.values, Y_train.values)

# Get selected features
selected_features = X_train.columns[boruta_selector.support_]

# Check if any features are selected
if len(selected_features) == 0:
    print("No features selected. Skipping further processing.")
else:
    # Use only selected features for training and testing
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    X_real_selected = X_real[selected_features]

    # Train the Random Forest classifier
    rf_classifier.fit(X_train_selected, Y_train)

    # Predictions on testing set
    y_pred_test = rf_classifier.predict(X_test_selected)

    # Calculate AUC for testing set
    auc_test = roc_auc_score(Y_test, y_pred_test)
    print(f'AUC for Testing Set: {auc_test:.4f}')

    # Save the trained model to a file
    joblib.dump(rf_classifier, 'random_forest_model.joblib')

    # Load model and generate probability CSV
    loaded_model = joblib.load('random_forest_model.joblib')
    prob = loaded_model.predict_proba(X_real_selected)[:, 1]
    prob_df = pd.DataFrame(prob, columns=['Probability'])
    prob_df.index = prob_df.index + 1
    prob_df.index.name = 'Id'
    prob_df.to_csv('intento.csv', index=True)


[I 2023-12-05 23:36:11,975] A new study created in memory with name: no-name-60af104b-8d36-41fa-99cb-91dd0be42011
[I 2023-12-05 23:36:13,284] Trial 0 finished with value: 0.0 and parameters: {'rf_n_estimators': 874, 'rf_max_depth': 66}. Best is trial 0 with value: 0.0.


No features selected. Skipping further processing.


[I 2023-12-05 23:37:08,889] Trial 1 finished with value: 0.7820936639118456 and parameters: {'rf_n_estimators': 975, 'rf_max_depth': 2}. Best is trial 1 with value: 0.7820936639118456.
[I 2023-12-05 23:37:10,269] Trial 2 finished with value: 0.0 and parameters: {'rf_n_estimators': 569, 'rf_max_depth': 59}. Best is trial 1 with value: 0.7820936639118456.


No features selected. Skipping further processing.


[I 2023-12-05 23:37:12,049] Trial 3 finished with value: 0.609504132231405 and parameters: {'rf_n_estimators': 428, 'rf_max_depth': 46}. Best is trial 1 with value: 0.7820936639118456.
[I 2023-12-05 23:37:13,090] Trial 4 finished with value: 0.0 and parameters: {'rf_n_estimators': 516, 'rf_max_depth': 79}. Best is trial 1 with value: 0.7820936639118456.


No features selected. Skipping further processing.


[I 2023-12-05 23:37:14,939] Trial 5 finished with value: 0.6149449035812673 and parameters: {'rf_n_estimators': 308, 'rf_max_depth': 47}. Best is trial 1 with value: 0.7820936639118456.
[I 2023-12-05 23:37:20,511] Trial 6 finished with value: 0.6910468319559229 and parameters: {'rf_n_estimators': 281, 'rf_max_depth': 19}. Best is trial 1 with value: 0.7820936639118456.
[I 2023-12-05 23:37:21,994] Trial 7 finished with value: 0.0 and parameters: {'rf_n_estimators': 985, 'rf_max_depth': 83}. Best is trial 1 with value: 0.7820936639118456.


No features selected. Skipping further processing.


[I 2023-12-05 23:37:23,022] Trial 8 finished with value: 0.0 and parameters: {'rf_n_estimators': 803, 'rf_max_depth': 81}. Best is trial 1 with value: 0.7820936639118456.


No features selected. Skipping further processing.


[I 2023-12-05 23:37:24,746] Trial 9 finished with value: 0.6390495867768595 and parameters: {'rf_n_estimators': 166, 'rf_max_depth': 49}. Best is trial 1 with value: 0.7820936639118456.
[I 2023-12-05 23:38:20,508] Trial 10 finished with value: 0.7820936639118456 and parameters: {'rf_n_estimators': 14, 'rf_max_depth': 2}. Best is trial 1 with value: 0.7820936639118456.
[I 2023-12-05 23:40:07,563] Trial 11 finished with value: 0.7822314049586778 and parameters: {'rf_n_estimators': 50, 'rf_max_depth': 1}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:41:55,981] Trial 12 finished with value: 0.7822314049586778 and parameters: {'rf_n_estimators': 687, 'rf_max_depth': 1}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:41:59,782] Trial 13 finished with value: 0.7048209366391185 and parameters: {'rf_n_estimators': 662, 'rf_max_depth': 23}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:42:05,029] Trial 14 finished with value: 0.70482093

No features selected. Skipping further processing.


[I 2023-12-05 23:42:13,233] Trial 16 finished with value: 0.7229338842975206 and parameters: {'rf_n_estimators': 388, 'rf_max_depth': 13}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:42:15,697] Trial 17 finished with value: 0.6730716253443527 and parameters: {'rf_n_estimators': 667, 'rf_max_depth': 37}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:42:19,787] Trial 18 finished with value: 0.6812672176308541 and parameters: {'rf_n_estimators': 194, 'rf_max_depth': 33}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:42:27,576] Trial 19 finished with value: 0.7809917355371901 and parameters: {'rf_n_estimators': 783, 'rf_max_depth': 11}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:42:31,050] Trial 20 finished with value: 0.6537878787878787 and parameters: {'rf_n_estimators': 591, 'rf_max_depth': 30}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:43:26,308] Trial 21 finished with value: 0.

No features selected. Skipping further processing.


[I 2023-12-05 23:49:34,786] Trial 31 finished with value: 0.7822314049586778 and parameters: {'rf_n_estimators': 89, 'rf_max_depth': 1}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:49:46,041] Trial 32 finished with value: 0.7400826446280993 and parameters: {'rf_n_estimators': 85, 'rf_max_depth': 9}. Best is trial 11 with value: 0.7822314049586778.
[I 2023-12-05 23:50:03,123] Trial 33 finished with value: 0.790633608815427 and parameters: {'rf_n_estimators': 727, 'rf_max_depth': 6}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:50:18,446] Trial 34 finished with value: 0.7746556473829201 and parameters: {'rf_n_estimators': 737, 'rf_max_depth': 7}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:50:23,917] Trial 35 finished with value: 0.7058539944903581 and parameters: {'rf_n_estimators': 492, 'rf_max_depth': 21}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:50:38,757] Trial 36 finished with value: 0.7746556473

No features selected. Skipping further processing.


[I 2023-12-05 23:55:56,727] Trial 49 finished with value: 0.7058539944903581 and parameters: {'rf_n_estimators': 625, 'rf_max_depth': 21}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:55:57,683] Trial 50 finished with value: 0.0 and parameters: {'rf_n_estimators': 11, 'rf_max_depth': 92}. Best is trial 33 with value: 0.790633608815427.


No features selected. Skipping further processing.


[I 2023-12-05 23:57:46,554] Trial 51 finished with value: 0.7822314049586778 and parameters: {'rf_n_estimators': 89, 'rf_max_depth': 1}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:58:01,478] Trial 52 finished with value: 0.7746556473829201 and parameters: {'rf_n_estimators': 102, 'rf_max_depth': 7}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:58:28,242] Trial 53 finished with value: 0.7730027548209366 and parameters: {'rf_n_estimators': 50, 'rf_max_depth': 4}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:58:36,197] Trial 54 finished with value: 0.7229338842975206 and parameters: {'rf_n_estimators': 139, 'rf_max_depth': 13}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:58:57,992] Trial 55 finished with value: 0.772038567493113 and parameters: {'rf_n_estimators': 263, 'rf_max_depth': 5}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-05 23:58:59,908] Trial 56 finished with value: 0.607438016528

No features selected. Skipping further processing.


[I 2023-12-06 00:09:50,722] Trial 76 finished with value: 0.7731404958677686 and parameters: {'rf_n_estimators': 117, 'rf_max_depth': 3}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-06 00:10:05,598] Trial 77 finished with value: 0.7746556473829201 and parameters: {'rf_n_estimators': 869, 'rf_max_depth': 7}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-06 00:10:16,179] Trial 78 finished with value: 0.771694214876033 and parameters: {'rf_n_estimators': 391, 'rf_max_depth': 10}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-06 00:10:33,285] Trial 79 finished with value: 0.790633608815427 and parameters: {'rf_n_estimators': 39, 'rf_max_depth': 6}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-06 00:10:38,997] Trial 80 finished with value: 0.6744490358126721 and parameters: {'rf_n_estimators': 46, 'rf_max_depth': 18}. Best is trial 33 with value: 0.790633608815427.
[I 2023-12-06 00:10:56,760] Trial 81 finished with value: 0.790633608815

Best Hyperparameters for RandomForestClassifier: {'rf_n_estimators': 727, 'rf_max_depth': 6}
Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	40
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	20
Rejected: 	20
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	20
Rejected: 	20
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	20
Rejected: 	20
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	20
Rejected: 	20
Iteration: 	12 / 100
Confirmed: 	4
Tentative: 	16
Rejected: 	20
Iteration: 	13 / 100
Confirmed: 	4
Tentative: 	15
Rejected: 	21
Iteration: 	14 / 100
Confirmed: 	4
Tentative: 	15
Rejected: 	21
Iteration: 	15 / 100
Confir