In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.0-py3-none-any.whl (230 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.0 colorlog-6.8.0 optuna-3.4.0


In [2]:
# Import necessary libraries and functions
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import joblib

# Read data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Prepare training data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)

# Undersample the majority class
rus = RandomUnderSampler(random_state=0)
X_resampled, Y_resampled = rus.fit_resample(X, Y)

X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=0)

# Prepare testing data
X_real = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Define the objective function for RandomForestClassifier without Boruta feature selection
def random_forest_objective(trial):
    # Define hyperparameters for RandomForestClassifier
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 10, 1000)
    rf_max_depth = trial.suggest_int('rf_max_depth', 1, 100)

    # Create the Random Forest classifier with suggested hyperparameters
    rf_classifier = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, random_state=0)

    # Define cross-validation strategy (StratifiedKFold for classification)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Perform cross-validation and get mean AUC
    auc_scores = cross_val_score(rf_classifier, X_train, Y_train, cv=cv, scoring='roc_auc')
    mean_auc = auc_scores.mean()

    return mean_auc

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(random_forest_objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f'Best Hyperparameters for RandomForestClassifier: {best_params}')

# Create the Random Forest classifier with the best hyperparameters
rf_classifier = RandomForestClassifier(n_estimators=best_params['rf_n_estimators'], max_depth=best_params['rf_max_depth'], random_state=0)

# Train the Random Forest classifier
rf_classifier.fit(X_train, Y_train)

# Predictions on testing set
y_pred_test = rf_classifier.predict(X_test)

# Calculate AUC for testing set
auc_test = roc_auc_score(Y_test, y_pred_test)
print(f'AUC for Testing Set: {auc_test:.4f}')

# Save the trained model to a file
joblib.dump(rf_classifier, 'random_forest_model.joblib')

# Load model and generate probability CSV
loaded_model = joblib.load('random_forest_model.joblib')
prob = loaded_model.predict_proba(X_real)[:, 1]
prob_df = pd.DataFrame(prob, columns=['Probability'])
prob_df.index = prob_df.index + 1
prob_df.index.name = 'Id'
prob_df.to_csv('intento.csv', index=True)


[I 2023-12-05 23:37:24,758] A new study created in memory with name: no-name-916c7920-4f00-4528-aae1-5e3ba5f198f0
[I 2023-12-05 23:37:31,614] Trial 0 finished with value: 0.7933884297520661 and parameters: {'rf_n_estimators': 497, 'rf_max_depth': 35}. Best is trial 0 with value: 0.7933884297520661.
[I 2023-12-05 23:37:38,738] Trial 1 finished with value: 0.784159779614325 and parameters: {'rf_n_estimators': 984, 'rf_max_depth': 88}. Best is trial 0 with value: 0.7933884297520661.
[I 2023-12-05 23:37:43,207] Trial 2 finished with value: 0.7899449035812672 and parameters: {'rf_n_estimators': 689, 'rf_max_depth': 76}. Best is trial 0 with value: 0.7933884297520661.
[I 2023-12-05 23:37:47,815] Trial 3 finished with value: 0.784297520661157 and parameters: {'rf_n_estimators': 861, 'rf_max_depth': 16}. Best is trial 0 with value: 0.7933884297520661.
[I 2023-12-05 23:37:52,355] Trial 4 finished with value: 0.7851239669421488 and parameters: {'rf_n_estimators': 834, 'rf_max_depth': 53}. Best i

Best Hyperparameters for RandomForestClassifier: {'rf_n_estimators': 509, 'rf_max_depth': 53}
AUC for Testing Set: 0.7143
