In [3]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.0-py3-none-any.whl (230 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.0 colorlog-6.8.0 optuna-3.4.0


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
import joblib

In [1]:
# Read data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Prepare training data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Prepare testing data
X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Best hyperparameters for RandomForestClassifier
rf_best_params = {'n_estimators': 138, 'max_depth': 3}

# Best hyperparameters for GradientBoostingClassifier
gb_best_params = {'n_estimators': 178, 'learning_rate': 0.016627816667251865, 'max_depth': 4}

# Create the base classifiers
rf_classifier = RandomForestClassifier(**rf_best_params, random_state=0)
gb_classifier = GradientBoostingClassifier(**gb_best_params, random_state=0)

# Create the stacking classifier with the base classifiers
stacking_classifier = StackingClassifier(
    estimators=[('random_forest', rf_classifier), ('gradient_boosting', gb_classifier)],
    final_estimator=RandomForestClassifier(random_state=0)  # You can choose a different final estimator if needed
)

# Train the stacking classifier
stacking_classifier.fit(X_train, Y_train)

# Predictions on training set
y_pred_train = stacking_classifier.predict(X_train)

# Calculate AUC for training set
auc_train = roc_auc_score(Y_train, y_pred_train)
print(f'AUC for Training Set: {auc_train:.4f}')

# Predictions on testing set
y_pred_test = stacking_classifier.predict(X_test)

# Calculate AUC for testing set
auc_test = roc_auc_score(Y_test, y_pred_test)
print(f'AUC for Testing Set: {auc_test:.4f}')

# Save the trained model to a file
joblib.dump(stacking_classifier, 'stacking_model.joblib')

# Load model and generate probability CSV
loaded_model = joblib.load('stacking_model.joblib')
prob = loaded_model.predict_proba(X)[:, 1]
prob_df = pd.DataFrame(prob, columns=['Probability'])
prob_df.index = prob_df.index + 1
prob_df.index.name = 'Id'
prob_df.to_csv('intento.csv', index=True)


AUC for Training Set: 0.6073
AUC for Testing Set: 0.5866


In [6]:
# Read data
data = pd.read_excel('TrainClass.xlsx')
real_data = pd.read_excel('TestClass.xlsx')

# Prepare training data
Y, X = data['FRACASO'], data.drop(['FRACASO', 'CODIGO_EMPRESA'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Prepare testing data
X = real_data.drop(['CODIGO_EMPRESA'], axis=1)

# Define the objective function for StackingClassifier
def stacking_objective(trial):
    # Define hyperparameters for RandomForestClassifier
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 200)
    rf_max_depth = trial.suggest_int('rf_max_depth', 1, 10)

    # Define hyperparameters for GradientBoostingClassifier
    gb_n_estimators = trial.suggest_int('gb_n_estimators', 50, 200)
    gb_learning_rate = trial.suggest_float('gb_learning_rate', 0.001, 0.1)
    gb_max_depth = trial.suggest_int('gb_max_depth', 1, 10)

    # Create the base classifiers with suggested hyperparameters
    rf_classifier = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, random_state=0)
    gb_classifier = GradientBoostingClassifier(n_estimators=gb_n_estimators, learning_rate=gb_learning_rate, max_depth=gb_max_depth, random_state=0)

    # Create the stacking classifier with the base classifiers
    stacking_classifier = StackingClassifier(
        estimators=[('random_forest', rf_classifier), ('gradient_boosting', gb_classifier)],
        final_estimator=RandomForestClassifier(random_state=0)  # You can choose a different final estimator if needed
    )

    # Define cross-validation strategy (StratifiedKFold for classification)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Perform cross-validation and get mean AUC
    auc_scores = cross_val_score(stacking_classifier, X_train, Y_train, cv=cv, scoring='roc_auc')
    mean_auc = auc_scores.mean()

    return mean_auc

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(stacking_objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f'Best Hyperparameters for StackingClassifier: {best_params}')

# Create the stacking classifier with the best hyperparameters
rf_classifier = RandomForestClassifier(n_estimators=best_params['rf_n_estimators'], max_depth=best_params['rf_max_depth'], random_state=0)
gb_classifier = GradientBoostingClassifier(n_estimators=best_params['gb_n_estimators'], learning_rate=best_params['gb_learning_rate'], max_depth=best_params['gb_max_depth'], random_state=0)

stacking_classifier = StackingClassifier(
    estimators=[('random_forest', rf_classifier), ('gradient_boosting', gb_classifier)],
    final_estimator=RandomForestClassifier(random_state=0)  # You can choose a different final estimator if needed
)

# Train the stacking classifier
stacking_classifier.fit(X_train, Y_train)

# Predictions on training set
y_pred_train = stacking_classifier.predict(X_train)

# Calculate AUC for training set
auc_train = roc_auc_score(Y_train, y_pred_train)
print(f'AUC for Training Set: {auc_train:.4f}')

# Predictions on testing set
y_pred_test = stacking_classifier.predict(X_test)

# Calculate AUC for testing set
auc_test = roc_auc_score(Y_test, y_pred_test)
print(f'AUC for Testing Set: {auc_test:.4f}')

# Save the trained model to a file
joblib.dump(stacking_classifier, 'stacking_model.joblib')

# Load model and generate probability CSV
loaded_model = joblib.load('stacking_model.joblib')
prob = loaded_model.predict_proba(X)[:, 1]
prob_df = pd.DataFrame(prob, columns=['Probability'])
prob_df.index = prob_df.index + 1
prob_df.index.name = 'Id'
prob_df.to_csv('intento.csv', index=True)

[I 2023-12-05 15:31:39,359] A new study created in memory with name: no-name-f6994b8a-8d23-4ce0-b9db-4604d4f14bf2
[I 2023-12-05 15:33:32,625] Trial 0 finished with value: 0.7897013614404917 and parameters: {'rf_n_estimators': 106, 'rf_max_depth': 4, 'gb_n_estimators': 165, 'gb_learning_rate': 0.08799250022732304, 'gb_max_depth': 4}. Best is trial 0 with value: 0.7897013614404917.
[I 2023-12-05 15:34:50,408] Trial 1 finished with value: 0.7450110891523936 and parameters: {'rf_n_estimators': 160, 'rf_max_depth': 5, 'gb_n_estimators': 185, 'gb_learning_rate': 0.03956330584826514, 'gb_max_depth': 2}. Best is trial 0 with value: 0.7897013614404917.
[I 2023-12-05 15:37:06,089] Trial 2 finished with value: 0.7733988801054019 and parameters: {'rf_n_estimators': 164, 'rf_max_depth': 3, 'gb_n_estimators': 141, 'gb_learning_rate': 0.0421620043113306, 'gb_max_depth': 6}. Best is trial 0 with value: 0.7897013614404917.
[I 2023-12-05 15:38:18,896] Trial 3 finished with value: 0.7332183794466403 and 

Best Hyperparameters for StackingClassifier: {'rf_n_estimators': 117, 'rf_max_depth': 7, 'gb_n_estimators': 153, 'gb_learning_rate': 0.0855036483139636, 'gb_max_depth': 9}
AUC for Training Set: 0.6610
AUC for Testing Set: 0.5397


In [1]:
# Save the trained model to a file
joblib.dump(stacking_classifier, 'stacking_model.joblib')

# Load model and generate probability CSV
loaded_model = joblib.load('stacking_model.joblib')
prob = loaded_model.predict_proba(X)[:, 1]
prob_df = pd.DataFrame(prob, columns=['Probability'])
prob_df.index = prob_df.index + 1
prob_df.index.name = 'Id'
prob_df.to_csv('intento.csv', index=True)

NameError: ignored