# TP Integrador
## Alumnos: Ezequiel Scordamaglia y Santiago González Achaval
### Optimización de Hiperparámetros


In [26]:
import pickle as pkl
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import optuna
import plotly

In [27]:
# Cargar datos de train preprocesados
data = pd.read_csv('../data/Transformed/Train_BigMart_Prepared.csv')

# Separar features y target
X = data.drop(columns=['Item_Outlet_Sales'])
y = data['Item_Outlet_Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

### Optimización de Hiperparámetros para modelo LinearRegression

In [28]:
# Definir la función de objetivo para la optimización de hiperparámetros
def objective_LR(trial):
    """
    Objective function for optimizing hyperparameters of a LinearRegression using Optuna.
    
    Args:
        trial: A `Trial` object from Optuna that contains the state of the optimization trial.
    
    Returns:
        The accuracy score of the LinearRegression using the hyperparameters suggested by Optuna.
    """
    # Definir los hiperparámetros a optimizar para LinearRegression
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    positive = trial.suggest_categorical('positive', [True, False])

    # Crear el clasificador con los hiperparámetros sugeridos por Optuna
    clf =LinearRegression(fit_intercept=fit_intercept, positive=positive)
      
    # Entrenar el clasificador y calcular la precisión en el conjunto de prueba
    clf.fit(X_train, y_train)
    #y_pred = clf.predict(X_test)
    #score = accuracy_score(y_test, y_pred)

    score = cross_val_score(clf, X_train, y_train, cv=3)
    accuracy = score.mean()

    return accuracy

In [29]:
# Crear el estudio de Optuna y ejecutar la optimización

#sampler = optuna.samplers.RandomSampler(seed=42)
sampler_LR = optuna.samplers.TPESampler(seed=42)
study_LR = optuna.create_study(direction='maximize', sampler=sampler_LR)
study_LR.optimize(objective_LR, n_trials=50)

# Imprimir los resultados de la optimización
print('Best trial: score {}, params {}'.format(study_LR.best_trial.value, study_LR.best_trial.params))

[32m[I 2023-08-10 17:43:11,612][0m A new study created in memory with name: no-name-14994750-2557-45c2-93be-d6b3349a3a86[0m
[32m[I 2023-08-10 17:43:11,637][0m Trial 0 finished with value: 0.45212916165169154 and parameters: {'fit_intercept': False, 'positive': True}. Best is trial 0 with value: 0.45212916165169154.[0m
[32m[I 2023-08-10 17:43:11,661][0m Trial 1 finished with value: 0.5327662634306406 and parameters: {'fit_intercept': True, 'positive': False}. Best is trial 1 with value: 0.5327662634306406.[0m
[32m[I 2023-08-10 17:43:11,684][0m Trial 2 finished with value: 0.5327662634306407 and parameters: {'fit_intercept': False, 'positive': False}. Best is trial 2 with value: 0.5327662634306407.[0m
[32m[I 2023-08-10 17:43:11,707][0m Trial 3 finished with value: 0.5327662634306406 and parameters: {'fit_intercept': True, 'positive': False}. Best is trial 2 with value: 0.5327662634306407.[0m
[32m[I 2023-08-10 17:43:11,729][0m Trial 4 finished with value: 0.45212916165169

Best trial: score 0.5331943476168227, params {'fit_intercept': True, 'positive': True}


### Los mejores Hiperparámetros y el mejor accuracy son:

In [30]:
# Print best params
print('Best params: %s' % study_LR.best_trial.params)
# Print best score
print('Best score: %s' % study_LR.best_value)


Best params: {'fit_intercept': True, 'positive': True}
Best score: 0.5331943476168227


In [31]:
optuna.visualization.plot_contour(study_LR, params=['fit_intercept', 'positive'])

In [32]:
optuna.visualization.plot_parallel_coordinate(study_LR)

In [33]:
optuna.visualization.plot_param_importances(study_LR)

### Optimizacion de Hiperparámetros para un modelo RandomForestRegressor

In [34]:
# Definir la función de objetivo para la optimización de hiperparámetros
def objective_RF(trial):
    """
    Objective function for optimizing hyperparameters of a RandomForestRegressor using Optuna.
    
    Args:
        trial: A `Trial` object from Optuna that contains the state of the optimization trial.
    
    Returns:
        The accuracy score of the RandomForestRegressor using the hyperparameters suggested by Optuna.
    """
    # Definir los hiperparámetros a optimizar para RandomForestRegressor
    n_estimators = trial.suggest_int('n_estimators', 2, 200)
    max_depth = int(trial.suggest_float('max_depth', 1, 32, log=True))
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    
    # Crear el clasificador con los hiperparámetros sugeridos por Optuna
    clf =RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap)
          
    # Entrenar el clasificador y calcular la precisión en el conjunto de prueba
    clf.fit(X_train, y_train)
    #y_pred = clf.predict(X_test)
    #score = accuracy_score(y_test, y_pred)

    score = cross_val_score(clf, X_train, y_train, cv=3)
    accuracy = score.mean()

    return accuracy

In [35]:
# Crear el estudio de Optuna y ejecutar la optimización

#sampler = optuna.samplers.RandomSampler(seed=42)
sampler_RF = optuna.samplers.TPESampler(seed=42)
study_RF = optuna.create_study(direction='maximize', sampler=sampler_RF)
study_RF.optimize(objective_RF, n_trials=50)

# Imprimir los resultados de la optimización
print('Best trial: score {}, params {}'.format(study_RF.best_trial.value, study_RF.best_trial.params))

[32m[I 2023-08-10 17:43:12,967][0m A new study created in memory with name: no-name-64b5d2ad-8495-40dd-bf27-2ae5ca6fc4cc[0m
[32m[I 2023-08-10 17:43:14,574][0m Trial 0 finished with value: 0.5544400496294821 and parameters: {'n_estimators': 76, 'max_depth': 26.975382882791823, 'min_samples_split': 15, 'min_samples_leaf': 12, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.5544400496294821.[0m
[32m[I 2023-08-10 17:43:16,746][0m Trial 1 finished with value: 0.5639245212755201 and parameters: {'n_estimators': 121, 'max_depth': 11.634706311890067, 'min_samples_split': 2, 'min_samples_leaf': 20, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with value: 0.5639245212755201.[0m
[32m[I 2023-08-10 17:43:17,572][0m Trial 2 finished with value: 0.5572217800478478 and parameters: {'n_estimators': 62, 'max_depth': 6.163638109085588, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with val

Best trial: score 0.566699079728008, params {'n_estimators': 178, 'max_depth': 8.356399173885055, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': False}


### Los mejores Hiperparámetros y el mejor accuracy son:

In [36]:
# Print best params
print('Best params: %s' % study_RF.best_trial.params)
# Print best score
print('Best score: %s' % study_RF.best_value)


Best params: {'n_estimators': 178, 'max_depth': 8.356399173885055, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': False}
Best score: 0.566699079728008


In [37]:
optuna.visualization.plot_contour(study_RF, params=['n_estimators', 'max_depth'])

In [38]:
optuna.visualization.plot_parallel_coordinate(study_RF)

In [39]:
optuna.visualization.plot_param_importances(study_RF)

## Podemos concluir que el modelo RandomForestRegressor logra un mejor Accuracy que el modelo LinearRegression luego de la optimización de hiperparámetros