In [1]:
# Package imports
import os
import sys
import pandas as pd
import numpy as np
import optuna
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Local imports
# Resolve the project root as the parent of the current working directory.
# NOTE(review): this assumes the kernel runs from a direct child folder of the
# project root (e.g. a notebooks directory) — confirm.
root_path = os.path.dirname(os.getcwd())
src_path = os.path.join(root_path, "src")
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)
from feature_engineering import FeatureEngineeringPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Run feature engineering pipeline.
# Both CSV paths live under <root>/data; presumably the first is the raw input
# and the second is where the transformed dataset is written — verify against
# FeatureEngineeringPipeline.
data_dir = os.path.join(root_path, "data")
fe_pipeline = FeatureEngineeringPipeline(
    os.path.join(data_dir, "Train_BigMart.csv"),
    os.path.join(data_dir, "transformed_dataset.csv"),
)
fe_pipeline.run()

2023-07-26 18:39:31 - DEBUG - FeatureEngineering - data was read succesfully.
2023-07-26 18:39:31 - DEBUG - FeatureEngineering - data was read succesfully.
2023-07-26 18:39:31 - DEBUG - FeatureEngineering - establishment years were corrected.
2023-07-26 18:39:31 - DEBUG - FeatureEngineering - establishment years were corrected.
2023-07-26 18:39:31 - DEBUG - FeatureEngineering - unique labels were created for Item_Fat_Content
2023-07-26 18:39:31 - DEBUG - FeatureEngineering - unique labels were created for Item_Fat_Content
2023-07-26 18:39:33 - DEBUG - FeatureEngineering - missing Item_Weight was cleaned.
2023-07-26 18:39:33 - DEBUG - FeatureEngineering - missing Item_Weight was cleaned.
2023-07-26 18:39:33 - DEBUG - FeatureEngineering - missing Outlet_Size was cleaned.
2023-07-26 18:39:33 - DEBUG - FeatureEngineering - missing Outlet_Size was cleaned.
2023-07-26 18:39:33 - DEBUG - FeatureEngineering - none category was added to Item_Fat_Content.
2023-07-26 18:39:33 - DEBUG - FeatureEng

In [5]:
# Import transformed data; the first CSV column is used as the DataFrame index
transformed_csv = os.path.join(root_path, "data", "transformed_dataset.csv")
data = pd.read_csv(transformed_csv, index_col=0)

In [8]:
# Train-validation split: pull the target column out of the feature matrix,
# then hold out 20% of the rows for validation with a fixed random_state
y = data["Item_Outlet_Sales"]
X = data.drop(columns=["Item_Outlet_Sales"])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=28,
)

# Define the functions for optimizing HPs with optuna
def objective(trial):
    """Optuna objective: validation MSE of a LinearRegression configuration.

    Draws one value per hyperparameter from ``trial``, fits the model on the
    notebook-level ``X_train`` / ``y_train`` and scores it on ``X_val`` /
    ``y_val``.

    NOTE(review): ``copy_X`` and ``n_jobs`` look like memory/parallelism
    settings rather than settings that change the fitted model, so searching
    over them probably only spends trials — confirm against the scikit-learn
    documentation and consider dropping them from the search space.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on the validation split (lower is better).
    """
    # Sample the candidate configuration (same names and order as recorded in the study)
    hyperparams = {
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
        'positive': trial.suggest_categorical('positive', [True, False]),
        'copy_X': trial.suggest_categorical('copy_X', [True, False]),
        'n_jobs': trial.suggest_int('n_jobs', 1, 4),
    }

    # Fit on the training split with the sampled configuration
    regressor = LinearRegression(**hyperparams)
    regressor.fit(X_train, y_train)

    # Score on the validation split; this value is what the study minimizes
    val_predictions = regressor.predict(X_val)
    return mean_squared_error(y_val, val_predictions)

def optimize_linear_regression(n_trials=100, seed=None):
    """Run an Optuna study over ``objective`` and report the best trial.

    Args:
        n_trials: Number of trials to run. Defaults to 100, the value that was
            previously hard-coded.
        seed: Optional seed for the Optuna sampler. When given, the study uses
            a ``TPESampler`` seeded with this value so the sequence of
            suggested hyperparameters can be reproduced. ``None`` (default)
            keeps Optuna's default, unseeded sampler.

    Returns:
        tuple: ``(best_params, best_score)`` — the hyperparameter dict of the
        best trial and its objective value (validation MSE).
    """
    # Only override the sampler when a seed is requested, so the default call
    # behaves exactly as before.
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None

    # Create a study object and optimize the objective function
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    # Best hyperparameters and the objective value they achieved
    best_params = study.best_params
    best_score = study.best_value

    return best_params, best_score

In [9]:
# Fix the random seed of NumPy's global RNG.
# NOTE(review): Optuna samplers presumably draw from their own RNG (configured
# through the sampler's `seed` argument), in which case this call alone does
# not make the search below reproducible — verify.
seed = 28
np.random.seed(seed)

# Run optimization; returns the best hyperparameters and their validation MSE
best_params, best_score = optimize_linear_regression()

[I 2023-07-26 18:40:19,244] A new study created in memory with name: no-name-1984c86b-2ba2-427f-b750-24e900ed1d76
[I 2023-07-26 18:40:19,266] Trial 0 finished with value: 1278104.0105896648 and parameters: {'fit_intercept': False, 'positive': False, 'copy_X': True, 'n_jobs': 1}. Best is trial 0 with value: 1278104.0105896648.
[I 2023-07-26 18:40:19,286] Trial 1 finished with value: 1508584.7015556984 and parameters: {'fit_intercept': False, 'positive': True, 'copy_X': True, 'n_jobs': 4}. Best is trial 0 with value: 1278104.0105896648.
[I 2023-07-26 18:40:19,304] Trial 2 finished with value: 1278104.0105896648 and parameters: {'fit_intercept': False, 'positive': False, 'copy_X': True, 'n_jobs': 4}. Best is trial 0 with value: 1278104.0105896648.
[I 2023-07-26 18:40:19,319] Trial 3 finished with value: 1278104.0105896648 and parameters: {'fit_intercept': False, 'positive': False, 'copy_X': True, 'n_jobs': 1}. Best is trial 0 with value: 1278104.0105896648.
[I 2023-07-26 18:40:19,323] Tri

In [10]:
# Report the winning configuration and its validation MSE.
# The two pieces are printed with print's default space separator, so the
# first line ends with a trailing space before the newline.
summary_parts = (
    f"Best params for Linear Regression: {best_params}",
    f"\nMSE = {best_score}",
)
print(*summary_parts)

Best params for Linear Regression: {'fit_intercept': False, 'positive': False, 'copy_X': True, 'n_jobs': 1} 
MSE = 1278104.0105896648
