In [None]:
# Add directory above current directory to path
import sys; sys.path.insert(0, '..')

from functools import partial
from pathlib import Path
from pprint import pprint

import optuna
import pandas as pd
from IPython.display import display

from config.pooling_features import *
from data_preparation.io_utils import read_yaml_file
from dataset.dataset_builder import DatasetBuilder
from model.model_builder import ModelBuilder
from model.model_tuning import OptunaOptimizer
from pipeline.trainer import Trainer

# Location of the project YAML configuration; a relative path, so it is resolved
# against the kernel's working directory.
CONFIG_PATH = Path('../config/config.yaml')

In [None]:
# Recursive glob handed to the dataset builder as `pattern`; it selects the
# pooling CSV files. (The `conv_` prefix is kept because later cells use this name.)
conv_pattern = "**/pooling.csv"

# Data configuration: read the YAML config and keep its 'data' section at hand
config = read_yaml_file(CONFIG_PATH)
data_config = config['data']
print(data_config)

In [None]:
# Builder restricted to the pooling feature set
dataset_builder = DatasetBuilder(features=POOLING_FEATURES)

# Inputs for the dataset: where the CSVs live and which models the config
# lists under "test_models"
training_data_dir = Path('../training_data')
test_models = data_config["test_models"]

# The resulting object exposes `.train` and `.test` splits used by later cells
pooling_dataset = dataset_builder.create_dataset(
    data_dir=training_data_dir,
    test_models=test_models,
    pattern=conv_pattern,
)

In [None]:
# Report how many samples ended up in each split
n_train_samples = len(pooling_dataset.train.input_features)
n_test_samples = len(pooling_dataset.test.input_features)
print(f"Number of training samples: {n_train_samples}")
print(f"Number of testing samples: {n_test_samples}")

## Baseline Models


As baselines for the power and runtime models, we predict the mean of the training dataset's target for every test sample.


### Baseline Result

Test dataset RMSPE Power: 148.92%

Test dataset RMSPE Runtime: 119.05%

In [None]:
# The trainer is used in the following cells for its `eval_metrics` helper
trainer = Trainer(
    data_config=data_config,
    model_config=config['model'],
    features=POOLING_FEATURES,
)

In [None]:
# Power baseline: the constant prediction is the mean power of the training split,
# scored against the test split's power values
power_train = pooling_dataset.train.power
mean_power = power_train.mean()
target = pooling_dataset.test.power

In [None]:
# One copy of the training mean per test sample, then evaluate
baseline_power_pred = [mean_power] * len(target)
pprint(trainer.eval_metrics(actual=target, pred=baseline_power_pred))

In [None]:
# Runtime baseline: same recipe as for power. Note that `target` is rebound here,
# so it now refers to the test split's runtime values.
runtime_train = pooling_dataset.train.runtime
mean_runtime = runtime_train.mean()
target = pooling_dataset.test.runtime

In [None]:
# One copy of the training mean per test sample, then evaluate
baseline_runtime_pred = [mean_runtime] * len(target)
pprint(trainer.eval_metrics(actual=target, pred=baseline_runtime_pred))

## Power

The [Optuna](https://optuna.org/) library is used to perform hyperparameter tuning for the power model.

Best trial configuration

```python
FrozenTrial(number=95, state=1, values=[0.6680917520358511], datetime_start=datetime.datetime(2024, 11, 26, 17, 8, 9, 332736), datetime_complete=datetime.datetime(2024, 11, 26, 17, 8, 9, 523217), params={'degree': 1, 'log_scale': False, 'special_features': True, 'scalers': 'minmax', 'max_iter': 30832, 'n_alphas': 218, 'fit_intercept': True, 'positive': True}, user_attrs={'testing_mean_absolute_error': 1.219418214248221, 'testing_mean_absolute_percentage_error': 0.39163211567035033, 'testing_mean_squared_error': 2.4084354893996065, 'testing_r2_score': 0.6680917520358511, 'testing_root_mean_squared_error': 1.5519134928853497, 'testing_root_mean_squared_percentage_error': 62.51547421471385}, system_attrs={}, intermediate_values={}, distributions={'degree': IntDistribution(high=4, log=False, low=1, step=1), 'log_scale': CategoricalDistribution(choices=(True, False)), 'special_features': CategoricalDistribution(choices=(True, False)), 'scalers': CategoricalDistribution(choices=('minmax', 'standard', 'robust')), 'max_iter': IntDistribution(high=50000, log=True, low=1000, step=1), 'n_alphas': IntDistribution(high=1000, log=True, low=100, step=1), 'fit_intercept': CategoricalDistribution(choices=(True, False)), 'positive': CategoricalDistribution(choices=(True, False))}, trial_id=96, value=None)
```

Test dataset metrics of best trial

```python
{
    'testing_mean_absolute_error': 1.219418214248221, 
    'testing_mean_absolute_percentage_error': 0.39163211567035033, 
    'testing_mean_squared_error': 2.4084354893996065, 
    'testing_r2_score': 0.6680917520358511, 
    'testing_root_mean_squared_error': 1.5519134928853497, 
    'testing_root_mean_squared_percentage_error': 62.51547421471385
}

```


In [None]:
# Extract the `.values` of the input features and the power target for each split
train_split = pooling_dataset.train
X_train = train_split.input_features.values
y_power_train = train_split.power.values
print(f"Training shape: {X_train.shape}, {y_power_train.shape}")

test_split = pooling_dataset.test
X_test = test_split.input_features.values
y_power_test = test_split.power.values
print(f"Testing shape: {X_test.shape}, {y_power_test.shape}")

In [None]:
# Optimizer wired to the power target; `optimizer` is rebound later for runtime,
# so run the cells in order
model_builder = ModelBuilder()
optimizer = OptunaOptimizer(
    X_train=X_train,
    y_train=y_power_train,
    X_test=X_test,
    y_test=y_power_test,
    model_builder=model_builder,
)

In [None]:
%%time

# Ignore ConvergenceWarning from sklearn to avoid tab crash
from warnings import filterwarnings
filterwarnings('ignore')

# Maximise the test R^2 score during tuning
power_study = optuna.create_study(study_name='pooling_power_model_tuning', direction="maximize", storage="sqlite:///pooling_power_model_tuning.db")
# Run study for 50 trials
power_study.optimize(partial(optimizer.objective, 
                             features_mapping=dataset_builder.features_mapping, 
                             special_terms_list=[TOTAL_POOLING_INPUT_FEATURES, TOTAL_POOLING_OUTPUT_FEATURES, TOTAL_POOLING_NO_OPS]), 
               n_trials=100)

In [None]:
# print the best performing pipeline
# Show the best trial of the power study (parameters and recorded attributes)
best_power_trial = power_study.best_trial
pprint(best_power_trial)

In [None]:
from optuna.visualization import plot_optimization_history

# Objective value per trial for the power study; the bare name on the last line
# makes the notebook render the figure
power_history_fig = plot_optimization_history(power_study)
power_history_fig

## Runtime

The [Optuna](https://optuna.org/) library is used to perform hyperparameter tuning for the runtime model.

Best trial configuration

```python
FrozenTrial(number=66, state=1, values=[0.9985231887973071], datetime_start=datetime.datetime(2024, 11, 26, 17, 11, 12, 676146), datetime_complete=datetime.datetime(2024, 11, 26, 17, 11, 19, 457365), params={'degree': 4, 'log_scale': False, 'special_features': False, 'scalers': 'minmax', 'max_iter': 36553, 'n_alphas': 441, 'fit_intercept': True, 'positive': False}, user_attrs={'testing_mean_absolute_error': 0.001255648137913211, 'testing_mean_absolute_percentage_error': 0.06084164858645989, 'testing_mean_squared_error': 2.4765212416187023e-06, 'testing_r2_score': 0.9985231887973071, 'testing_root_mean_squared_error': 0.0015736966803099961, 'testing_root_mean_squared_percentage_error': 8.505752819296054}, system_attrs={}, intermediate_values={}, distributions={'degree': IntDistribution(high=4, log=False, low=1, step=1), 'log_scale': CategoricalDistribution(choices=(True, False)), 'special_features': CategoricalDistribution(choices=(True, False)), 'scalers': CategoricalDistribution(choices=('minmax', 'standard', 'robust')), 'max_iter': IntDistribution(high=50000, log=True, low=1000, step=1), 'n_alphas': IntDistribution(high=1000, log=True, low=100, step=1), 'fit_intercept': CategoricalDistribution(choices=(True, False)), 'positive': CategoricalDistribution(choices=(True, False))}, trial_id=67, value=None)
```

Test dataset metrics of best trial

```python
{
    'testing_mean_absolute_error': 0.001255648137913211, 
    'testing_mean_absolute_percentage_error': 0.06084164858645989, 
    'testing_mean_squared_error': 2.4765212416187023e-06, 
    'testing_r2_score': 0.9985231887973071, 
    'testing_root_mean_squared_error': 0.0015736966803099961, 
    'testing_root_mean_squared_percentage_error': 8.505752819296054
}
```

In [None]:
# Extract the `.values` of the input features and the runtime target for each split
train_split = pooling_dataset.train
X_train = train_split.input_features.values
y_runtime_train = train_split.runtime.values
print(f"Training shape: {X_train.shape}, {y_runtime_train.shape}")

test_split = pooling_dataset.test
X_test = test_split.input_features.values
y_runtime_test = test_split.runtime.values
print(f"Testing shape: {X_test.shape}, {y_runtime_test.shape}")

In [None]:
# Rebind `model_builder` and `optimizer` for the runtime target; from here on
# `optimizer.objective` scores runtime predictions
model_builder = ModelBuilder()
optimizer = OptunaOptimizer(
    X_train=X_train,
    y_train=y_runtime_train,
    X_test=X_test,
    y_test=y_runtime_test,
    model_builder=model_builder,
)

In [None]:
%%time

# Ignore ConvergenceWarning from sklearn to avoid tab crash
from warnings import filterwarnings
filterwarnings('ignore')

# Maximise the test R^2 score during tuning
runtime_study = optuna.create_study(study_name='pooling_runtime_model_tuning', direction="maximize", storage="sqlite:///pooling_runtime_model_tuning.db")
# Run study for 100 trials
runtime_study.optimize(partial(optimizer.objective, 
                               features_mapping=dataset_builder.features_mapping, 
                               special_terms_list=[TOTAL_POOLING_INPUT_FEATURES, TOTAL_POOLING_OUTPUT_FEATURES, TOTAL_POOLING_NO_OPS]), 
               n_trials=100)

In [None]:
# print the best performing pipeline
# Show the best trial of the runtime study (parameters and recorded attributes)
best_runtime_trial = runtime_study.best_trial
pprint(best_runtime_trial)

In [None]:
from optuna.visualization import plot_optimization_history

# Objective value per trial for the runtime study; the bare name on the last line
# makes the notebook render the figure
runtime_history_fig = plot_optimization_history(runtime_study)
runtime_history_fig