In [None]:
# Add directory above current directory to path so that the project packages
# imported below (config, data_preparation, dataset, model, pipeline) resolve
import sys; sys.path.insert(0, '..')

from functools import partial
from pathlib import Path
from pprint import pprint

import optuna
import pandas as pd
from IPython.display import display

# NOTE(review): wildcard import — DENSE_FEATURES used further down presumably
# comes from this module; consider importing it by name once confirmed.
from config.dense_features import *
from data_preparation.io_utils import read_yaml_file
from dataset.dataset_builder import DatasetBuilder
from model.model_builder import ModelBuilder
from model.model_tuning import OptunaOptimizer
from pipeline.trainer import Trainer

# YAML configuration file; the relative path is resolved against the current
# working directory of the kernel
CONFIG_PATH = Path('../config/config.yaml')

In [None]:
# File pattern handed to the dataset builder in the next cell
conv_pattern = "**/dense.csv"

# Data Configuration: read the YAML file and show its `data` section
config = read_yaml_file(CONFIG_PATH)
data_config = config['data']
print(data_config)

In [None]:
# Build the dataset for the dense features; the resulting object exposes the
# `.train` and `.test` splits used throughout the rest of the notebook
dataset_builder = DatasetBuilder(features=DENSE_FEATURES)
dense_dataset = dataset_builder.create_dataset(
    data_dir=Path('../training_data'),
    test_models=data_config["test_models"],
    pattern=conv_pattern,
)

In [None]:
# Report how many samples ended up in each split (one line per split)
print(
    f"Number of training samples: {len(dense_dataset.train.input_features)}",
    f"Number of testing samples: {len(dense_dataset.test.input_features)}",
    sep="\n",
)

## Baseline Models


For both the power and the runtime model, the baseline predicts the mean of the training-set target for every test sample.


### Baseline Result

Test dataset RMSPE Power : 243.72%

Test dataset RMSPE Runtime: 416.88%

In [None]:
# Trainer instance; the baseline cells below call its eval_metrics helper
trainer = Trainer(
    data_config=data_config,
    model_config=config['model'],
    features=DENSE_FEATURES,
)

In [None]:
# Power baseline inputs: ground truth from the test split, and the constant
# prediction taken as the mean of the training split
target = dense_dataset.test.power
mean_power = dense_dataset.train.power.mean()

In [None]:
# Evaluate the constant mean-power baseline on the test split.
# The ground truth is read explicitly here instead of through the shared
# `target` name: `target` is rebound to the runtime series a few cells later,
# so relying on it would silently score against the wrong target whenever the
# cells are executed out of order.
power_actual = dense_dataset.test.power
pprint(
    trainer.eval_metrics(
        actual=power_actual,
        pred=[mean_power] * len(power_actual),
    )
)

In [None]:
# Runtime baseline inputs: ground truth from the test split, and the constant
# prediction taken as the mean of the training split
target = dense_dataset.test.runtime
mean_runtime = dense_dataset.train.runtime.mean()

In [None]:
# Evaluate the constant mean-runtime baseline on the test split.
# The ground truth is read explicitly here instead of through the shared
# `target` name: `target` is also bound to the power series in an earlier
# cell, so relying on it would silently score against the wrong target
# whenever the cells are executed out of order.
runtime_actual = dense_dataset.test.runtime
pprint(
    trainer.eval_metrics(
        actual=runtime_actual,
        pred=[mean_runtime] * len(runtime_actual),
    )
)

## Power

The [Optuna](https://optuna.org/) library is used to perform hyperparameter tuning for the power model.

Best trial configuration

```python
FrozenTrial(number=83, state=1, values=[0.3461064156797421], datetime_start=datetime.datetime(2024, 11, 26, 16, 57, 32, 119055), datetime_complete=datetime.datetime(2024, 11, 26, 16, 57, 32, 304749), params={'degree': 4, 'log_scale': True, 'special_features': True, 'scalers': 'robust', 'max_iter': 1616, 'n_alphas': 102, 'fit_intercept': False, 'positive': False}, user_attrs={'testing_mean_absolute_error': 2.1073726925781435, 'testing_mean_absolute_percentage_error': 0.5881811267685447, 'testing_mean_squared_error': 6.104086079316093, 'testing_r2_score': 0.3461064156797421, 'testing_root_mean_squared_error': 2.4706448711452023, 'testing_root_mean_squared_percentage_error': 67.98575856021878}, system_attrs={}, intermediate_values={}, distributions={'degree': IntDistribution(high=4, log=False, low=1, step=1), 'log_scale': CategoricalDistribution(choices=(True, False)), 'special_features': CategoricalDistribution(choices=(True, False)), 'scalers': CategoricalDistribution(choices=('minmax', 'standard', 'robust')), 'max_iter': IntDistribution(high=50000, log=True, low=1000, step=1), 'n_alphas': IntDistribution(high=1000, log=True, low=100, step=1), 'fit_intercept': CategoricalDistribution(choices=(True, False)), 'positive': CategoricalDistribution(choices=(True, False))}, trial_id=84, value=None)
```

Test dataset metrics for the best trial

```python
{   
    'testing_mean_absolute_error': 2.1073726925781435, 
    'testing_mean_absolute_percentage_error': 0.5881811267685447, 
    'testing_mean_squared_error': 6.104086079316093, 
    'testing_r2_score': 0.3461064156797421, 
    'testing_root_mean_squared_error': 2.4706448711452023, 
    'testing_root_mean_squared_percentage_error': 67.98575856021878
}
```



In [None]:
# Pull the underlying `.values` of the features and of the power target for
# both splits; these are what the optimizer below is fitted and scored on
X_train = dense_dataset.train.input_features.values
X_test = dense_dataset.test.input_features.values
y_power_train = dense_dataset.train.power.values
y_power_test = dense_dataset.test.power.values

# Sanity check: feature and target shapes per split
print(f"Training shape: {X_train.shape}, {y_power_train.shape}")
print(f"Testing shape: {X_test.shape}, {y_power_test.shape}")

In [None]:
# Optimizer wired to the power target; its `objective` is what the Optuna
# study in the next cell evaluates for each trial
model_builder = ModelBuilder()
optimizer = OptunaOptimizer(
    X_train=X_train,
    y_train=y_power_train,
    X_test=X_test,
    y_test=y_power_test,
    model_builder=model_builder,
)

In [None]:
%%time

# Takes about 40 secs to complete

# Ignore ConvergenceWarning from sklearn to avoid tab crash
from warnings import filterwarnings
filterwarnings('ignore')

# Maximize test R^2 score during tuning
power_study = optuna.create_study(study_name='dense_power_model_tuning', direction="maximize", storage="sqlite:///dense_power_model_tuning.db")
# Run study for 100 trials
power_study.optimize(partial(optimizer.objective, 
                             features_mapping=dataset_builder.features_mapping,
                             special_terms_list=None),
               n_trials=100)

In [None]:
# Print the best trial of the power study, i.e. the best performing pipeline
# (the study direction is "maximize", so this is the highest objective value)
pprint(power_study.best_trial)

In [None]:
# Optuna's built-in optimization-history plot for the power study; the figure
# is rendered because the call is the last expression of the cell
from optuna.visualization import plot_optimization_history

plot_optimization_history(power_study)

## Runtime

The [Optuna](https://optuna.org/) library is used to perform hyperparameter tuning for the runtime model.

Best trial

```python
FrozenTrial(number=67, state=1, values=[-0.0995709243010976], datetime_start=datetime.datetime(2024, 11, 26, 17, 2, 18, 862242), datetime_complete=datetime.datetime(2024, 11, 26, 17, 2, 30, 551254), params={'degree': 4, 'log_scale': False, 'special_features': False, 'scalers': 'standard', 'max_iter': 46296, 'n_alphas': 112, 'fit_intercept': True, 'positive': False}, user_attrs={'testing_mean_absolute_error': 0.608014977251175, 'testing_mean_absolute_percentage_error': 7.304681270592265, 'testing_mean_squared_error': 1.4327889539950647, 'testing_r2_score': -0.0995709243010976, 'testing_root_mean_squared_error': 1.1969916265350666, 'testing_root_mean_squared_percentage_error': 998.2098464331851}, system_attrs={}, intermediate_values={}, distributions={'degree': IntDistribution(high=4, log=False, low=1, step=1), 'log_scale': CategoricalDistribution(choices=(True, False)), 'special_features': CategoricalDistribution(choices=(True, False)), 'scalers': CategoricalDistribution(choices=('minmax', 'standard', 'robust')), 'max_iter': IntDistribution(high=50000, log=True, low=1000, step=1), 'n_alphas': IntDistribution(high=1000, log=True, low=100, step=1), 'fit_intercept': CategoricalDistribution(choices=(True, False)), 'positive': CategoricalDistribution(choices=(True, False))}, trial_id=68, value=None)
```

Test dataset metrics for best trial

```python
{   
    'testing_mean_absolute_error': 0.608014977251175, 
    'testing_mean_absolute_percentage_error': 7.304681270592265, 
    'testing_mean_squared_error': 1.4327889539950647, 
    'testing_r2_score': -0.0995709243010976, 
    'testing_root_mean_squared_error': 1.1969916265350666, 
    'testing_root_mean_squared_percentage_error': 998.2098464331851
}
```

In [None]:
# Pull the underlying `.values` of the features and of the runtime target for
# both splits; these are what the optimizer below is fitted and scored on
X_train = dense_dataset.train.input_features.values
X_test = dense_dataset.test.input_features.values
y_runtime_train = dense_dataset.train.runtime.values
y_runtime_test = dense_dataset.test.runtime.values

# Sanity check: feature and target shapes per split
print(f"Training shape: {X_train.shape}, {y_runtime_train.shape}")
print(f"Testing shape: {X_test.shape}, {y_runtime_test.shape}")

In [None]:
# Optimizer wired to the runtime target; note that this rebinds the
# `model_builder` and `optimizer` names used earlier for the power model
model_builder = ModelBuilder()
optimizer = OptunaOptimizer(
    X_train=X_train,
    y_train=y_runtime_train,
    X_test=X_test,
    y_test=y_runtime_test,
    model_builder=model_builder,
)

In [None]:
%%time

# Takes about 10 min secs to complete

# Ignore ConvergenceWarning from sklearn to avoid tab crash
from warnings import filterwarnings
filterwarnings('ignore')

# Maximize the test R^2 score during tuning
runtime_study = optuna.create_study(study_name='dense_runtime_model_tuning', direction="maximize", storage="sqlite:///dense_runtime_model_tuning.db")
# Run study for 100 trials
runtime_study.optimize(partial(optimizer.objective, 
                             features_mapping=dataset_builder.features_mapping,
                             special_terms_list=None),
               n_trials=200)

In [None]:
# Print the best trial of the runtime study, i.e. the best performing pipeline
# (the study direction is "maximize", so this is the highest objective value)
pprint(runtime_study.best_trial)

In [None]:
# Optuna's built-in optimization-history plot for the runtime study; the
# figure is rendered because the call is the last expression of the cell
from optuna.visualization import plot_optimization_history

plot_optimization_history(runtime_study)