In [9]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import utils
import numpy as np
import random
import string
from autogluon.common import space


def do_prediction(location, limit, name, percentage, trials):
    """Train an AutoGluon regressor for one location and save its predictions.

    The frames come from ``utils.preprocess_category_estimated_observed``.
    A ``TabularPredictor`` is fitted on the training frame with the
    ``best_quality`` preset, a slice of the tuning frame is used as
    ``tuning_data``, and the predictions for the test frame are written to
    ``<percentage><name>_<location>.csv`` in the working directory.

    Args:
        location: Location identifier; forwarded to the preprocessing helper
            and used in the output file name.
        limit: Value passed to ``TabularPredictor.fit`` as ``time_limit``.
        name: Run label used in the output file name.
        percentage: Size of the tuning window, as a percentage of the tuning
            frame. With ``k = int(len(tuning_data) * percentage / 100)`` the
            window is rows ``[k, 2k)``, i.e. the *second* slice of that size,
            not the first.
        trials: Currently unused. It is kept so existing callers keep
            working; no ``hyperparameter_tune_kwargs`` is passed to ``fit``,
            so no hyperparameter search runs.

    Returns:
        None. The predictions are printed and written to a CSV file.
    """
    x_train, tuning_data, x_test = utils.preprocess_category_estimated_observed(location)

    # Timestamp columns are dropped before the frames are handed to AutoGluon.
    x_train.drop(["time", 'date_forecast'], axis=1, inplace=True)
    tuning_data.drop(["time", 'date_forecast'], axis=1, inplace=True)

    # Keep the forecast timestamps so each prediction can be labelled in the CSV.
    x_test_date_forecast = x_test['date_forecast']
    x_test.drop(['date_forecast'], axis=1, inplace=True)
    x_test.fillna(0, inplace=True)

    label = 'pv_measurement'
    train_data = TabularDataset(x_train)
    test_data = TabularDataset(x_test)

    # Select rows [k, 2k) of the tuning frame, where k is `percentage` percent
    # of its length.
    # NOTE(review): for percentage > 50 the slice is clipped at the end of the
    # frame and is shorter than k rows - confirm this window is intended.
    tuning_fraction = percentage / 100
    tuning_data = TabularDataset(tuning_data)
    window = int(len(tuning_data) * tuning_fraction)
    tuning_data = tuning_data.iloc[window:2 * window]

    # NOTE(review): every location writes its models to the same directory.
    predictor = TabularPredictor(label=label,
                                 path="AutoGluonTesting",
                                 eval_metric='mean_absolute_error')

    # `use_bag_holdout=True` accompanies `tuning_data` because the
    # `best_quality` preset trains bagged models. HPO is not performed unless
    # `hyperparameter_tune_kwargs` is passed to `fit`; it is deliberately not
    # passed here, so `trials` has no effect on training.
    predictor.fit(train_data,
                  time_limit=limit,
                  tuning_data=tuning_data,
                  use_bag_holdout=True,
                  presets=['best_quality'], )

    y_pred = predictor.predict(test_data)
    print(y_pred)

    preds = pd.DataFrame()
    preds['date_forecast'] = x_test_date_forecast
    preds['predicted'] = np.asarray(y_pred)

    # Build the file name once so the log line always reports the path that
    # was actually written.
    output_file = f'{percentage}{name}_{location}.csv'
    preds.to_csv(output_file)
    print('Saved this file: ' + output_file)

    
# Run configuration shared by every location.
time_limit = 3 * 60 * 60  # passed to fit() as time_limit; the log reports 10800s
percentage = 30
trials = 40
name = "tuning_best_quality_with_stack"

print(f'Starting run with percentage tuning= {percentage}')
# Train and predict for each location in turn, with identical settings.
for _location in ('A', 'B', 'C'):
    do_prediction(_location, time_limit, name, percentage, trials)
print(f'Done with run with percentage tuning= {percentage}')


Starting run with percentage tuning= 30
0
2880
Total data points: 34085
Data points to be removed: 0
1
1536
2
1536
3
1536


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 10800s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315464.07 GB / 618408.77 GB (51.0%)
Train Data Rows:    29667
Train Data Columns: 79
Tuning Data Rows:    1325
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 674.14552, 1195.53172)
	If 'regression' is not the correct problem_type, please manually specify the problem_type para

Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 7486.29s of the 7486.25s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-33.9593	 = Validation score   (-mean_absolute_error)
	9.16s	 = Training   runtime
	0.34s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 7474.47s of the 7474.44s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-35.4325	 = Validation score   (-mean_absolute_error)
	8.48s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 7462.99s of the 7462.96s of remaining time.
	-34.5796	 = Validation score   (-mean_absolute_error)
	45.92s	 = Training   runtime
	1.44s	 = Validation runtime
Fitting model: CatBoost_BAG_L2 ... Training model for up to 7414.36s of the 7414.34s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLo

Fitting model: XGBoost_BAG_L2 ... Training model for up to 3618.47s of the 3618.45s of remaining time.
	Fitting 8 child models (S4F1 - S4F8) | Fitting with ParallelLocalFoldFittingStrategy
	-34.5477	 = Validation score   (-mean_absolute_error)
	123.67s	 = Training   runtime
	1.01s	 = Validation runtime
Fitting model: NeuralNetTorch_BAG_L2 ... Training model for up to 3578.86s of the 3578.83s of remaining time.
	Fitting 8 child models (S4F1 - S4F8) | Fitting with ParallelLocalFoldFittingStrategy
	-31.011	 = Validation score   (-mean_absolute_error)
	909.0s	 = Training   runtime
	9.02s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 3318.78s of the 3318.75s of remaining time.
	Fitting 8 child models (S4F1 - S4F8) | Fitting with ParallelLocalFoldFittingStrategy
	-34.1076	 = Validation score   (-mean_absolute_error)
	122.24s	 = Training   runtime
	2.6s	 = Validation runtime
Repeating k-fold bagging: 5/20
Fitting model: LightGBMXT_BAG_L2 ... Training m

0       3.460373e-08
1       2.714687e-09
2       9.053848e-02
3       5.712576e+01
4       3.979971e+02
            ...     
1531    2.072987e+02
1532    7.079070e+01
1533    1.096719e+00
1534    5.516546e-04
1535    6.113033e-08
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: tuning_best_quality_with_stack_30_A.csv
0
2880
Total data points: 32844
Data points to be removed: 4248


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 10800s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315368.45 GB / 618408.77 GB (51.0%)
Train Data Rows:    24970
Train Data Columns: 79
Tuning Data Rows:    1087
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, 0.0, 103.9337, 211.75438)
	If 'regression' is not the correct problem_type, please manually specify the problem_type paramet

1
1536
2
1536
3
1536


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 37 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', [])    : 31 | ['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', ...]
		('object', []) :  5 | ['month_5', 'month_6', 'month_7', 'month_8', 'month_9']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 42 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', ['bool'])

	-5.3847	 = Validation score   (-mean_absolute_error)
	181.62s	 = Training   runtime
	1.57s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 6805.38s of the 6805.36s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-6.7141	 = Validation score   (-mean_absolute_error)
	33.31s	 = Training   runtime
	0.63s	 = Validation runtime
Repeating k-fold bagging: 2/20
Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 6769.19s of the 6769.17s of remaining time.
	Fitting 8 child models (S2F1 - S2F8) | Fitting with ParallelLocalFoldFittingStrategy
	-6.5605	 = Validation score   (-mean_absolute_error)
	18.01s	 = Training   runtime
	0.71s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 6758.35s of the 6758.32s of remaining time.
	Fitting 8 child models (S2F1 - S2F8) | Fitting with ParallelLocalFoldFittingStrategy
	-6.4595	 = Validation score   (-mean_absolute_error

	Fitting 8 child models (S5F1 - S5F8) | Fitting with ParallelLocalFoldFittingStrategy
	-6.6303	 = Validation score   (-mean_absolute_error)
	127.12s	 = Training   runtime
	1.02s	 = Validation runtime
Fitting model: NeuralNetTorch_BAG_L2 ... Training model for up to 3419.19s of the 3419.17s of remaining time.
	Fitting 8 child models (S5F1 - S5F8) | Fitting with ParallelLocalFoldFittingStrategy
	-5.3945	 = Validation score   (-mean_absolute_error)
	929.89s	 = Training   runtime
	9.34s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 3228.36s of the 3228.33s of remaining time.
	Fitting 8 child models (S5F1 - S5F8) | Fitting with ParallelLocalFoldFittingStrategy
	-6.7018	 = Validation score   (-mean_absolute_error)
	150.94s	 = Training   runtime
	3.42s	 = Validation runtime
Repeating k-fold bagging: 6/20
Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 3194.85s of the 3194.82s of remaining time.
	Fitting 8 child models (S6F1 - S6F8) | Fitt

0       -0.002157
1        0.006316
2        0.013031
3        6.674036
4       60.160950
          ...    
1531    43.358566
1532     8.920002
1533    -0.005856
1534     0.000760
1535     0.010069
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: tuning_best_quality_with_stack_30_B.csv


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 10800s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315331.67 GB / 618408.77 GB (51.0%)
Train Data Rows:    21032
Train Data Columns: 79
Tuning Data Rows:    885
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 90.45849, 177.65934)
	If 'regression' is not the correct problem_type, please manually specify the problem_type para

0
2880
Total data points: 26095
Data points to be removed: 2110
1
1536
2
1536
3
1536


Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    120336.39 MB
	Train Data (Original)  Memory Usage: 7.97 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 37 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', [])    : 31 | ['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', ...]
		('object', [

	1043.87s	 = Training   runtime
	32.67s	 = Validation runtime
Completed 2/20 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L2 ... Training model for up to 719.77s of the 5774.49s of remaining time.
	-4.4526	 = Validation score   (-mean_absolute_error)
	0.42s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting 9 L2 models ...
Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 5774.06s of the 5773.95s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-7.109	 = Validation score   (-mean_absolute_error)
	12.16s	 = Training   runtime
	0.3s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 5759.07s of the 5758.96s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-5.9902	 = Validation score   (-mean_absolute_error)
	7.97s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ... Training

Fitting model: NeuralNetFastAI_BAG_L2 ... Training model for up to 3350.11s of the 3350.0s of remaining time.
	Fitting 8 child models (S4F1 - S4F8) | Fitting with ParallelLocalFoldFittingStrategy
	-9.0467	 = Validation score   (-mean_absolute_error)
	1999.43s	 = Training   runtime
	7.84s	 = Validation runtime
Fitting model: XGBoost_BAG_L2 ... Training model for up to 2847.69s of the 2847.58s of remaining time.
	Fitting 8 child models (S4F1 - S4F8) | Fitting with ParallelLocalFoldFittingStrategy
	-6.5346	 = Validation score   (-mean_absolute_error)
	81.45s	 = Training   runtime
	0.82s	 = Validation runtime
Fitting model: NeuralNetTorch_BAG_L2 ... Training model for up to 2827.99s of the 2827.88s of remaining time.
	Fitting 8 child models (S4F1 - S4F8) | Fitting with ParallelLocalFoldFittingStrategy
	-4.8087	 = Validation score   (-mean_absolute_error)
	665.64s	 = Training   runtime
	7.05s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 2660.65s of 

0       2.900398e-07
1       7.689651e-08
2       1.457528e-07
3       1.900216e-01
4       4.650283e+01
            ...     
1531    4.482951e+01
1532    1.312926e+01
1533    5.227212e-01
1534    1.802884e-04
1535    7.436037e-07
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: tuning_best_quality_with_stack_30_C.csv
Done with run with percentage tuning= 30
