In [7]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import utils
import numpy as np
import random
import string
from autogluon.common import space


def do_prediction(location, limit, name, percentage, trials):
    """Fit an AutoGluon regressor for one location and save its test predictions.

    The data comes from ``utils.preprocess_category_estimated_observed``,
    which is unpacked here into a training frame, a tuning frame and a test
    frame. A ``TabularPredictor`` is fitted on the training frame with the
    ``best_quality`` preset, using a slice of the tuning frame as held-out
    tuning data, and the predictions for the test frame are written to a CSV.

    Args:
        location: Location identifier forwarded to the preprocessing helper
            and embedded in the output file name (the script calls this with
            'A', 'B' and 'C').
        limit: Value passed to ``TabularPredictor.fit`` as ``time_limit``.
        name: Run name embedded in the output file name.
        percentage: Size of the tuning window as a percentage of the tuning
            frame. The window starts ``percentage`` percent into the frame
            and has that same length, i.e. rows ``[p, 2p)`` of the frame.
        trials: Accepted for backward compatibility but currently unused:
            ``fit`` is not given ``hyperparameter_tune_kwargs``, so no
            hyperparameter search is performed.

    Returns:
        None. The predictions are written to
        ``<percentage><name>_<location>.csv`` in the working directory.
    """
    x_train, tuning_data, x_test = utils.preprocess_category_estimated_observed(location)

    # Timestamp columns are not used as model features. The forecast dates of
    # the test set are kept aside so they can be attached to the predictions.
    x_train.drop(["time", 'date_forecast'], axis=1, inplace=True)
    tuning_data.drop(["time", 'date_forecast'], axis=1, inplace=True)
    x_test_date_forecast = x_test['date_forecast']
    x_test.drop(['date_forecast'], axis=1, inplace=True)

    x_test.fillna(0, inplace=True)

    label = 'pv_measurement'
    train_data = TabularDataset(x_train)

    # Take a window of `percentage` percent of the tuning frame, starting
    # `percentage` percent into it. iloc clips the end if the window would
    # run past the last row (percentage > 50).
    tuning_fraction = percentage / 100
    tuning_data = TabularDataset(tuning_data)
    window = int(len(tuning_data) * tuning_fraction)
    tuning_data = tuning_data.iloc[window:2 * window]

    test_data = TabularDataset(x_test)

    # NOTE(review): every call uses the same model directory, so a later
    # location presumably overwrites the models of an earlier one - confirm
    # this is acceptable before relying on the saved predictor.
    predictor = TabularPredictor(label=label,
                                 path="AutoGluonTesting",
                                 eval_metric='mean_absolute_error')

    # use_bag_holdout=True so the supplied tuning_data is used as a holdout
    # alongside the bagging enabled by the 'best_quality' preset.
    predictor.fit(train_data,
                  time_limit=limit,
                  tuning_data=tuning_data,
                  use_bag_holdout=True,
                  presets=['best_quality'], )

    y_pred = predictor.predict(test_data)

    print(y_pred)
    preds = pd.DataFrame()
    preds['date_forecast'] = x_test_date_forecast
    # Assign positionally so the predictions line up with the forecast dates
    # regardless of the index carried by y_pred.
    preds['predicted'] = np.asarray(y_pred)

    # Build the file name once so the log message reports the file that was
    # actually written (the on-disk name is unchanged from the original).
    output_file = str(percentage) + name + '_' + location + '.csv'
    preds.to_csv(output_file)
    print('Saved this file: ' + output_file)

    
# Run configuration shared by every location.
time_limit = 3600  # forwarded to each fit as its time limit
percentage = 30
trials = 40
name = "tuning_best_quality_with_stack"

print(f'Starting run with percentage tuning= {percentage}')
# Train and predict for each location in turn, with identical settings.
for location in ('A', 'B', 'C'):
    do_prediction(location, time_limit, name, percentage, trials)
print(f'Done with run with percentage tuning= {percentage}')


Starting run with percentage tuning= 30
0
2880
Total data points: 34085
Data points to be removed: 0


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315885.13 GB / 618408.77 GB (51.1%)
Train Data Rows:    29667
Train Data Columns: 79
Tuning Data Rows:    1325
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 674.14552, 1195.53172)
	If 'regression' is not the correct problem_type, please manually specify the problem_type param

1
1536
2
1536
3
1536


	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 37 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', [])    : 31 | ['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', ...]
		('object', []) :  5 | ['month_5', 'month_6', 'month_7', 'month_8', 'month_9']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 42 | ['absol

	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-31.182	 = Validation score   (-mean_absolute_error)
	257.07s	 = Training   runtime
	1.78s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 247.6s of the 247.57s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-34.6541	 = Validation score   (-mean_absolute_error)
	34.29s	 = Training   runtime
	0.57s	 = Validation runtime
Completed 1/20 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.0s of the 210.56s of remaining time.
	-31.1781	 = Validation score   (-mean_absolute_error)
	0.37s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 3389.85s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutoGluonTesting/")


0       4.470402e-07
1       9.092030e-08
2       1.194059e-01
3       4.961147e+01
4       3.350621e+02
            ...     
1531    1.746136e+02
1532    6.401384e+01
1533    1.593259e+00
1534    8.398810e-04
1535    4.017200e-07
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: tuning_best_quality_with_stack_30_A.csv


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13


0
2880
Total data points: 32844
Data points to be removed: 4248
1
1536
2
1536
3
1536


Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315533.24 GB / 618408.77 GB (51.0%)
Train Data Rows:    24970
Train Data Columns: 79
Tuning Data Rows:    1087
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, 0.0, 103.9337, 211.75438)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    123230.18 MB
	Train Data (Original)  Memory Usage: 9.51 MB (0.0% of available memory)
	Inferring data type of each feature based on column

	11.14s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L2 ... Training model for up to 1133.43s of the 1133.4s of remaining time.
	-6.8898	 = Validation score   (-mean_absolute_error)
	7.33s	 = Training   runtime
	1.1s	 = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L2 ... Training model for up to 1124.0s of the 1123.97s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-7.5295	 = Validation score   (-mean_absolute_error)
	563.62s	 = Training   runtime
	2.03s	 = Validation runtime
Fitting model: XGBoost_BAG_L2 ... Training model for up to 557.46s of the 557.44s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-6.5376	 = Validation score   (-mean_absolute_error)
	23.83s	 = Training   runtime
	0.22s	 = Validation runtime
Fitting model: NeuralNetTorch_BAG_L2 ... Training model for up to 530.69s of the 530.66s of remaining time.
	Fit

0       -0.003690
1       -0.002144
2        0.003434
3        5.808100
4       54.339439
          ...    
1531    42.632210
1532     8.673139
1533     0.041077
1534    -0.005693
1535     0.002740
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: tuning_best_quality_with_stack_30_B.csv


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315502.66 GB / 618408.77 GB (51.0%)
Train Data Rows:    21032
Train Data Columns: 79
Tuning Data Rows:    885
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 90.45849, 177.65934)
	If 'regression' is not the correct problem_type, please manually specify the problem_type param

0
2880
Total data points: 26095
Data points to be removed: 2110
1
1536
2
1536
3
1536


Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    125785.09 MB
	Train Data (Original)  Memory Usage: 7.97 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 37 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', [])    : 31 | ['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', ...]
		('object', [

	-6.598	 = Validation score   (-mean_absolute_error)
	21.34s	 = Training   runtime
	0.16s	 = Validation runtime
Fitting model: NeuralNetTorch_BAG_L2 ... Training model for up to 647.43s of the 647.41s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-4.7927	 = Validation score   (-mean_absolute_error)
	176.57s	 = Training   runtime
	2.21s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 467.82s of the 467.8s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-5.9996	 = Validation score   (-mean_absolute_error)
	45.43s	 = Training   runtime
	0.57s	 = Validation runtime
Completed 1/20 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.0s of the 419.81s of remaining time.
	-4.7927	 = Validation score   (-mean_absolute_error)
	0.35s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon trainin

0       1.221131e-07
1       3.346197e-08
2       5.377809e-08
3       1.822123e-01
4       4.719907e+01
            ...     
1531    4.496088e+01
1532    1.356952e+01
1533    4.955992e-01
1534    1.484644e-04
1535    5.285103e-07
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: tuning_best_quality_with_stack_30_C.csv
Done with run with percentage tuning= 30
