In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import utils
import numpy as np
import random
import string
from autogluon.common import space


def do_prediction(location, limit, name, percentage, trials):
    """Fit an AutoGluon regressor for one location and write its test predictions to CSV.

    The data comes from ``utils.preprocess_category_estimated_observed(location)``,
    which is unpacked into a training frame, a tuning frame and a test frame.
    The ``time`` and ``date_forecast`` columns are dropped before fitting, and
    missing values in the test frame are replaced with 0.

    Args:
        location: Location identifier forwarded to the preprocessing helper and
            used in the output filename.
        limit: Value passed to ``TabularPredictor.fit`` as ``time_limit``.
        name: Run name used as the prefix of the output filename.
        percentage: Share (0-100) of the leading tuning rows handed to
            AutoGluon as ``tuning_data``.
        trials: Currently unused. No ``hyperparameter_tune_kwargs`` is passed to
            ``fit``, so no hyperparameter search is performed. The parameter is
            kept so existing callers keep working.

    Side effects:
        Prints the predictions, writes ``<name>_<percentage>_<location>.csv`` to
        the current working directory, and saves the fitted models under
        ``AutoGluonTesting``.
    """
    x_train, tuning_data, x_test = utils.preprocess_category_estimated_observed(location)

    # Timestamp columns are not used as model features.
    x_train = x_train.drop(columns=['time', 'date_forecast'])
    tuning_data = tuning_data.drop(columns=['time', 'date_forecast'])

    # Keep the forecast timestamps so they can be attached to the predictions.
    x_test_date_forecast = x_test['date_forecast']
    x_test = x_test.drop(columns=['date_forecast']).fillna(0)

    label = 'pv_measurement'
    train_data = TabularDataset(x_train)

    # Only the first `percentage` percent of the tuning rows are used.
    tuning_fraction = percentage / 100
    tuning_data = TabularDataset(tuning_data)
    tuning_row_count = int(len(tuning_data) * tuning_fraction)
    tuning_data = tuning_data.iloc[:tuning_row_count]

    test_data = TabularDataset(x_test)

    # NOTE(review): every call uses the same model directory, so successive
    # locations presumably overwrite each other's saved models - confirm
    # whether a per-location path is wanted.
    predictor = TabularPredictor(label=label,
                                 path="AutoGluonTesting",
                                 eval_metric='mean_absolute_error')

    predictor.fit(train_data,
                  time_limit=limit,
                  tuning_data=tuning_data,
                  use_bag_holdout=True,
                  presets=['best_quality'])

    y_pred = predictor.predict(test_data)
    print(y_pred)

    preds = pd.DataFrame({
        'date_forecast': x_test_date_forecast,
        'predicted': np.asarray(y_pred),
    })

    # Build the filename once so the confirmation message always names the
    # file that was actually written.
    output_file = f'{name}_{percentage}_{location}.csv'
    preds.to_csv(output_file)
    print('Saved this file: ' + output_file)

    
# Run configuration shared by all locations.
time_limit = 60 * 60  # passed to do_prediction as its `limit` argument
percentage = 30
trials = 40
name = "tuning_best_quality_with_stack"

print(f'Starting run with percentage tuning= {percentage}')
# Train and predict for each location in turn with identical settings.
for site in ('A', 'B', 'C'):
    do_prediction(site, time_limit, name, percentage, trials)
print(f'Done with run with percentage tuning= {percentage}')


Starting run with percentage tuning= 30
0
2880
Total data points: 34085
Data points to be removed: 0
1
1536
2
1536
3
1536


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   316658.65 GB / 618408.77 GB (51.2%)
Train Data Rows:    29667
Train Data Columns: 79
Tuning Data Rows:    1325
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 674.14552, 1195.53172)
	If 'regression' is not the correct problem_type, please manually specify the problem_type param

	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-65.6147	 = Validation score   (-mean_absolute_error)
	8.62s	 = Training   runtime
	0.41s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 1187.31s of the 1187.28s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-66.6553	 = Validation score   (-mean_absolute_error)
	9.34s	 = Training   runtime
	0.23s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 1175.45s of the 1175.41s of remaining time.
	-63.4479	 = Validation score   (-mean_absolute_error)
	46.25s	 = Training   runtime
	1.73s	 = Validation runtime
Fitting model: CatBoost_BAG_L2 ... Training model for up to 1110.3s of the 1110.26s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-66.9216	 = Validation score   (-mean_absolute_error)
	37.1s	 = Training   runtime


0         0.000000
1         0.000000
2         0.094600
3        45.318901
4       257.243073
           ...    
1531    220.789795
1532     75.671936
1533      2.513867
1534      0.000733
1535      0.000000
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: tuning_best_quality_with_stack_30_A.csv
0
2880
Total data points: 32844
Data points to be removed: 4248


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   316177.47 GB / 618408.77 GB (51.1%)
Train Data Rows:    24970
Train Data Columns: 79
Tuning Data Rows:    1087
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, 0.0, 103.9337, 211.75438)
	If 'regression' is not the correct problem_type, please manually specify the problem_type paramete

1
1536
2
1536
3
1536


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 37 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', [])    : 31 | ['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', ...]
		('object', []) :  5 | ['month_5', 'month_6', 'month_7', 'month_8', 'month_9']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 42 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', ['bool'])

	-2.73	 = Validation score   (-mean_absolute_error)
	197.57s	 = Training   runtime
	2.47s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 329.27s of the 329.23s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-2.8288	 = Validation score   (-mean_absolute_error)
	34.48s	 = Training   runtime
	0.6s	 = Validation runtime
Completed 1/20 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.0s of the 290.53s of remaining time.
	-2.6792	 = Validation score   (-mean_absolute_error)
	0.39s	 = Training   runtime
	0.04s	 = Validation runtime
AutoGluon training complete, total runtime = 3309.93s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutoGluonTesting/")


0       2.786812e-11
1       2.085799e-12
2       7.995170e-12
3       7.867450e+00
4       6.046466e+01
            ...     
1531    4.786567e+01
1532    1.023238e+01
1533    5.187640e-02
1534    3.581934e-03
1535    9.868707e-11
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: tuning_best_quality_with_stack_30_B.csv
0
2880
Total data points: 26095
Data points to be removed: 2110


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   316178.92 GB / 618408.77 GB (51.1%)
Train Data Rows:    21032
Train Data Columns: 79
Tuning Data Rows:    885
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 90.45849, 177.65934)
	If 'regression' is not the correct problem_type, please manually specify the problem_type param

1
1536
2
1536
3
1536


	Available Memory:                    120477.12 MB
	Train Data (Original)  Memory Usage: 7.97 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 37 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', [])    : 31 | ['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', ...]
		('object', []) :  5 | ['month_5', 'month_6', 'month_7'