In [11]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import utils
import numpy as np
import random
import string
from autogluon.common import space


def do_prediction(location, limit, name, percentage, trials):
    """Fit an AutoGluon regressor for one location and write its test predictions to CSV.

    Args:
        location: Location identifier handed to
            ``utils.preprocess_category_estimated_observed`` and used as the
            suffix of the output file name.
        limit: Value passed to ``TabularPredictor.fit`` as ``time_limit``.
        name: Prefix of the output file; predictions are written to
            ``<name>_<location>.csv``.
        percentage: Share (in percent, 0 < percentage <= 100) of the tuning
            frame that is sampled and passed to ``fit`` as ``tuning_data``.
        trials: Accepted for interface compatibility only. No
            ``hyperparameter_tune_kwargs`` is passed to ``fit``, so this value
            does not influence training.

    Raises:
        ValueError: If ``percentage`` is outside ``(0, 100]``.
    """
    if not 0 < percentage <= 100:
        raise ValueError(
            'percentage must be in the interval (0, 100], got ' + str(percentage))

    x_train, tuning_data, x_test = utils.preprocess_category_estimated_observed(location)

    # Drop the timestamp columns on copies so the frames returned by `utils`
    # are not mutated behind the caller's back.
    x_train = x_train.drop(columns=['time', 'date_forecast'])
    tuning_data = tuning_data.drop(columns=['time', 'date_forecast'])

    # Keep the forecast timestamps so they can be written next to the predictions.
    x_test_date_forecast = x_test['date_forecast']
    x_test = x_test.drop(columns=['date_forecast']).fillna(0)

    label = 'pv_measurement'
    train_data = TabularDataset(x_train)

    # Use the requested share of the tuning frame (previously hard-coded to 0.3,
    # which silently ignored `percentage`). The fixed seed keeps the sample
    # reproducible between runs.
    tuning_fraction = percentage / 100
    tuning_data = TabularDataset(tuning_data)
    tuning_data = tuning_data.sample(frac=tuning_fraction, random_state=534)

    test_data = TabularDataset(x_test)

    # NOTE(review): the same model directory is used for every location, so a
    # later call writes into the directory used by an earlier one — confirm
    # this is intended.
    predictor = TabularPredictor(label=label,
                                 path="AutoGluonTesting",
                                 eval_metric='mean_absolute_error')

    predictor.fit(train_data,
                  time_limit=limit,
                  tuning_data=tuning_data,
                  use_bag_holdout=True,
                  presets=['best_quality'], )

    y_pred = predictor.predict(test_data)

    print(y_pred)
    preds = pd.DataFrame()
    preds['date_forecast'] = x_test_date_forecast
    preds['predicted'] = np.asarray(y_pred)

    # Build the path once so the log line always names the file actually written.
    output_path = name + '_' + location + '.csv'
    preds.to_csv(output_path)
    print('Saved this file: ' + output_path)

    
# Settings shared by the runs for every location.
time_limit = 2 * 60 * 60  # passed to do_prediction as `limit`
percentage = 30
trials = 40
name = "best_quality_random_seed"

print('Starting run with percentage tuning= ' + str(percentage))
# Same call for each location, in the order A, B, C.
for location in ('A', 'B', 'C'):
    do_prediction(location, time_limit, name, percentage, trials)
print('Done with run with percentage tuning= ' + str(percentage))


Starting run with percentage tuning= 30
0
2880
Total data points: 34085
Data points to be removed: 0
1
1536
2
1536
3
1536


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 7200s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315314.46 GB / 618408.77 GB (51.0%)
Train Data Rows:    29667
Train Data Columns: 79
Tuning Data Rows:    1325
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 674.14552, 1195.53172)
	If 'regression' is not the correct problem_type, please manually specify the problem_type param

Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 3663.98s of the 3663.95s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-116.8979	 = Validation score   (-mean_absolute_error)
	8.68s	 = Training   runtime
	0.44s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 3652.64s of the 3652.61s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-118.4917	 = Validation score   (-mean_absolute_error)
	9.5s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 3640.5s of the 3640.48s of remaining time.
	-118.5574	 = Validation score   (-mean_absolute_error)
	45.97s	 = Training   runtime
	1.37s	 = Validation runtime
Fitting model: CatBoost_BAG_L2 ... Training model for up to 3591.98s of the 3591.95s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelL

0        -0.009091
1         0.109723
2         0.275415
3        60.577763
4       388.245850
           ...    
1531    218.762787
1532     72.793739
1533      1.571804
1534      0.183743
1535      0.271718
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: best_quality_random_seed_30_A.csv


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20


0
2880
Total data points: 32844
Data points to be removed: 4248
1
1536
2
1536
3
1536


Beginning AutoGluon training ... Time limit = 7200s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315286.01 GB / 618408.77 GB (51.0%)
Train Data Rows:    24970
Train Data Columns: 79
Tuning Data Rows:    1088
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1152.3, 0.0, 103.9337, 211.75438)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:   

	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-18.7138	 = Validation score   (-mean_absolute_error)
	12.72s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L2 ... Training model for up to 4076.06s of the 4076.04s of remaining time.
	-18.7934	 = Validation score   (-mean_absolute_error)
	7.39s	 = Training   runtime
	1.07s	 = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L2 ... Training model for up to 4066.83s of the 4066.81s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-19.8938	 = Validation score   (-mean_absolute_error)
	595.03s	 = Training   runtime
	2.28s	 = Validation runtime
Fitting model: XGBoost_BAG_L2 ... Training model for up to 3469.03s of the 3469.01s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-19.1891	 = Validation score   (-mean_absolute_error)
	26.59s	 = Training   

Fitting model: WeightedEnsemble_L3 ... Training model for up to 415.16s of the 616.38s of remaining time.
	-18.6505	 = Validation score   (-mean_absolute_error)
	0.37s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 6584.03s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutoGluonTesting/")


0       -0.287315
1       -0.364094
2       -0.159061
3        5.396346
4       53.828056
          ...    
1531    44.677242
1532    11.403269
1533     1.381363
1534     0.133834
1535     0.537060
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: best_quality_random_seed_30_B.csv


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 7200s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315261.27 GB / 618408.77 GB (51.0%)
Train Data Rows:    21032
Train Data Columns: 79
Tuning Data Rows:    886
Tuning Data Columns: 79
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	Label info (max, min, mean, stddev): (999.6, 0.0, 90.45849, 177.65934)
	If 'regression' is not the correct problem_type, please manually specify the problem_type param

0
2880
Total data points: 26095
Data points to be removed: 2110
1
1536
2
1536
3
1536


Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    120203.18 MB
	Train Data (Original)  Memory Usage: 7.97 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 37 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', [])    : 31 | ['hour_0', 'hour_1', 'hour_10', 'hour_11', 'hour_12', ...]
		('object', [

	-20.4796	 = Validation score   (-mean_absolute_error)
	21.51s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetTorch_BAG_L2 ... Training model for up to 3958.56s of the 3958.54s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-20.0492	 = Validation score   (-mean_absolute_error)
	162.41s	 = Training   runtime
	1.02s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 3793.1s of the 3793.08s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
	-20.9887	 = Validation score   (-mean_absolute_error)
	49.16s	 = Training   runtime
	0.45s	 = Validation runtime
Repeating k-fold bagging: 2/20
Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 3740.94s of the 3740.92s of remaining time.
	Fitting 8 child models (S2F1 - S2F8) | Fitting with ParallelLocalFoldFittingStrategy
	-21.4183	 = Validation score   (-mean_absolu

Fitting model: NeuralNetFastAI_BAG_L2 ... Training model for up to 1361.3s of the 1361.28s of remaining time.
	Fitting 8 child models (S5F1 - S5F8) | Fitting with ParallelLocalFoldFittingStrategy
	-22.3057	 = Validation score   (-mean_absolute_error)
	2463.09s	 = Training   runtime
	8.89s	 = Validation runtime
Fitting model: XGBoost_BAG_L2 ... Training model for up to 857.94s of the 857.91s of remaining time.
	Fitting 8 child models (S5F1 - S5F8) | Fitting with ParallelLocalFoldFittingStrategy
	-20.5024	 = Validation score   (-mean_absolute_error)
	111.78s	 = Training   runtime
	0.96s	 = Validation runtime
Fitting model: NeuralNetTorch_BAG_L2 ... Training model for up to 830.86s of the 830.84s of remaining time.
	Fitting 8 child models (S5F1 - S5F8) | Fitting with ParallelLocalFoldFittingStrategy
	-20.0961	 = Validation score   (-mean_absolute_error)
	835.12s	 = Training   runtime
	7.13s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L2 ... Training model for up to 666.03s of t

0       5.469532e-08
1       1.498786e-08
2       2.408760e-08
3       1.245885e+00
4       3.618411e+01
            ...     
1531    5.507752e+01
1532    1.046285e+01
1533    5.757757e+00
1534    6.649827e-05
1535    2.367236e-07
Name: pv_measurement, Length: 1536, dtype: float32
Saved this file: best_quality_random_seed_30_C.csv
Done with run with percentage tuning= 30


In [12]:
import pandas as pd

file_name = 'best_quality_random_seed_'


def _load_predictions(location):
    """Read ``<file_name><location>.csv`` and add a parsed ``date`` column."""
    pred = pd.read_csv(file_name + location + '.csv')
    pred['date'] = pd.to_datetime(pred['date_forecast'])
    return pred


pred_a = _load_predictions('A')
pred_b = _load_predictions('B')
pred_c = _load_predictions('C')

test = pd.read_csv('data/test.csv')
test['time'] = pd.to_datetime(test['time'])

# For each location keep only the predictions whose timestamp occurs in
# test['time']. Every frame is filtered with its OWN 'date' column; the earlier
# version masked pred_b and pred_c with pred_a['date'], a copy-paste slip that
# only gave the right rows when all three files had identical timestamps.
kept_predictions = []
for pred in (pred_a, pred_b, pred_c):
    in_test = pred['date'].isin(test['time'])
    kept_predictions.append(pred.loc[in_test, 'predicted'].astype(float))

# Concatenate once (A, then B, then C) instead of appending row by row.
submission = pd.concat(kept_predictions, ignore_index=True).to_frame('prediction')

# Replace anything that is not >= 0 (negative values, and NaN) with 0.
submission['prediction'] = submission['prediction'].where(submission['prediction'] >= 0, 0)
submission.index.name = "id"

submission.to_csv('submission_' + file_name + 'attempt.csv')
