# IV. FORECASTING MODEL TUNING

We re-evaluate our selections of machine learning on our new data from April 2022 to June 2023 that contains both cloud mask and overview RGB images from the Himawari- 8 satellite. 

- Section IV-A covers the selection of model candidates and features.

- The input for this notebook is `dataset_realistic_cm_r_5x5.csv`, which contains data on cloud extraction and clear-sky irradiance for the specified date, time, and site.
- The outputs of this notebook are the estimated irradiance values from each model, including three models:
    - Extra Trees model: `Ihat_et_test_set.csv`
    - LightGBM model: `Ihat_lightgbm_v2_test_set.csv`
    - Random Forest model: `Ihat_rf_test_set.csv`

In [23]:
# import data 
import pandas as pd

df = pd.read_csv('dataset_realistic_cm_r_5x5.csv', parse_dates=['Datetime'])
df['site_name'] = df['site_name'].astype(str).str.zfill(2)
df['date'] = df['Datetime'].dt.date.astype(str)
df['date'] = df['date'] +" "+ df['site_name']
df['k'] = df['I'] / df['Iclr']
df.set_index('Datetime', inplace=True)
df

Unnamed: 0_level_0,site_name,I,Iclr,latt,long,CI,CI_1,CI_2,CI_3,CI_4,...,R_18,R_19,R_20,R_21,R_22,R_23,R_24,R_25,date,k
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-04-01 07:30:00+07:00,01,235.57840,265.993700,14.00523,100.519403,14.0,20.0,15.0,15.0,20.0,...,82.0,76.0,77.0,94.0,81.0,79.0,75.0,76.0,2022-04-01 01,0.885654
2022-04-01 07:45:00+07:00,01,322.49360,334.819476,14.00523,100.519403,14.0,20.0,15.0,15.0,20.0,...,82.0,76.0,77.0,94.0,81.0,79.0,75.0,76.0,2022-04-01 01,0.963187
2022-04-01 08:00:00+07:00,01,357.68410,402.581345,14.00523,100.519403,13.0,12.0,15.0,17.0,21.0,...,75.0,76.0,78.0,79.0,76.0,76.0,75.0,77.0,2022-04-01 01,0.888477
2022-04-01 08:15:00+07:00,01,413.50590,468.732095,14.00523,100.519403,13.0,12.0,15.0,17.0,21.0,...,75.0,76.0,78.0,79.0,76.0,76.0,75.0,77.0,2022-04-01 01,0.882180
2022-04-01 08:30:00+07:00,01,471.19330,532.843024,14.00523,100.519403,13.0,12.0,15.0,17.0,21.0,...,75.0,76.0,78.0,79.0,76.0,76.0,75.0,77.0,2022-04-01 01,0.884300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-30 16:00:00+07:00,56,91.85729,666.692427,13.50130,100.135400,211.0,218.0,230.0,229.0,229.0,...,153.0,153.0,153.0,153.0,147.0,151.0,156.0,158.0,2023-06-30 56,0.137781
2023-06-30 16:15:00+07:00,56,107.00000,609.298561,13.50130,100.135400,211.0,218.0,230.0,229.0,229.0,...,153.0,153.0,153.0,153.0,147.0,151.0,156.0,158.0,2023-06-30 56,0.175612
2023-06-30 16:30:00+07:00,56,113.72660,549.564992,13.50130,100.135400,211.0,218.0,230.0,229.0,229.0,...,153.0,153.0,153.0,153.0,147.0,151.0,156.0,158.0,2023-06-30 56,0.206939
2023-06-30 16:45:00+07:00,56,108.15760,487.772006,13.50130,100.135400,211.0,218.0,230.0,229.0,229.0,...,153.0,153.0,153.0,153.0,147.0,151.0,156.0,158.0,2023-06-30 56,0.221738


In [24]:
# create new features and encode them

df = df.between_time('07:00', '17:00')
df['hour'] = df.index.hour
df['hour_encode1'] = df['hour'] - 11.5
df['hour_encode2'] = (df['hour'] - 7)/(17-7)
df['day'] = df.index.dayofyear
df['month'] = df.index.month
df.reset_index(inplace=True)
df

Unnamed: 0,Datetime,site_name,I,Iclr,latt,long,CI,CI_1,CI_2,CI_3,...,R_23,R_24,R_25,date,k,hour,hour_encode1,hour_encode2,day,month
0,2022-04-01 07:30:00+07:00,01,235.57840,265.993700,14.00523,100.519403,14.0,20.0,15.0,15.0,...,79.0,75.0,76.0,2022-04-01 01,0.885654,7,-4.5,0.0,91,4
1,2022-04-01 07:45:00+07:00,01,322.49360,334.819476,14.00523,100.519403,14.0,20.0,15.0,15.0,...,79.0,75.0,76.0,2022-04-01 01,0.963187,7,-4.5,0.0,91,4
2,2022-04-01 08:00:00+07:00,01,357.68410,402.581345,14.00523,100.519403,13.0,12.0,15.0,17.0,...,76.0,75.0,77.0,2022-04-01 01,0.888477,8,-3.5,0.1,91,4
3,2022-04-01 08:15:00+07:00,01,413.50590,468.732095,14.00523,100.519403,13.0,12.0,15.0,17.0,...,76.0,75.0,77.0,2022-04-01 01,0.882180,8,-3.5,0.1,91,4
4,2022-04-01 08:30:00+07:00,01,471.19330,532.843024,14.00523,100.519403,13.0,12.0,15.0,17.0,...,76.0,75.0,77.0,2022-04-01 01,0.884300,8,-3.5,0.1,91,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
867808,2023-06-30 16:00:00+07:00,56,91.85729,666.692427,13.50130,100.135400,211.0,218.0,230.0,229.0,...,151.0,156.0,158.0,2023-06-30 56,0.137781,16,4.5,0.9,181,6
867809,2023-06-30 16:15:00+07:00,56,107.00000,609.298561,13.50130,100.135400,211.0,218.0,230.0,229.0,...,151.0,156.0,158.0,2023-06-30 56,0.175612,16,4.5,0.9,181,6
867810,2023-06-30 16:30:00+07:00,56,113.72660,549.564992,13.50130,100.135400,211.0,218.0,230.0,229.0,...,151.0,156.0,158.0,2023-06-30 56,0.206939,16,4.5,0.9,181,6
867811,2023-06-30 16:45:00+07:00,56,108.15760,487.772006,13.50130,100.135400,211.0,218.0,230.0,229.0,...,151.0,156.0,158.0,2023-06-30 56,0.221738,16,4.5,0.9,181,6


In [25]:
import datetime
import numpy as np

# We will use the sine and cosine transformations of the timestamp to encode the cyclical nature of the time of day.
timestamp_s = pd.to_datetime(df['Datetime']).map(datetime.datetime.timestamp)

# The timestamp is in seconds, so we must calculate the number of seconds in a day before applying the sine transformation.
day = 24*60*60 
df['day_sin'] = (np.sin(timestamp_s * (2*np.pi/day))).values
df['day_cos'] = (np.cos(timestamp_s * (2*np.pi/day))).values
df

Unnamed: 0,Datetime,site_name,I,Iclr,latt,long,CI,CI_1,CI_2,CI_3,...,R_25,date,k,hour,hour_encode1,hour_encode2,day,month,day_sin,day_cos
0,2022-04-01 07:30:00+07:00,01,235.57840,265.993700,14.00523,100.519403,14.0,20.0,15.0,15.0,...,76.0,2022-04-01 01,0.885654,7,-4.5,0.0,91,4,0.130526,0.991445
1,2022-04-01 07:45:00+07:00,01,322.49360,334.819476,14.00523,100.519403,14.0,20.0,15.0,15.0,...,76.0,2022-04-01 01,0.963187,7,-4.5,0.0,91,4,0.195090,0.980785
2,2022-04-01 08:00:00+07:00,01,357.68410,402.581345,14.00523,100.519403,13.0,12.0,15.0,17.0,...,77.0,2022-04-01 01,0.888477,8,-3.5,0.1,91,4,0.258819,0.965926
3,2022-04-01 08:15:00+07:00,01,413.50590,468.732095,14.00523,100.519403,13.0,12.0,15.0,17.0,...,77.0,2022-04-01 01,0.882180,8,-3.5,0.1,91,4,0.321439,0.946930
4,2022-04-01 08:30:00+07:00,01,471.19330,532.843024,14.00523,100.519403,13.0,12.0,15.0,17.0,...,77.0,2022-04-01 01,0.884300,8,-3.5,0.1,91,4,0.382683,0.923880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
867808,2023-06-30 16:00:00+07:00,56,91.85729,666.692427,13.50130,100.135400,211.0,218.0,230.0,229.0,...,158.0,2023-06-30 56,0.137781,16,4.5,0.9,181,6,0.707107,-0.707107
867809,2023-06-30 16:15:00+07:00,56,107.00000,609.298561,13.50130,100.135400,211.0,218.0,230.0,229.0,...,158.0,2023-06-30 56,0.175612,16,4.5,0.9,181,6,0.659346,-0.751840
867810,2023-06-30 16:30:00+07:00,56,113.72660,549.564992,13.50130,100.135400,211.0,218.0,230.0,229.0,...,158.0,2023-06-30 56,0.206939,16,4.5,0.9,181,6,0.608761,-0.793353
867811,2023-06-30 16:45:00+07:00,56,108.15760,487.772006,13.50130,100.135400,211.0,218.0,230.0,229.0,...,158.0,2023-06-30 56,0.221738,16,4.5,0.9,181,6,0.555570,-0.831470


In [None]:
# split the data into train and test sets
from utils import SolarSplitter

train_df, test_df, train_info_df, test_info_df, cluster_info_df = SolarSplitter.split_train_test_sky_condition(df, train_frac=0.6, random_state=42)

In [26]:
# load the train and test sets that we have saved
train_df = pd.read_csv('/Users/nuttamon/Documents/project/solar_map/data/train_df.csv')
test_df = pd.read_csv('/Users/nuttamon/Documents/project/solar_map/data/test_df.csv')

In [27]:
train = train_df[['Iclr', 'CI', 'R', 'hour_encode1', 'latt' , 'day', 'month', 'long', 'I']]
train

Unnamed: 0,Iclr,CI,R,hour_encode1,latt,day,month,long,I
0,265.993700,14.0,78.0,-4.5,14.00523,91,4,100.519403,235.57840
1,334.819476,14.0,78.0,-4.5,14.00523,91,4,100.519403,322.49360
2,402.581345,13.0,74.0,-3.5,14.00523,91,4,100.519403,357.68410
3,468.732095,13.0,74.0,-3.5,14.00523,91,4,100.519403,413.50590
4,532.843024,13.0,74.0,-3.5,14.00523,91,4,100.519403,471.19330
...,...,...,...,...,...,...,...,...,...
520685,666.692427,211.0,158.0,4.5,13.50130,181,6,100.135400,91.85729
520686,609.298561,211.0,158.0,4.5,13.50130,181,6,100.135400,107.00000
520687,549.564992,211.0,158.0,4.5,13.50130,181,6,100.135400,113.72660
520688,487.772006,211.0,158.0,4.5,13.50130,181,6,100.135400,108.15760


## Train AutoML

In [28]:
import pycaret
from pycaret.regression import *


# Set display results
pd.options.display.float_format = '{:,.4f}'.format
%config InlineBackend.figure_format = 'retina'

# PyCaret version
print(f'PyCaret version: {pycaret.__version__}')

PyCaret version: 3.1.0


In [29]:
# Set up the PyCaret environment
regression = setup(
    data=train, 
    target='I',  # Specify 'I' as the target column
    train_size=0.8,
    numeric_features=['Iclr', 'CI', 'R', 'hour_encode1', 'latt' ,'long'],
    numeric_imputation='mean',  # For deployment purposes
    normalize=True,
    normalize_method='minmax',
    polynomial_features=True,
    polynomial_degree=2,
    use_gpu=False,
    fold=5,
    n_jobs=-1,
)

Unnamed: 0,Description,Value
0,Session id,6597
1,Target,I
2,Target type,Regression
3,Original data shape,"(520690, 9)"
4,Transformed data shape,"(520690, 45)"
5,Transformed train set shape,"(416552, 45)"
6,Transformed test set shape,"(104138, 45)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Compare models
estimatedModels = compare_models(sort = 'MAE', n_select = 10)

In [None]:
# Print the estimated models
print(estimatedModels)

In [None]:
# Plot the estimated models
plot_model(estimatedModels[0], plot = 'feature')

In [None]:
# Create a model
model = create_model('rf')

In [None]:
# Tune the model 

custom_grid = {
    'max_depth': [None, 5,10,20],  
    'max_features': [1,4,6,12],  
    'min_samples_leaf': [1,10,20,50]
}

# Tune the model with the custom grid
tuned_rf = tune_model(model, custom_grid=custom_grid)

In [None]:
# Print the tuned model
tuned_rf.get_params()

In [None]:
# Plot the tuned model
finalModel_rf = finalize_model(tuned_rf)

In [None]:
# save model
save_model(finalModel_rf, '/mode_tuned/model_rf')

In [None]:

# Save results
  
test = test_df[['Iclr', 'CI', 'R', 'hour_encode1','latt' ,'long', 'day', 'month']].copy()
test_df['Datetime'] = pd.to_datetime(test_df['Datetime'])

test['I_hat'] = finalModel_rf.predict(test[['Iclr', 'CI', 'R', 'hour_encode1','latt' ,'long', 'day', 'month']])
test['site_name'] = test_df['site_name']
test['Datetime'] = test_df.Datetime
test['I'] = test_df.I
test.set_index('Datetime', inplace=True)
test['Date'] = test.index.date.astype(str)
test['hour'] = test.index.hour

test.to_csv('Ihat_rf_test_set.csv')