# Evaluation of machine learning models

Overall deterministic performance is evaluated by the root mean square error (RMSE) and the mean absolute error (MAE) between the estimated irradiance and the measurements. We benchmark the top three models against each other, i.e., Extra Trees, Random Forest, and LightGBM, under clear, partly cloudy, and cloudy sky conditions, and separately on midday data and on morning-and-evening (non-midday) data.

- The input files of this notebook contain the estimated irradiance values on the test set, one file for each of the three models:
    - Extra Trees model: `Ihat_et_test_set.csv`
    - LightGBM model: `Ihat_lightgbm_v2_test_set.csv`
    - Random Forest model: `Ihat_rf_test_set.csv`

## Error metrics of the Extra Trees model

In [2]:
import pandas as pd

# Load the Extra Trees estimates for the test set, skipping the columns that
# this evaluation does not use.
et = pd.read_csv(
    'exp_results/Ihat_et_test_set.csv',
    parse_dates=['Datetime'],
    usecols=lambda col: col not in ['condition', 'hour_encode1', 'day', 'month'],
)

# k: measured irradiance I divided by the clear-sky irradiance Iclr.
et['k'] = et['I'] / et['Iclr']
# CI and R are divided by 255 (presumably 8-bit values rescaled to [0, 1] - TODO confirm).
et['CI'] = et['CI'] / 255
et['R'] = et['R'] / 255

# k_bar: mean of k over each (Date, site_name) group, broadcast back to every
# row of that group. transform('mean') returns a Series aligned with et's own
# index, so no merge or index realignment is needed.
et['k_bar'] = et.groupby(['Date', 'site_name'])['k'].transform('mean')

# Threshold labelling is done with vectorized masks: start from the fallback
# label and overwrite it, applying the tightest threshold last so it wins.
# Rows whose value is NaN fail every comparison and keep the fallback label.

# Daily sky condition from k_bar: < 0.3 cloudy, < 0.6 partly cloudy, else clear.
et['sky_condition'] = (
    pd.Series('clear', index=et.index)
    .mask(et['k_bar'] < 0.6, 'partly_cloudy')
    .mask(et['k_bar'] < 0.3, 'cloudy')
)

# Per-sample cluster from R: < 0.3 low, < 0.6 medium, else high.
et['R_clusters'] = (
    pd.Series('High cloud', index=et.index)
    .mask(et['R'] < 0.6, 'Medium cloud')
    .mask(et['R'] < 0.3, 'Low cloud')
)

# Midday covers hours 10 through 15 inclusive; every other hour is non-midday.
et['hour_clusters'] = (
    pd.Series('non-midday', index=et.index)
    .mask(et['hour'].between(10, 15), 'midday')
)
et

Unnamed: 0,Datetime,Iclr,CI,R,latt,long,I_hat,site_name,I,Date,hour,k,k_bar,sky_condition,R_clusters,hour_clusters
0,2022-04-05 07:00:00+07:00,141.241660,0.090196,0.203922,14.00523,100.519403,124.756089,ISL001,183.5975,2022-04-05,7,1.299882,0.695748,clear,Low cloud,non-midday
1,2022-04-05 07:15:00+07:00,209.518060,0.090196,0.203922,14.00523,100.519403,167.423803,ISL001,240.5533,2022-04-05,7,1.148127,0.695748,clear,Low cloud,non-midday
2,2022-04-05 07:30:00+07:00,278.611744,0.141176,0.298039,14.00523,100.519403,219.568249,ISL001,164.2464,2022-04-05,7,0.589517,0.695748,clear,Low cloud,non-midday
3,2022-04-05 07:45:00+07:00,347.292820,0.141176,0.298039,14.00523,100.519403,274.106907,ISL001,176.0619,2022-04-05,7,0.506955,0.695748,clear,Low cloud,non-midday
4,2022-04-05 08:00:00+07:00,414.830234,0.172549,0.576471,14.00523,100.519403,216.567147,ISL001,261.4042,2022-04-05,8,0.630147,0.695748,clear,Medium cloud,non-midday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347118,2023-06-22 16:00:00+07:00,660.558000,0.023529,0.282353,13.50130,100.135400,445.418153,ISL056,473.4280,2023-06-22,16,0.716709,0.742820,clear,Low cloud,non-midday
347119,2023-06-22 16:15:00+07:00,603.006077,0.023529,0.282353,13.50130,100.135400,414.548652,ISL056,418.3324,2023-06-22,16,0.693745,0.742820,clear,Low cloud,non-midday
347120,2023-06-22 16:30:00+07:00,543.148095,0.023529,0.235294,13.50130,100.135400,403.044291,ISL056,367.2727,2023-06-22,16,0.676193,0.742820,clear,Low cloud,non-midday
347121,2023-06-22 16:45:00+07:00,481.265865,0.023529,0.235294,13.50130,100.135400,365.116572,ISL056,311.9268,2023-06-22,16,0.648138,0.742820,clear,Low cloud,non-midday


In [3]:
# Error of the Extra Trees estimates (I_hat) against the measurements (I):
# RMSE for each of the 3 sky conditions x 2 hour clusters, then RMSE and MAE
# overall, per hour cluster, and per sky condition.

from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Boolean row selectors, built once and reused by every slice below.
et_is_midday = et['hour_clusters'] == 'midday'
et_is_non_midday = et['hour_clusters'] == 'non-midday'
et_is_clear = et['sky_condition'] == 'clear'
et_is_partly_cloudy = et['sky_condition'] == 'partly_cloudy'
et_is_cloudy = et['sky_condition'] == 'cloudy'

# One sub-frame per (sky condition, hour cluster) combination.
et_midday_clear = et[et_is_clear & et_is_midday]
et_midday_partly_cloudy = et[et_is_partly_cloudy & et_is_midday]
et_midday_cloudy = et[et_is_cloudy & et_is_midday]

et_non_midday_clear = et[et_is_clear & et_is_non_midday]
et_non_midday_partly_cloudy = et[et_is_partly_cloudy & et_is_non_midday]
et_non_midday_cloudy = et[et_is_cloudy & et_is_non_midday]

# RMSE of each sub-frame, evaluated in the same order as the names on the left.
(
    rmse_et_midday_clear,
    rmse_et_midday_partly_cloudy,
    rmse_et_midday_cloudy,
    rmse_et_non_midday_clear,
    rmse_et_non_midday_partly_cloudy,
    rmse_et_non_midday_cloudy,
) = (
    sqrt(mean_squared_error(subset['I'], subset['I_hat']))
    for subset in (
        et_midday_clear,
        et_midday_partly_cloudy,
        et_midday_cloudy,
        et_non_midday_clear,
        et_non_midday_partly_cloudy,
        et_non_midday_cloudy,
    )
)

print()
for label, value in (
    ('RMSE midday clear = ', rmse_et_midday_clear),
    ('RMSE midday partly cloudy = ', rmse_et_midday_partly_cloudy),
    ('RMSE midday cloudy = ', rmse_et_midday_cloudy),
    ('RMSE non-midday clear = ', rmse_et_non_midday_clear),
    ('RMSE non-midday partly cloudy = ', rmse_et_non_midday_partly_cloudy),
    ('RMSE non-midday cloudy = ', rmse_et_non_midday_cloudy),
):
    print(label, value)

# Whole test set.
rmse_et = sqrt(mean_squared_error(et['I'], et['I_hat']))
mae_et = mean_absolute_error(et['I'], et['I_hat'])
print()
print('Overall RMSE, MAE')
for label, value in (('RMSE = ', rmse_et), ('MAE = ', mae_et)):
    print(label, value)

# Split by hour cluster only.
rmse_et_midday = sqrt(mean_squared_error(et.loc[et_is_midday, 'I'], et.loc[et_is_midday, 'I_hat']))
mae_et_midday = mean_absolute_error(et.loc[et_is_midday, 'I'], et.loc[et_is_midday, 'I_hat'])
rmse_et_non_midday = sqrt(mean_squared_error(et.loc[et_is_non_midday, 'I'], et.loc[et_is_non_midday, 'I_hat']))
mae_et_non_midday = mean_absolute_error(et.loc[et_is_non_midday, 'I'], et.loc[et_is_non_midday, 'I_hat'])
print()
print('Overall RMSE, MAE by hour_clusters')
for label, value in (
    ('RMSE midday = ', rmse_et_midday),
    ('MAE midday = ', mae_et_midday),
    ('RMSE non-midday = ', rmse_et_non_midday),
    ('MAE non-midday = ', mae_et_non_midday),
):
    print(label, value)

# Split by sky condition only.
rmse_et_clear = sqrt(mean_squared_error(et.loc[et_is_clear, 'I'], et.loc[et_is_clear, 'I_hat']))
mae_et_clear = mean_absolute_error(et.loc[et_is_clear, 'I'], et.loc[et_is_clear, 'I_hat'])
rmse_et_partly_cloudy = sqrt(mean_squared_error(et.loc[et_is_partly_cloudy, 'I'], et.loc[et_is_partly_cloudy, 'I_hat']))
mae_et_partly_cloudy = mean_absolute_error(et.loc[et_is_partly_cloudy, 'I'], et.loc[et_is_partly_cloudy, 'I_hat'])
rmse_et_cloudy = sqrt(mean_squared_error(et.loc[et_is_cloudy, 'I'], et.loc[et_is_cloudy, 'I_hat']))
mae_et_cloudy = mean_absolute_error(et.loc[et_is_cloudy, 'I'], et.loc[et_is_cloudy, 'I_hat'])
print()
print('Overall RMSE, MAE by sky_condition')
for label, value in (
    ('RMSE clear = ', rmse_et_clear),
    ('MAE clear = ', mae_et_clear),
    ('RMSE partly cloudy = ', rmse_et_partly_cloudy),
    ('MAE partly cloudy = ', mae_et_partly_cloudy),
    ('RMSE cloudy = ', rmse_et_cloudy),
    ('MAE cloudy = ', mae_et_cloudy),
):
    print(label, value)



RMSE midday clear =  142.25515294793243
RMSE midday partly cloudy =  188.78495147254395
RMSE midday cloudy =  226.36316277723154
RMSE non-midday clear =  78.89420615092045
RMSE non-midday partly cloudy =  96.8551402828126
RMSE non-midday cloudy =  125.359426171013

Overall RMSE, MAE
RMSE =  134.17406655482537
MAE =  90.14430315944504

Overall RMSE, MAE by hour_clusters
RMSE midday =  159.81484116390953
MAE midday =  112.49392621181937
RMSE non-midday =  86.05814934016301
MAE non-midday =  58.726361097296895

Overall RMSE, MAE by sky_condition
RMSE clear =  120.04872898384207
MAE clear =  79.30483388401967
RMSE partly cloudy =  157.24204677045432
MAE partly cloudy =  112.51805199840759
RMSE cloudy =  191.02152653298117
MAE cloudy =  126.51432387790007


## Error metrics of the LightGBM model

In [4]:
# Load the LightGBM estimates for the test set, skipping the columns that
# this evaluation does not use.
lightgbm = pd.read_csv(
    'exp_results/Ihat_lightgbm_v2_test_set.csv',
    parse_dates=['Datetime'],
    usecols=lambda col: col not in ['condition', 'hour_encode1', 'day', 'month'],
)

# k: measured irradiance I divided by the clear-sky irradiance Iclr.
lightgbm['k'] = lightgbm['I'] / lightgbm['Iclr']
# CI and R are divided by 255 (presumably 8-bit values rescaled to [0, 1] - TODO confirm).
lightgbm['CI'] = lightgbm['CI'] / 255
lightgbm['R'] = lightgbm['R'] / 255

# k_bar: mean of k over each (Date, site_name) group, broadcast back to every
# row of that group. transform('mean') returns a Series aligned with the
# frame's own index, so no merge or index realignment is needed.
lightgbm['k_bar'] = lightgbm.groupby(['Date', 'site_name'])['k'].transform('mean')

# Threshold labelling is done with vectorized masks: start from the fallback
# label and overwrite it, applying the tightest threshold last so it wins.
# Rows whose value is NaN fail every comparison and keep the fallback label.

# Daily sky condition from k_bar: < 0.3 cloudy, < 0.6 partly cloudy, else clear.
lightgbm['sky_condition'] = (
    pd.Series('clear', index=lightgbm.index)
    .mask(lightgbm['k_bar'] < 0.6, 'partly_cloudy')
    .mask(lightgbm['k_bar'] < 0.3, 'cloudy')
)

# Per-sample cluster from R: < 0.3 low, < 0.6 medium, else high.
lightgbm['R_clusters'] = (
    pd.Series('High cloud', index=lightgbm.index)
    .mask(lightgbm['R'] < 0.6, 'Medium cloud')
    .mask(lightgbm['R'] < 0.3, 'Low cloud')
)

# Midday covers hours 10 through 15 inclusive; every other hour is non-midday.
lightgbm['hour_clusters'] = (
    pd.Series('non-midday', index=lightgbm.index)
    .mask(lightgbm['hour'].between(10, 15), 'midday')
)
lightgbm

Unnamed: 0,Datetime,Iclr,CI,R,latt,long,I_hat,site_name,I,Date,hour,k,k_bar,sky_condition,R_clusters,hour_clusters
0,2022-04-05 07:00:00+07:00,141.241660,0.090196,0.203922,14.00523,100.519403,135.068489,ISL001,183.5975,2022-04-05,7,1.299882,0.695748,clear,Low cloud,non-midday
1,2022-04-05 07:15:00+07:00,209.518060,0.090196,0.203922,14.00523,100.519403,176.325925,ISL001,240.5533,2022-04-05,7,1.148127,0.695748,clear,Low cloud,non-midday
2,2022-04-05 07:30:00+07:00,278.611744,0.141176,0.298039,14.00523,100.519403,220.197368,ISL001,164.2464,2022-04-05,7,0.589517,0.695748,clear,Low cloud,non-midday
3,2022-04-05 07:45:00+07:00,347.292820,0.141176,0.298039,14.00523,100.519403,284.082731,ISL001,176.0619,2022-04-05,7,0.506955,0.695748,clear,Low cloud,non-midday
4,2022-04-05 08:00:00+07:00,414.830234,0.172549,0.576471,14.00523,100.519403,241.438227,ISL001,261.4042,2022-04-05,8,0.630147,0.695748,clear,Medium cloud,non-midday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347118,2023-06-22 16:00:00+07:00,660.558000,0.023529,0.282353,13.50130,100.135400,477.429517,ISL056,473.4280,2023-06-22,16,0.716709,0.742820,clear,Low cloud,non-midday
347119,2023-06-22 16:15:00+07:00,603.006077,0.023529,0.282353,13.50130,100.135400,439.760874,ISL056,418.3324,2023-06-22,16,0.693745,0.742820,clear,Low cloud,non-midday
347120,2023-06-22 16:30:00+07:00,543.148095,0.023529,0.235294,13.50130,100.135400,404.229601,ISL056,367.2727,2023-06-22,16,0.676193,0.742820,clear,Low cloud,non-midday
347121,2023-06-22 16:45:00+07:00,481.265865,0.023529,0.235294,13.50130,100.135400,333.382894,ISL056,311.9268,2023-06-22,16,0.648138,0.742820,clear,Low cloud,non-midday


In [5]:
# Error of the LightGBM estimates (I_hat) against the measurements (I):
# RMSE for each of the 3 sky conditions x 2 hour clusters, then RMSE and MAE
# overall, per hour cluster, and per sky condition.

from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Boolean row selectors, built once and reused by every slice below.
lightgbm_is_midday = lightgbm['hour_clusters'] == 'midday'
lightgbm_is_non_midday = lightgbm['hour_clusters'] == 'non-midday'
lightgbm_is_clear = lightgbm['sky_condition'] == 'clear'
lightgbm_is_partly_cloudy = lightgbm['sky_condition'] == 'partly_cloudy'
lightgbm_is_cloudy = lightgbm['sky_condition'] == 'cloudy'

# One sub-frame per (sky condition, hour cluster) combination.
lightgbm_midday_clear = lightgbm[lightgbm_is_clear & lightgbm_is_midday]
lightgbm_midday_partly_cloudy = lightgbm[lightgbm_is_partly_cloudy & lightgbm_is_midday]
lightgbm_midday_cloudy = lightgbm[lightgbm_is_cloudy & lightgbm_is_midday]

lightgbm_non_midday_clear = lightgbm[lightgbm_is_clear & lightgbm_is_non_midday]
lightgbm_non_midday_partly_cloudy = lightgbm[lightgbm_is_partly_cloudy & lightgbm_is_non_midday]
lightgbm_non_midday_cloudy = lightgbm[lightgbm_is_cloudy & lightgbm_is_non_midday]

# RMSE of each sub-frame, evaluated in the same order as the names on the left.
(
    rmse_lightgbm_midday_clear,
    rmse_lightgbm_midday_partly_cloudy,
    rmse_lightgbm_midday_cloudy,
    rmse_lightgbm_non_midday_clear,
    rmse_lightgbm_non_midday_partly_cloudy,
    rmse_lightgbm_non_midday_cloudy,
) = (
    sqrt(mean_squared_error(subset['I'], subset['I_hat']))
    for subset in (
        lightgbm_midday_clear,
        lightgbm_midday_partly_cloudy,
        lightgbm_midday_cloudy,
        lightgbm_non_midday_clear,
        lightgbm_non_midday_partly_cloudy,
        lightgbm_non_midday_cloudy,
    )
)

for label, value in (
    ('RMSE midday clear = ', rmse_lightgbm_midday_clear),
    ('RMSE midday partly cloudy = ', rmse_lightgbm_midday_partly_cloudy),
    ('RMSE midday cloudy = ', rmse_lightgbm_midday_cloudy),
    ('RMSE non-midday clear = ', rmse_lightgbm_non_midday_clear),
    ('RMSE non-midday partly cloudy = ', rmse_lightgbm_non_midday_partly_cloudy),
    ('RMSE non-midday cloudy = ', rmse_lightgbm_non_midday_cloudy),
):
    print(label, value)

# Whole test set.
rmse_lightgbm = sqrt(mean_squared_error(lightgbm['I'], lightgbm['I_hat']))
mae_lightgbm = mean_absolute_error(lightgbm['I'], lightgbm['I_hat'])
print()
print('overall RMSE, MAE')
for label, value in (('RMSE = ', rmse_lightgbm), ('MAE = ', mae_lightgbm)):
    print(label, value)

# Split by hour cluster only.
rmse_lightgbm_midday = sqrt(mean_squared_error(lightgbm.loc[lightgbm_is_midday, 'I'], lightgbm.loc[lightgbm_is_midday, 'I_hat']))
mae_lightgbm_midday = mean_absolute_error(lightgbm.loc[lightgbm_is_midday, 'I'], lightgbm.loc[lightgbm_is_midday, 'I_hat'])
rmse_lightgbm_non_midday = sqrt(mean_squared_error(lightgbm.loc[lightgbm_is_non_midday, 'I'], lightgbm.loc[lightgbm_is_non_midday, 'I_hat']))
mae_lightgbm_non_midday = mean_absolute_error(lightgbm.loc[lightgbm_is_non_midday, 'I'], lightgbm.loc[lightgbm_is_non_midday, 'I_hat'])
print()
print('overall RMSE, MAE by hour_clusters')
for label, value in (
    ('RMSE midday = ', rmse_lightgbm_midday),
    ('MAE midday = ', mae_lightgbm_midday),
    ('RMSE non-midday = ', rmse_lightgbm_non_midday),
    ('MAE non-midday = ', mae_lightgbm_non_midday),
):
    print(label, value)

# Split by sky condition only.
rmse_lightgbm_clear = sqrt(mean_squared_error(lightgbm.loc[lightgbm_is_clear, 'I'], lightgbm.loc[lightgbm_is_clear, 'I_hat']))
mae_lightgbm_clear = mean_absolute_error(lightgbm.loc[lightgbm_is_clear, 'I'], lightgbm.loc[lightgbm_is_clear, 'I_hat'])
rmse_lightgbm_partly_cloudy = sqrt(mean_squared_error(lightgbm.loc[lightgbm_is_partly_cloudy, 'I'], lightgbm.loc[lightgbm_is_partly_cloudy, 'I_hat']))
mae_lightgbm_partly_cloudy = mean_absolute_error(lightgbm.loc[lightgbm_is_partly_cloudy, 'I'], lightgbm.loc[lightgbm_is_partly_cloudy, 'I_hat'])
rmse_lightgbm_cloudy = sqrt(mean_squared_error(lightgbm.loc[lightgbm_is_cloudy, 'I'], lightgbm.loc[lightgbm_is_cloudy, 'I_hat']))
mae_lightgbm_cloudy = mean_absolute_error(lightgbm.loc[lightgbm_is_cloudy, 'I'], lightgbm.loc[lightgbm_is_cloudy, 'I_hat'])
print()
print('overall RMSE, MAE by sky_condition')
for label, value in (
    ('RMSE clear = ', rmse_lightgbm_clear),
    ('MAE clear = ', mae_lightgbm_clear),
    ('RMSE partly cloudy = ', rmse_lightgbm_partly_cloudy),
    ('MAE partly cloudy = ', mae_lightgbm_partly_cloudy),
    ('RMSE cloudy = ', rmse_lightgbm_cloudy),
    ('MAE cloudy = ', mae_lightgbm_cloudy),
):
    print(label, value)


RMSE midday clear =  139.78386667620913
RMSE midday partly cloudy =  189.0837448081324
RMSE midday cloudy =  203.82830784376728
RMSE non-midday clear =  78.98069456466712
RMSE non-midday partly cloudy =  96.22774894422673
RMSE non-midday cloudy =  118.41907550917364

overall RMSE, MAE
RMSE =  132.43146811693327
MAE =  89.88685176924565

overall RMSE, MAE by hour_clusters
RMSE midday =  157.47993395753204
MAE midday =  111.67318148227707
RMSE non-midday =  85.6141411952085
MAE non-midday =  59.26075829702664

overall RMSE, MAE by sky_condition
RMSE clear =  118.36432781788507
MAE clear =  78.84408417337099
RMSE partly cloudy =  157.29171625318327
MAE partly cloudy =  114.03571536444315
RMSE cloudy =  173.54460632362475
MAE cloudy =  114.28822574008205


## Error metrics of the Random Forest model

In [6]:
# Load the Random Forest estimates for the test set, skipping the columns that
# this evaluation does not use.
rf = pd.read_csv(
    'exp_results/Ihat_rf_test_set.csv',
    parse_dates=['Datetime'],
    usecols=lambda col: col not in ['condition', 'hour_encode1', 'day', 'month'],
)

# k: measured irradiance I divided by the clear-sky irradiance Iclr.
rf['k'] = rf['I'] / rf['Iclr']
# CI and R are divided by 255 (presumably 8-bit values rescaled to [0, 1] - TODO confirm).
rf['CI'] = rf['CI'] / 255
rf['R'] = rf['R'] / 255

# k_bar: mean of k over each (Date, site_name) group, broadcast back to every
# row of that group. transform('mean') returns a Series aligned with rf's own
# index, so no merge or index realignment is needed.
rf['k_bar'] = rf.groupby(['Date', 'site_name'])['k'].transform('mean')

# Threshold labelling is done with vectorized masks: start from the fallback
# label and overwrite it, applying the tightest threshold last so it wins.
# Rows whose value is NaN fail every comparison and keep the fallback label.

# Daily sky condition from k_bar: < 0.3 cloudy, < 0.6 partly cloudy, else clear.
rf['sky_condition'] = (
    pd.Series('clear', index=rf.index)
    .mask(rf['k_bar'] < 0.6, 'partly_cloudy')
    .mask(rf['k_bar'] < 0.3, 'cloudy')
)

# Per-sample cluster from R: < 0.3 low, < 0.6 medium, else high.
rf['R_clusters'] = (
    pd.Series('High cloud', index=rf.index)
    .mask(rf['R'] < 0.6, 'Medium cloud')
    .mask(rf['R'] < 0.3, 'Low cloud')
)

# Midday covers hours 10 through 15 inclusive; every other hour is non-midday.
rf['hour_clusters'] = (
    pd.Series('non-midday', index=rf.index)
    .mask(rf['hour'].between(10, 15), 'midday')
)
rf

Unnamed: 0,Datetime,Iclr,CI,R,latt,long,I_hat,site_name,I,Date,hour,k,k_bar,sky_condition,R_clusters,hour_clusters
0,2022-04-05 07:00:00+07:00,141.241660,0.090196,0.203922,14.00523,100.519403,131.874370,ISL001,183.5975,2022-04-05,7,1.299882,0.695748,clear,Low cloud,non-midday
1,2022-04-05 07:15:00+07:00,209.518060,0.090196,0.203922,14.00523,100.519403,171.974845,ISL001,240.5533,2022-04-05,7,1.148127,0.695748,clear,Low cloud,non-midday
2,2022-04-05 07:30:00+07:00,278.611744,0.141176,0.298039,14.00523,100.519403,211.763328,ISL001,164.2464,2022-04-05,7,0.589517,0.695748,clear,Low cloud,non-midday
3,2022-04-05 07:45:00+07:00,347.292820,0.141176,0.298039,14.00523,100.519403,271.764854,ISL001,176.0619,2022-04-05,7,0.506955,0.695748,clear,Low cloud,non-midday
4,2022-04-05 08:00:00+07:00,414.830234,0.172549,0.576471,14.00523,100.519403,217.313182,ISL001,261.4042,2022-04-05,8,0.630147,0.695748,clear,Medium cloud,non-midday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347118,2023-06-22 16:00:00+07:00,660.558000,0.023529,0.282353,13.50130,100.135400,453.160119,ISL056,473.4280,2023-06-22,16,0.716709,0.742820,clear,Low cloud,non-midday
347119,2023-06-22 16:15:00+07:00,603.006077,0.023529,0.282353,13.50130,100.135400,422.846866,ISL056,418.3324,2023-06-22,16,0.693745,0.742820,clear,Low cloud,non-midday
347120,2023-06-22 16:30:00+07:00,543.148095,0.023529,0.235294,13.50130,100.135400,406.170295,ISL056,367.2727,2023-06-22,16,0.676193,0.742820,clear,Low cloud,non-midday
347121,2023-06-22 16:45:00+07:00,481.265865,0.023529,0.235294,13.50130,100.135400,347.076506,ISL056,311.9268,2023-06-22,16,0.648138,0.742820,clear,Low cloud,non-midday


In [7]:
# Error of the Random Forest estimates (I_hat) against the measurements (I):
# RMSE for each of the 3 sky conditions x 2 hour clusters, then RMSE and MAE
# overall, per hour cluster, and per sky condition.

from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Boolean row selectors, built once and reused by every slice below.
rf_is_midday = rf['hour_clusters'] == 'midday'
rf_is_non_midday = rf['hour_clusters'] == 'non-midday'
rf_is_clear = rf['sky_condition'] == 'clear'
rf_is_partly_cloudy = rf['sky_condition'] == 'partly_cloudy'
rf_is_cloudy = rf['sky_condition'] == 'cloudy'

# One sub-frame per (sky condition, hour cluster) combination.
rf_midday_clear = rf[rf_is_clear & rf_is_midday]
rf_midday_partly_cloudy = rf[rf_is_partly_cloudy & rf_is_midday]
rf_midday_cloudy = rf[rf_is_cloudy & rf_is_midday]

rf_non_midday_clear = rf[rf_is_clear & rf_is_non_midday]
rf_non_midday_partly_cloudy = rf[rf_is_partly_cloudy & rf_is_non_midday]
rf_non_midday_cloudy = rf[rf_is_cloudy & rf_is_non_midday]

# RMSE of each sub-frame, evaluated in the same order as the names on the left.
(
    rmse_rf_midday_clear,
    rmse_rf_midday_partly_cloudy,
    rmse_rf_midday_cloudy,
    rmse_rf_non_midday_clear,
    rmse_rf_non_midday_partly_cloudy,
    rmse_rf_non_midday_cloudy,
) = (
    sqrt(mean_squared_error(subset['I'], subset['I_hat']))
    for subset in (
        rf_midday_clear,
        rf_midday_partly_cloudy,
        rf_midday_cloudy,
        rf_non_midday_clear,
        rf_non_midday_partly_cloudy,
        rf_non_midday_cloudy,
    )
)

for label, value in (
    ('RMSE midday clear = ', rmse_rf_midday_clear),
    ('RMSE midday partly cloudy = ', rmse_rf_midday_partly_cloudy),
    ('RMSE midday cloudy = ', rmse_rf_midday_cloudy),
    ('RMSE non-midday clear = ', rmse_rf_non_midday_clear),
    ('RMSE non-midday partly cloudy = ', rmse_rf_non_midday_partly_cloudy),
    ('RMSE non-midday cloudy = ', rmse_rf_non_midday_cloudy),
):
    print(label, value)

# Whole test set. The section header is printed exactly once, after a blank
# separator line, so the output has the same layout as the other sections.
rmse_rf = sqrt(mean_squared_error(rf['I'], rf['I_hat']))
mae_rf = mean_absolute_error(rf['I'], rf['I_hat'])
print()
print('overall RMSE, MAE')
for label, value in (('RMSE = ', rmse_rf), ('MAE = ', mae_rf)):
    print(label, value)

# Split by hour cluster only.
rmse_rf_midday = sqrt(mean_squared_error(rf.loc[rf_is_midday, 'I'], rf.loc[rf_is_midday, 'I_hat']))
mae_rf_midday = mean_absolute_error(rf.loc[rf_is_midday, 'I'], rf.loc[rf_is_midday, 'I_hat'])
rmse_rf_non_midday = sqrt(mean_squared_error(rf.loc[rf_is_non_midday, 'I'], rf.loc[rf_is_non_midday, 'I_hat']))
mae_rf_non_midday = mean_absolute_error(rf.loc[rf_is_non_midday, 'I'], rf.loc[rf_is_non_midday, 'I_hat'])
print()
print('overall RMSE, MAE by hour_clusters')
for label, value in (
    ('RMSE midday = ', rmse_rf_midday),
    ('MAE midday = ', mae_rf_midday),
    ('RMSE non-midday = ', rmse_rf_non_midday),
    ('MAE non-midday = ', mae_rf_non_midday),
):
    print(label, value)

# Split by sky condition only.
rmse_rf_clear = sqrt(mean_squared_error(rf.loc[rf_is_clear, 'I'], rf.loc[rf_is_clear, 'I_hat']))
mae_rf_clear = mean_absolute_error(rf.loc[rf_is_clear, 'I'], rf.loc[rf_is_clear, 'I_hat'])
rmse_rf_partly_cloudy = sqrt(mean_squared_error(rf.loc[rf_is_partly_cloudy, 'I'], rf.loc[rf_is_partly_cloudy, 'I_hat']))
mae_rf_partly_cloudy = mean_absolute_error(rf.loc[rf_is_partly_cloudy, 'I'], rf.loc[rf_is_partly_cloudy, 'I_hat'])
rmse_rf_cloudy = sqrt(mean_squared_error(rf.loc[rf_is_cloudy, 'I'], rf.loc[rf_is_cloudy, 'I_hat']))
mae_rf_cloudy = mean_absolute_error(rf.loc[rf_is_cloudy, 'I'], rf.loc[rf_is_cloudy, 'I_hat'])
print()
print('overall RMSE, MAE by sky_condition')
for label, value in (
    ('RMSE clear = ', rmse_rf_clear),
    ('MAE clear = ', mae_rf_clear),
    ('RMSE partly cloudy = ', rmse_rf_partly_cloudy),
    ('MAE partly cloudy = ', mae_rf_partly_cloudy),
    ('RMSE cloudy = ', rmse_rf_cloudy),
    ('MAE cloudy = ', mae_rf_cloudy),
):
    print(label, value)


RMSE midday clear =  140.5930403542265
RMSE midday partly cloudy =  187.44061885235726
RMSE midday cloudy =  218.19510338454614
RMSE non-midday clear =  78.0701608126315
RMSE non-midday partly cloudy =  95.78043296295964
RMSE non-midday cloudy =  123.57399818590075
overall RMSE, MAE

overall RMSE, MAE
RMSE =  132.6615850307227
MAE =  89.44768513236735

overall RMSE, MAE by hour_clusters
RMSE midday =  158.0017769460379
MAE midday =  111.64470507061544
RMSE non-midday =  85.11819166940485
MAE non-midday =  58.244264621069874

overall RMSE, MAE by sky_condition
RMSE clear =  118.67279651486217
MAE clear =  78.52802998623916
RMSE partly cloudy =  156.02392412256205
MAE partly cloudy =  112.23676464310012
RMSE cloudy =  184.88174444142868
MAE cloudy =  123.75519101100062


## Summary tables of model performance

In [8]:
# Overall RMSE and MAE on the full test set, one row per model.
overall = pd.DataFrame(
    [
        ('ExtraTree', rmse_et, mae_et),
        ('LightGBM', rmse_lightgbm, mae_lightgbm),
        ('RandomForest', rmse_rf, mae_rf),
    ],
    columns=['Model', 'RMSE', 'MAE'],
)
overall

Unnamed: 0,Model,RMSE,MAE
0,ExtraTree,134.174067,90.144303
1,LightGBM,132.431468,89.886852
2,RandomForest,132.661585,89.447685


In [9]:
# RMSE split by hour cluster (midday vs. non-midday), one row per model.
hour_clusters = pd.DataFrame(
    [
        ('ExtraTree', rmse_et_midday, rmse_et_non_midday),
        ('LightGBM', rmse_lightgbm_midday, rmse_lightgbm_non_midday),
        ('RandomForest', rmse_rf_midday, rmse_rf_non_midday),
    ],
    columns=['Model', 'RMSE midday', 'RMSE non-midday'],
)
hour_clusters

Unnamed: 0,Model,RMSE midday,RMSE non-midday
0,ExtraTree,159.814841,86.058149
1,LightGBM,157.479934,85.614141
2,RandomForest,158.001777,85.118192


In [10]:
# MAE split by hour cluster (midday vs. non-midday), one row per model.
# NOTE: this rebinds `hour_clusters`, replacing the RMSE table of the same name.
hour_clusters = pd.DataFrame(
    [
        ('ExtraTree', mae_et_midday, mae_et_non_midday),
        ('LightGBM', mae_lightgbm_midday, mae_lightgbm_non_midday),
        ('RandomForest', mae_rf_midday, mae_rf_non_midday),
    ],
    columns=['Model', 'MAE midday', 'MAE non-midday'],
)
hour_clusters

Unnamed: 0,Model,MAE midday,MAE non-midday
0,ExtraTree,112.493926,58.726361
1,LightGBM,111.673181,59.260758
2,RandomForest,111.644705,58.244265


In [11]:
# RMSE split by sky condition (clear / partly cloudy / cloudy), one row per model.
sky_condition = pd.DataFrame(
    [
        ('ExtraTree', rmse_et_clear, rmse_et_partly_cloudy, rmse_et_cloudy),
        ('LightGBM', rmse_lightgbm_clear, rmse_lightgbm_partly_cloudy, rmse_lightgbm_cloudy),
        ('RandomForest', rmse_rf_clear, rmse_rf_partly_cloudy, rmse_rf_cloudy),
    ],
    columns=['Model', 'RMSE clear', 'RMSE partly cloudy', 'RMSE cloudy'],
)
sky_condition

Unnamed: 0,Model,RMSE clear,RMSE partly cloudy,RMSE cloudy
0,ExtraTree,120.048729,157.242047,191.021527
1,LightGBM,118.364328,157.291716,173.544606
2,RandomForest,118.672797,156.023924,184.881744


In [12]:
# MAE split by sky condition (clear / partly cloudy / cloudy), one row per model.
# NOTE: this rebinds `sky_condition`, replacing the RMSE table of the same name.
sky_condition = pd.DataFrame(
    [
        ('ExtraTree', mae_et_clear, mae_et_partly_cloudy, mae_et_cloudy),
        ('LightGBM', mae_lightgbm_clear, mae_lightgbm_partly_cloudy, mae_lightgbm_cloudy),
        ('RandomForest', mae_rf_clear, mae_rf_partly_cloudy, mae_rf_cloudy),
    ],
    columns=['Model', 'MAE clear', 'MAE partly cloudy', 'MAE cloudy'],
)
sky_condition

Unnamed: 0,Model,MAE clear,MAE partly cloudy,MAE cloudy
0,ExtraTree,79.304834,112.518052,126.514324
1,LightGBM,78.844084,114.035715,114.288226
2,RandomForest,78.52803,112.236765,123.755191


In [13]:
# RMSE for every (hour cluster, sky condition) combination, one row per model.
# The non-midday metrics are reported under the "morning&evening" column labels.
sky_condition_hour_clusters = pd.DataFrame(
    [
        (
            'ExtraTree',
            rmse_et_midday_clear,
            rmse_et_midday_partly_cloudy,
            rmse_et_midday_cloudy,
            rmse_et_non_midday_clear,
            rmse_et_non_midday_partly_cloudy,
            rmse_et_non_midday_cloudy,
        ),
        (
            'LightGBM',
            rmse_lightgbm_midday_clear,
            rmse_lightgbm_midday_partly_cloudy,
            rmse_lightgbm_midday_cloudy,
            rmse_lightgbm_non_midday_clear,
            rmse_lightgbm_non_midday_partly_cloudy,
            rmse_lightgbm_non_midday_cloudy,
        ),
        (
            'RandomForest',
            rmse_rf_midday_clear,
            rmse_rf_midday_partly_cloudy,
            rmse_rf_midday_cloudy,
            rmse_rf_non_midday_clear,
            rmse_rf_non_midday_partly_cloudy,
            rmse_rf_non_midday_cloudy,
        ),
    ],
    columns=[
        'Model',
        'RMSE midday clear',
        'RMSE midday partly cloudy',
        'RMSE midday cloudy',
        'RMSE morning&evening clear',
        'RMSE morning&evening partly cloudy',
        'RMSE morning&evening cloudy',
    ],
)
sky_condition_hour_clusters

Unnamed: 0,Model,RMSE midday clear,RMSE midday partly cloudy,RMSE midday cloudy,RMSE morning&evening clear,RMSE morning&evening partly cloudy,RMSE morning&evening cloudy
0,ExtraTree,142.255153,188.784951,226.363163,78.894206,96.85514,125.359426
1,LightGBM,139.783867,189.083745,203.828308,78.980695,96.227749,118.419076
2,RandomForest,140.59304,187.440619,218.195103,78.070161,95.780433,123.573998
