In [33]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### Data detail

$ I_{vsc} $ (1 hour): vc_weather_data.csv
March 9, 2023 - February 29, 2024

$ I_{ncep} $ (15 minutes) siteCU03_xxx.csv
January 1, 2023 - January 1, 2024

$ I_{actual} $ (1 minute) EE_station 1_xxx.csv
January 1, 2023 - December 31, 2023

$ PV_{actual} $ (5 minutes, 15 minutes)  pv_data_5min_xxx.csv
March 9, 2023 - February 29, 2024



In [34]:
# Verify VC and Actual

# Load VSC irradiance data
df_vc_weather = pd.read_csv('vc_weather_data.csv', usecols=['datetime', 'solarradiation', 'solarenergy', 'uvindex'],
                         parse_dates=['datetime'])
df_vc_weather['datetime'] = pd.to_datetime(df_vc_weather['datetime'], format='%Y-%m-%d %H:%M:%S')

# Filter irradiance_df based on the time range between 7 am and 5 pm
start_time = pd.to_datetime('07:00:00').time()
end_time = pd.to_datetime('17:00:00').time()

df_vc_weather = df_vc_weather[(df_vc_weather['datetime'].dt.time >= start_time) & (df_vc_weather['datetime'].dt.time <= end_time)]
df_vc_weather 

Unnamed: 0,datetime,solarradiation,solarenergy,uvindex
7,2023-01-01 07:00:00,11,0.0,0
8,2023-01-01 08:00:00,59,0.2,1
9,2023-01-01 09:00:00,175,0.6,2
10,2023-01-01 10:00:00,244,0.9,2
11,2023-01-01 11:00:00,317,1.1,3
...,...,...,...,...
8749,2023-12-31 13:00:00,505,1.8,5
8750,2023-12-31 14:00:00,305,1.1,3
8751,2023-12-31 15:00:00,208,0.7,2
8752,2023-12-31 16:00:00,68,0.2,1


In [35]:
# Load actual irradiance data
irradiance_df = pd.read_csv('EE Station 1-20230101-20231231.csv', usecols=['Datetime', 'Irradiance_30 (W/m2)'], parse_dates=['Datetime'])
irradiance_df = irradiance_df.rename(columns={'Datetime': 'datetime', 'Irradiance_30 (W/m2)': 'irradiance'})
irradiance_df = irradiance_df[(irradiance_df['datetime'].dt.time >= start_time) & (irradiance_df['datetime'].dt.time <= end_time)]
irradiance_df

Unnamed: 0,datetime,irradiance
420,2023-01-01 07:00:00,33.0
421,2023-01-01 07:01:00,33.0
422,2023-01-01 07:02:00,31.0
423,2023-01-01 07:03:00,35.0
424,2023-01-01 07:04:00,38.0
...,...,...
524907,2023-12-31 16:56:00,84.0
524908,2023-12-31 16:57:00,82.0
524909,2023-12-31 16:58:00,80.0
524910,2023-12-31 16:59:00,86.0


In [36]:
# merge the two dataframes
df = pd.merge(df_vc_weather, irradiance_df, on='datetime', how='left')
df = df[['datetime', 'solarradiation', 'irradiance']]
df = df.rename(columns={'solarradiation': 'VC irradiance', 'irradiance': 'Actual irradiance'})
df = df.dropna()
df

Unnamed: 0,datetime,VC irradiance,Actual irradiance
0,2023-01-01 07:00:00,11,33.0
1,2023-01-01 08:00:00,59,219.0
2,2023-01-01 09:00:00,175,441.0
3,2023-01-01 10:00:00,244,603.0
4,2023-01-01 11:00:00,317,735.0
...,...,...,...
4010,2023-12-31 13:00:00,505,816.0
4011,2023-12-31 14:00:00,305,552.0
4012,2023-12-31 15:00:00,208,538.0
4013,2023-12-31 16:00:00,68,337.0


In [37]:
df = df.rename(columns={'irradiance': 'Actual irradiance'})

fig = px.scatter(df, 
                 x='Actual irradiance', 
                 y='VC irradiance', 
                 title='Scatter Plot of <b> Visual Crossing </b> vs <b> Actual </b> irradiance')
# fig size square
fig.update_traces(marker=dict(size=5))
fig.update_layout(font=dict(size=24),width=1200, height=1200)
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(
        linecolor='black',
        showgrid=True,
        gridcolor='lightgrey'
    ),
    yaxis=dict(
        linecolor='black',
        showgrid=True,
        gridcolor='lightgrey'
    )
)

In [38]:
# plot R-squared value
# plot linear regression line
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

r2 = r2_score(df['VC irradiance'], df['Actual irradiance'])
r2

fig.add_annotation(
    x=0.5,
    y=0.9,
    xref='paper',
    yref='paper',
    text=f'R-squared: {r2:.2f}',
    showarrow=False,
    font=dict(size=30),
    xshift=300,
    yshift=100
)


X = df['Actual irradiance'].values.reshape(-1, 1)
y = df['VC irradiance'].values

model = LinearRegression()
model.fit(X, y)

x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

fig.add_trace(go.Scatter(x=x_range, y=y_range, mode='lines', 
                         name='Regression line'))

fig.update_layout(font=dict(size=18), title_font=dict(size=30))

fig.show()
# fig.write_html('VC_vs_Actual_irradiance.html')
# fig.write_image('Ivsc_Iactual.pdf')

In [39]:
df_evaluated_vsc = df.copy()

# Calculate the difference between the VC and actual irradiance
df_evaluated_vsc['error'] = abs(df_evaluated_vsc['VC irradiance'] - df_evaluated_vsc['Actual irradiance'])
df_evaluated_vsc['hour'] = df_evaluated_vsc['datetime'].dt.hour
df_evaluated_vsc = df_evaluated_vsc.groupby('hour').mean().reset_index()
df_evaluated_vsc = df_evaluated_vsc.round(2)
df_evaluated_vsc


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0,hour,VC irradiance,Actual irradiance,error
0,7,36.02,78.18,42.61
1,8,130.05,232.57,110.94
2,9,286.95,405.58,163.99
3,10,456.19,562.6,192.29
4,11,543.76,624.69,189.76
5,12,531.06,676.05,228.91
6,13,495.42,618.81,242.36
7,14,458.17,550.39,216.6
8,15,345.08,436.16,183.5
9,16,234.56,288.55,134.93


# Incep

In [40]:
def read_ncep_data(filename):
    _df = pd.read_csv(filename,sep=";", skiprows=26, names=col_names)
    _df['Time'] = _df['Time'].replace("24:00", "00:00")

    # Concatenate Date and Time columns
    _df['Datetime'] = _df['Date'] + " " + _df['Time']

    # Convert to datetime type
    _df['Datetime'] = pd.to_datetime(_df['Datetime'])

    # Identify rows where 'Time' is "00:00" and adjust the 'Date' by adding 1 day

    next_day_mask = _df['Time'] == "00:00"
    _df.loc[next_day_mask, 'Datetime'] += pd.DateOffset(days=1)
    _df['Datetime'] = _df['Datetime'] + pd.DateOffset(hours=7)

    dropped_cols = ['Date', 'Time']
    _df.drop(columns=dropped_cols, inplace=True)
    _df.set_index('Datetime', inplace=True)
    return _df

# read NCEP data
col_names = ['Date', 'Time','Tncep', 'RH', 'Pressure', 'WS', 'WD', 'rain_fall', 'snow_fall', 'snow_depth', 'Incep']
ncep_df = read_ncep_data("siteCU03_2023-01-01to2023-12-31.csv")
ncep_df['Tncep'] = ncep_df['Tncep']-273.5
ncep_df['Incep'] = ncep_df['Incep'] * 4
ncep_df

Unnamed: 0_level_0,Tncep,RH,Pressure,WS,WD,rain_fall,snow_fall,snow_depth,Incep
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-01-01 07:15:00,20.91,59.87,1015.56,1.21,348.48,0.0,0.0,0.0,49.2504
2023-01-01 07:30:00,21.26,58.82,1015.68,1.32,352.34,0.0,0.0,0.0,89.4540
2023-01-01 07:45:00,21.60,57.78,1015.81,1.42,355.63,0.0,0.0,0.0,135.5100
2023-01-01 08:00:00,21.95,56.73,1015.94,1.53,358.45,0.0,0.0,0.0,187.0680
2023-01-01 08:15:00,22.29,55.68,1016.07,1.65,0.89,0.0,0.0,0.0,243.7264
...,...,...,...,...,...,...,...,...,...
2024-01-01 06:00:00,25.09,60.64,1010.77,2.92,332.69,0.0,0.0,0.0,0.0000
2024-01-01 06:15:00,25.04,61.02,1010.85,2.90,333.69,0.0,0.0,0.0,0.0000
2024-01-01 06:30:00,24.98,61.40,1010.93,2.87,334.71,0.0,0.0,0.0,0.0000
2024-01-01 06:45:00,24.92,61.78,1011.01,2.85,335.75,0.0,0.0,0.0,0.0028


In [41]:
ncep_df = ncep_df.reset_index()
ncep_df = ncep_df.rename(columns={'Datetime': 'datetime'})

# Load actual irradiance data
irradiance_df = pd.read_csv('EE Station 1-20230101-20231231.csv', usecols=['Datetime', 'Irradiance_30 (W/m2)'], parse_dates=['Datetime'])
irradiance_df = irradiance_df.rename(columns={'Datetime': 'datetime', 'Irradiance_30 (W/m2)': 'irradiance'})
irradiance_df = irradiance_df[(irradiance_df['datetime'].dt.time >= start_time) & (irradiance_df['datetime'].dt.time <= end_time)]
df = pd.merge(ncep_df, irradiance_df, on='datetime', how='left')

# select rows only when minutes is 00
df = df[df['datetime'].dt.minute == 0]
df = df.dropna()
df

Unnamed: 0,datetime,Tncep,RH,Pressure,WS,WD,rain_fall,snow_fall,snow_depth,Incep,irradiance
3,2023-01-01 08:00:00,21.95,56.73,1015.94,1.53,358.45,0.0,0.0,0.0,187.0680,219.0
7,2023-01-01 09:00:00,23.33,52.54,1016.46,2.00,6.49,0.0,0.0,0.0,408.9188,441.0
11,2023-01-01 10:00:00,24.71,48.35,1016.97,2.50,11.42,0.0,0.0,0.0,587.2296,603.0
15,2023-01-01 11:00:00,25.93,44.99,1016.07,2.41,15.85,0.0,0.0,0.0,733.7412,735.0
19,2023-01-01 12:00:00,27.15,41.63,1015.16,2.34,20.57,0.0,0.0,0.0,819.0644,786.0
...,...,...,...,...,...,...,...,...,...,...,...
34967,2023-12-31 13:00:00,33.16,38.72,1010.64,2.36,352.17,0.0,0.0,0.0,810.1496,816.0
34971,2023-12-31 14:00:00,33.30,37.79,1009.95,2.30,348.55,0.0,0.0,0.0,741.3612,552.0
34975,2023-12-31 15:00:00,33.44,36.85,1009.27,2.25,344.75,0.0,0.0,0.0,604.1816,538.0
34979,2023-12-31 16:00:00,33.57,35.91,1008.59,2.21,340.79,0.0,0.0,0.0,412.1208,337.0


In [42]:
df = df.rename(columns={'irradiance': 'Actual irradiance'})

fig = px.scatter(df, 
                 x='Actual irradiance', 
                 y='Incep', 
                 title='Scatter Plot of <b> Incep </b> vs <b> Actual </b> irradiance')
# fig size square
fig.update_traces(marker=dict(size=5))
fig.update_layout(font=dict(size=24),width=1200, height=1200)
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(
        linecolor='black',
        showgrid=True,
        gridcolor='lightgrey'
    ),
    yaxis=dict(
        linecolor='black',
        showgrid=True,
        gridcolor='lightgrey'
    )
)

In [43]:
df

Unnamed: 0,datetime,Tncep,RH,Pressure,WS,WD,rain_fall,snow_fall,snow_depth,Incep,Actual irradiance
3,2023-01-01 08:00:00,21.95,56.73,1015.94,1.53,358.45,0.0,0.0,0.0,187.0680,219.0
7,2023-01-01 09:00:00,23.33,52.54,1016.46,2.00,6.49,0.0,0.0,0.0,408.9188,441.0
11,2023-01-01 10:00:00,24.71,48.35,1016.97,2.50,11.42,0.0,0.0,0.0,587.2296,603.0
15,2023-01-01 11:00:00,25.93,44.99,1016.07,2.41,15.85,0.0,0.0,0.0,733.7412,735.0
19,2023-01-01 12:00:00,27.15,41.63,1015.16,2.34,20.57,0.0,0.0,0.0,819.0644,786.0
...,...,...,...,...,...,...,...,...,...,...,...
34967,2023-12-31 13:00:00,33.16,38.72,1010.64,2.36,352.17,0.0,0.0,0.0,810.1496,816.0
34971,2023-12-31 14:00:00,33.30,37.79,1009.95,2.30,348.55,0.0,0.0,0.0,741.3612,552.0
34975,2023-12-31 15:00:00,33.44,36.85,1009.27,2.25,344.75,0.0,0.0,0.0,604.1816,538.0
34979,2023-12-31 16:00:00,33.57,35.91,1008.59,2.21,340.79,0.0,0.0,0.0,412.1208,337.0


In [44]:
# plot R-squared value
# plot linear regression line
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

r2 = r2_score(df['Incep'], df['Actual irradiance'])
r2

fig.add_annotation(
    x=0.5,
    y=0.9,
    xref='paper',
    yref='paper',
    text=f'R-squared: {r2:.2f}',
    showarrow=False,
    font=dict(size=30),
    # move the annotation to the top left
    xshift=300,
    yshift=100
)


X = df['Incep'].values.reshape(-1, 1)
y = df['Actual irradiance'].values

model = LinearRegression()
model.fit(X, y)

x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

fig.add_trace(go.Scatter(x=x_range, y=y_range, mode='lines', 
                         name='Regression line'))

fig.update_layout(font=dict(size=18), title_font=dict(size=30))

fig.show()
# fig.write_html('NCEP_vs_Actual_irradiance.html')
# fig.write_image('Incep_Iactual.pdf')

In [45]:
df_evaluated = df.copy()

In [46]:
df_evaluated['error'] = abs(df_evaluated['Incep'] - df_evaluated['Actual irradiance'])
df_evaluated['hour'] = df_evaluated['datetime'].dt.hour
df_evaluated = df_evaluated.groupby('hour').mean().reset_index()
df_evaluated = df_evaluated.round(2)
df_evaluated


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0,hour,Tncep,RH,Pressure,WS,WD,rain_fall,snow_fall,snow_depth,Incep,Actual irradiance,error
0,7,24.78,72.99,1009.79,2.32,214.65,0.02,0.0,0.0,71.17,78.32,25.68
1,8,26.41,67.61,1010.19,2.47,199.2,0.03,0.0,0.0,243.98,232.57,81.37
2,9,28.05,62.22,1010.6,2.71,186.19,0.03,0.0,0.0,436.31,405.58,120.19
3,10,29.68,56.83,1011.02,3.0,180.54,0.03,0.0,0.0,591.18,562.6,159.57
4,11,30.53,53.96,1010.22,3.05,169.5,0.03,0.0,0.0,714.06,624.69,197.27
5,12,31.38,51.04,1009.41,3.16,163.53,0.03,0.0,0.0,771.53,676.05,217.29
6,13,32.24,48.11,1008.59,3.33,157.55,0.03,0.0,0.0,738.71,618.81,229.53
7,14,32.14,48.49,1007.87,3.25,156.26,0.04,0.0,0.0,657.53,550.39,224.47
8,15,32.05,48.89,1007.15,3.28,158.09,0.04,0.0,0.0,527.77,436.16,166.91
9,16,31.95,49.31,1006.42,3.4,160.31,0.05,0.0,0.0,363.59,288.55,133.52


In [47]:
df = pd.merge(df_evaluated_vsc, df_evaluated, on='hour', how='left')
df

Unnamed: 0,hour,VC irradiance,Actual irradiance_x,error_x,Tncep,RH,Pressure,WS,WD,rain_fall,snow_fall,snow_depth,Incep,Actual irradiance_y,error_y
0,7,36.02,78.18,42.61,24.78,72.99,1009.79,2.32,214.65,0.02,0.0,0.0,71.17,78.32,25.68
1,8,130.05,232.57,110.94,26.41,67.61,1010.19,2.47,199.2,0.03,0.0,0.0,243.98,232.57,81.37
2,9,286.95,405.58,163.99,28.05,62.22,1010.6,2.71,186.19,0.03,0.0,0.0,436.31,405.58,120.19
3,10,456.19,562.6,192.29,29.68,56.83,1011.02,3.0,180.54,0.03,0.0,0.0,591.18,562.6,159.57
4,11,543.76,624.69,189.76,30.53,53.96,1010.22,3.05,169.5,0.03,0.0,0.0,714.06,624.69,197.27
5,12,531.06,676.05,228.91,31.38,51.04,1009.41,3.16,163.53,0.03,0.0,0.0,771.53,676.05,217.29
6,13,495.42,618.81,242.36,32.24,48.11,1008.59,3.33,157.55,0.03,0.0,0.0,738.71,618.81,229.53
7,14,458.17,550.39,216.6,32.14,48.49,1007.87,3.25,156.26,0.04,0.0,0.0,657.53,550.39,224.47
8,15,345.08,436.16,183.5,32.05,48.89,1007.15,3.28,158.09,0.04,0.0,0.0,527.77,436.16,166.91
9,16,234.56,288.55,134.93,31.95,49.31,1006.42,3.4,160.31,0.05,0.0,0.0,363.59,288.55,133.52


In [48]:
fig = px.bar(df, x='hour', 
             y=['error_x', 'error_y'], 
             title='Mean Absolute Error (MAE) of Irradiance from Visual Crossing and NCEP by Hour', 
             barmode='group')
# rename legend
fig.for_each_trace(lambda t: t.update(name=t.name.replace('error_x', 'VSC').replace('error_y', 'NCEP')))
fig.update_xaxes(tickvals=df_evaluated['hour'])
fig.update_layout(yaxis_title='MAE (W/sqm)', xaxis_title='Hour', 
                  legend_title='Data Source', font=dict(size=18),
                  width=1200, height=600)
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(
        linecolor='black',
        showgrid=True,
        gridcolor='lightgrey'
    ),
    yaxis=dict(
        linecolor='black',
        showgrid=True,
        gridcolor='lightgrey'
    )
)

fig.show()
fig.write_image('MAE_bar_Ivsc_Incep.pdf')