In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to local code are picked up.
%load_ext autoreload
%autoreload 2

# Error of prediction visualisation
In this notebook we plot the mean squared error of the prediction with respect to the observation. 


In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import matplotlib.pyplot as plt
import numpy as np
import os
import pathlib
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import math
import calendar

In [None]:
# Apply seaborn's default theme to the figures produced in this notebook.
sns.set_theme()

## Boot cluster

In [None]:
# Describe the SLURM jobs that will host the dask workers.
cluster = dask_jobqueue.SLURMCluster(
    # Shell commands that load the user's bashrc and activate the smc01 conda env.
    env_extra=['source $HOME/.bashrc','conda activate smc01'],
    name='smc01-dask',
    # Resources requested for each job.
    cores=8, memory='60GB',
    # NOTE(review): presumably meant to disable spilling worker memory to disk —
    # confirm these keyword arguments are honoured by the installed version.
    spill=False, target=False,
    local_directory='/var/tmp/', 
)

In [None]:
# Request 10 SLURM jobs for the cluster.
cluster.scale(jobs=10)

In [None]:
# Connect this notebook session to the cluster created above.
client = dask.distributed.Client(cluster)

In [None]:
# Show the client summary as the cell output.
client

## Read dataset

In [None]:
# Root data directory, taken from the DATA_DIR environment variable.
# os.environ[...] is used instead of os.getenv(...) so that an unset variable
# fails with an explicit KeyError('DATA_DIR') rather than the opaque TypeError
# raised by pathlib.Path(None).
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])
# Glob pattern matching the parquet files of the interpolated test dataset.
INPUT = DATA_DIR / 'interpolated/2021-12-20-test/*.parquet'

In [None]:
# Read every parquet file matching INPUT into a single dask dataframe.
# NOTE(review): an earlier comment described this cell as temporarily removing the
# columns that do not exist in the 2021 files, but no column is dropped here —
# confirm whether that step is still needed.
df = dd.read_parquet(INPUT)

In [None]:
# Keep the dataframe loaded on the cluster so later computations reuse it.
df = df.persist()

We are calculating the ratio of N/A per column in the dataset. This will allow us to determine if we need to remove N/A values in the dataset.

In [None]:
# Fraction of missing values per column, computed in a single pass over the
# dataframe: the mean of the boolean isna() mask is exactly
# (number of N/A rows) / (number of rows).
# This replaces a per-column loop that triggered several len()/dropna()
# computations for every column.
na_ratio_series = df.isna().mean().compute()
na_ratios = na_ratio_series.tolist()

# One row per column of df; the ratio is stored in the column labelled 0.
na_ratio_df = pd.DataFrame(na_ratios, index=df.columns)

We only keep the columns whose N/A ratio is above 0, meaning they contain N/A values.

In [None]:
# Show only the columns that contain at least one missing value.
has_missing_values = na_ratio_df[0] > 0
na_ratio_df[has_missing_values]

As we can see, only some of the observation columns have N/A values, and the column obs_prmsl has the highest ratio of N/A values. However, all the prediction columns, which will be used as the feature space of the ML models, contain no N/A values, so we will not remove the N/A values. 

## Calculate errors of predictions
Note here that not all prediction variables have a matching observation, so the error metrics can only be calculated for the following variables of the dataset.
<ul>
    <li>2r: 2 meter relative humidity</li>
    <li>2t: 2 meter temperature</li>
    <li>10wdir: 10 meter wind direction</li>
    <li>10si: 10 meter wind speed</li>
</ul>

We convert degrees to radians for future calculations of error of wind direction.

In [None]:
# Convert the observed and predicted wind directions from degrees to radians.
# np.radians is vectorised, so it is applied once per partition instead of
# once per row (the previous apply(..., axis=1) called it row by row).
# NOTE: this cell is not idempotent — re-running it converts the values again.
wind_direction_columns = ['obs_10wdir', 'gdps_10wdir']
df[wind_direction_columns] = df[wind_direction_columns].map_partitions(np.radians)

In [None]:
def calculate_error(row, variable_suffix):
    """Return the signed error (prediction minus observation) for one variable.

    row: any object indexable by column name (a dataframe, a row, a dict...).
    variable_suffix: variable name shared by the 'gdps_<suffix>' prediction
        column and the 'obs_<suffix>' observation column.
    """
    predicted = row['gdps_' + variable_suffix]
    observed = row['obs_' + variable_suffix]
    return predicted - observed

Here is a function to calculate the angle error between two angles. We want the smallest angle difference between the two. For example, if we have x=10 and y=350 the error is 20 degrees not 340.

In [None]:
def calculate_angle_error(row, variable_suffix):
    """Return the smallest absolute angular difference between prediction and observation.

    The angles are expected in radians. For an absolute difference d, the
    result is min(d, 2*pi - d), i.e. the short way round the circle.

    row: dataframe holding the 'gdps_<suffix>' and 'obs_<suffix>' columns.
    variable_suffix: variable name shared by both columns.
    """
    # Compute the absolute error once and reuse it.
    abs_error = calculate_error(row, variable_suffix).abs()
    # d <= pi  <=>  d <= 2*pi - d, so keep d when it is at most half a turn and
    # take the complement otherwise. Missing values stay missing.
    return abs_error.where(abs_error <= math.pi, 2 * math.pi - abs_error)

In [None]:
def generate_mean_squared_error_columns(variable_suffix, df, func=calculate_error):
    """Add the per-row error columns of one variable to ``df`` in place.

    variable_suffix: variable name used to build the column names.
    df: dataframe to extend; it is modified in place and nothing is returned.
    func: callable invoked as ``func(df, variable_suffix=...)`` that returns the error.

    Columns written, in this order:
        error_<suffix>:          value returned by ``func``
        squared_error_<suffix>:  error squared
        rmse_<suffix>:           square root of the squared error of each row
        mabs_<suffix>:           absolute value of the error
    """
    error_column = 'error_' + variable_suffix
    squared_error_column = 'squared_error_' + variable_suffix
    rmse_column = 'rmse_' + variable_suffix
    mabs_column = 'mabs_' + variable_suffix

    df[error_column] = func(df, variable_suffix=variable_suffix)
    df[squared_error_column] = df[error_column] ** 2
    df[rmse_column] = da.sqrt(df[squared_error_column])
    df[mabs_column] = np.abs(df[error_column])

Here we compute the step value in hours as well as the error metrics for each pair of observation and prediction columns. 

In [None]:
# Step value divided by 3600.
df['step_hour'] = df['step'] / 3600

# Error columns for every variable that has both a prediction and an observation.
# The wind direction is an angle and needs the wrap-around aware error.
error_function_per_variable = [
    ('2t', calculate_error),
    ('2r', calculate_error),
    ('10wdir', calculate_angle_error),
    ('10si', calculate_error),
]
for suffix, error_function in error_function_per_variable:
    generate_mean_squared_error_columns(suffix, df, error_function)

## 1.0 Visualisation of mean squared error of prediction vs observation

Below is a function to plot a scatter plot on a map in plotly. 

In [None]:
def plot_map_scatter_plot_with_steps(
        data_frame, step_items,
        column_to_plot='rmse_2t',
        title="Placeholder title",
        step_column='step_hour',
        map_type='north america',
        color_bar_title='Temperature',
        tick_suffix='°C',
        slider_prefix_label='Lead time to true obs: ',
        slider_suffix_label=' h',
        ticks=1):
    """
    Plot a scatter plot on a map, with a slider that switches between step values.

    One plotly ``scattergeo`` frame is built per entry of ``step_items``. Each frame
    shows the rows of ``data_frame`` whose ``step_column`` equals that entry,
    positioned with the ``latitude`` and ``longitude`` columns, labelled with the
    ``station`` column and coloured by ``column_to_plot``. The colour range is the
    global min/max of ``column_to_plot``, so it is the same for every frame.
    The figure is displayed with ``fig.show()``; nothing is returned.

    data_frame: dataframe with ``latitude``, ``longitude`` and ``station`` columns,
        plus ``step_column`` and ``column_to_plot``.
    step_items: iterable of values corresponding to steps or categories used to group
        the data. It determines the positions of the slider. For only one item, send
        a list with one item.
    column_to_plot: column of the dataframe used to colour the markers.
    title: title of the figure.
    step_column: column of the dataframe compared against each entry of ``step_items``.
    map_type: scope of the map. The following values are available: 'world', 'usa',
        'europe', 'asia', 'africa', 'north america', 'south america'.
    color_bar_title: title of the colour bar.
    tick_suffix: unit appended to the colour bar tick labels.
    slider_prefix_label: text displayed before the selected slider value.
    slider_suffix_label: text displayed after the selected slider value.
    ticks: spacing between two consecutive ticks of the colour bar.

    Raises:
        ValueError: if ``step_items`` is empty, since there would be no frame to display.
    """
    # Materialise once: the items are iterated for the frames and for the slider.
    step_items = list(step_items)
    if not step_items:
        raise ValueError("step_items must contain at least one item")

    color_scale = [[0, "rgb(255, 234, 0)"], [0.5, "rgb(255, 111, 0)"], [1, "rgb(255, 0, 0)"]]

    # The colour range does not depend on the frame, so compute it only once.
    color_min = data_frame[column_to_plot].min()
    color_max = data_frame[column_to_plot].max()

    frames = []
    for item in step_items:
        # Rows belonging to this slider position (filtered once per frame).
        step_rows = data_frame[data_frame[step_column] == item]
        frames.append({
            'name': 'frame_{}'.format(item),
            'data': [{
                'type': 'scattergeo',
                'lat': step_rows['latitude'],
                'lon': step_rows['longitude'],
                'text': step_rows['station'],
                'marker': dict(
                    color=step_rows[column_to_plot],
                    reversescale=False,
                    opacity=1,
                    size=3,
                    cmax=color_max,
                    cmin=color_min,
                    colorscale=color_scale,
                    colorbar=dict(
                        titleside="right",
                        outlinecolor="rgba(68, 68, 68, 0)",
                        ticks="outside",
                        tick0=0,
                        showticksuffix="all",
                        dtick=ticks,
                        ticksuffix=tick_suffix,
                        title=color_bar_title
                    )),
            }],
        })

    # One slider step per frame; the step animates to the frame with the matching name.
    sliders = [{
        'transition': {'duration': 0},
        'x': 0.08,
        'len': 0.88,
        'currentvalue': {
            'font': {'size': 15},
            'prefix': slider_prefix_label,
            'suffix': slider_suffix_label,
            'visible': True,
            'xanchor': 'center',
        },
        'steps': [
            {
                'label': item,
                'method': 'animate',
                'args': [
                    ['frame_{}'.format(item)],
                    {'mode': 'immediate', 'frame': {'duration': 100, 'redraw': True}, 'transition': {'duration': 50}}
                ],
            } for item in step_items]
    }]

    layout = go.Layout(
        sliders=sliders,
        geo=dict(
            scope=map_type,
            showland=True,
            landcolor="rgb(212, 212, 212)",
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)",
            showlakes=True,
            lakecolor="rgb(255, 255, 255)",
            showsubunits=True,
            showcountries=True,
            resolution=110,
            projection=dict(
                type='conic equal area'
            ),
            # Restrict the visible area to the extent of the data.
            lonaxis=dict(
                showgrid=False,
                gridwidth=0.5,
                range=[data_frame['longitude'].min(), data_frame['longitude'].max()],
                dtick=5
            ),
            lataxis=dict(
                showgrid=False,
                gridwidth=0.5,
                range=[data_frame['latitude'].min(), data_frame['latitude'].max()],
                dtick=5
            )
        ),
        width=1250,
        height=800,
        title=title,
        )

    # Creating the figure: the first frame is the one displayed initially.
    data = frames[0]['data']
    fig = go.Figure(data=data, layout=layout, frames=frames)

    # Displaying the figure
    fig.show()

### 1.1 Visualisation of rmse per time step
In this section we want to visualise the rmse of predictions vs observations per time step. The time step corresponds to how long before the true observation the prediction was made: 0.0 is the prediction made at the time of the observation, and 6.0 is the prediction made 6h in advance of the true observation. We also group the observations and predictions per station so that we only have one rmse per station per time step. 

In [None]:
# Distinct lead times, in increasing order, used as slider positions.
unique_step_hours = df['step_hour'].unique().compute()
steps = unique_step_hours.sort_values()

# Mean of every column per (lead time, station) pair, with the group keys
# turned back into regular columns.
step_hour_group_by = (
    df.groupby(['step_hour', 'station'])
    .mean()
    .compute()
    .reset_index()
)

#### 1.1.1 MSE 2 meter temperature per time step
We can observe that as we increase the time of prediction prior to the true observation the error increases, meaning that predicting the temperature at 2 meters above ground gets harder the further in advance the prediction is made, with a low of 0.57°C and a high of 8.58°C. The increase in error is also much more visible in the mainland USA and Canadian stations. Furthermore, we can observe that the stations on the mainland west coast seem to have a constantly very high error as we increase the time of prediction. 

In [None]:
# Range of the 2 meter temperature error over all (lead time, station) pairs.
rmse_2t_by_step = step_hour_group_by['rmse_2t']
min_2t, max_2t = rmse_2t_by_step.min(), rmse_2t_by_step.max()
print("min 2t: {0} and max 2t: {1}".format(round(min_2t,2), round(max_2t,2)))

In [None]:
# Map of the 2 meter temperature error, one slider position per lead time.
plot_map_scatter_plot_with_steps(
    step_hour_group_by,
    steps,
    column_to_plot='rmse_2t',
    title='2 meter temperature RMSE per time step',
    step_column='step_hour',
    ticks=2,
)

#### 1.1.2 MSE 10 meter wind direction per time step

We can observe that as we increase the prediction time of the 10 meter wind direction the error increases, with a low of 0.25 rad and a high of 2.04 rad. Furthermore, we observe that the main error in prediction is concentrated in the coastal stations even at prediction 0; it increases slightly as we increase the time of prediction but remains high across the coastal stations. 

In [None]:
# Range of the 10 meter wind direction error over all (lead time, station) pairs.
rmse_10wdir_by_step = step_hour_group_by['rmse_10wdir']
min_10wdir, max_10wdir = rmse_10wdir_by_step.min(), rmse_10wdir_by_step.max()
print("min 10wdir: {0} and max 10wdir: {1}".format(round(min_10wdir,2), round(max_10wdir,2)))

In [None]:
# Map of the 10 meter wind direction error, one slider position per lead time.
plot_map_scatter_plot_with_steps(
    step_hour_group_by,
    steps,
    column_to_plot='rmse_10wdir',
    title='10 meter wind direction RMSE per time step',
    step_column='step_hour',
    color_bar_title='wind direction',
    tick_suffix=' rad',
    ticks=0.5,
)

#### 1.1.3 MSE 10 meter wind speed per time step

We can observe that the prediction of the 10 meter wind speed is not very much affected by the prediction time with a low of 0.58 m·s⁻¹ and high of 15.18 m·s⁻¹. As we increase the prediction time the error in prediction increases slightly but not significantly.

In [None]:
# Range of the 10 meter wind speed error over all (lead time, station) pairs.
rmse_10si_by_step = step_hour_group_by['rmse_10si']
min_10si, max_10si = rmse_10si_by_step.min(), rmse_10si_by_step.max()
print("min 10si: {0} and max 10si: {1}".format(round(min_10si,2), round(max_10si,2)))

In [None]:
# Map of the 10 meter wind speed error, one slider position per lead time.
plot_map_scatter_plot_with_steps(
    step_hour_group_by,
    steps,
    column_to_plot='rmse_10si',
    title='10 meter wind speed RMSE per time step',
    step_column='step_hour',
    color_bar_title='wind speed',
    tick_suffix=' m·s⁻¹',
    ticks=1,
)

We notice that, as we move the slider, the error is concentrated in the station MWN, where it stays constant around 15 m·s⁻¹. We will remove this station to see the effect on the visualisation of the other stations. 

In [None]:
# Index labels of the rows belonging to the MWN station, then the same
# dataframe without those rows.
is_mwn_station = step_hour_group_by['station'] == 'MWN'
indexNames = step_hour_group_by[is_mwn_station].index
step_hour_group_by_no_mwn = step_hour_group_by.drop(indexNames)

In [None]:
# Same wind speed map as above, with the MWN station excluded.
plot_map_scatter_plot_with_steps(
    step_hour_group_by_no_mwn,
    steps,
    column_to_plot='rmse_10si',
    title='10 meter wind speed RMSE per time step - No MWN station',
    step_column='step_hour',
    color_bar_title='wind speed',
    tick_suffix=' m·s⁻¹',
    ticks=1,
)

We can observe that the prediction of the 10 meter wind speed is not very much affected by the prediction time, with a low of 0.58 m·s⁻¹ and a high of 4.98 m·s⁻¹, which is much smaller than the high of 15.18 m·s⁻¹ obtained with the MWN station. As we increase the prediction time the error in prediction increases slightly but not significantly.

#### 1.1.4 MSE 2 meter relative humidity per time step

We can observe that increasing the time of prediction prior to the true observation increases the error in prediction, with a low of 3.18% and a high of 23.14% error in 2 meter relative humidity. It is relatively constant across all the stations. The only slightly higher error that stays relatively constant can be observed in the stations concentrated on the west coast of the US and in the BC area.

In [None]:
# Range of the 2 meter relative humidity error over all (lead time, station) pairs.
rmse_2r_by_step = step_hour_group_by['rmse_2r']
min_2r, max_2r = rmse_2r_by_step.min(), rmse_2r_by_step.max()
print("min 2r: {0} and max 2r: {1}".format(round(min_2r,2), round(max_2r,2)))

In [None]:
# Map of the 2 meter relative humidity error, one slider position per lead time.
plot_map_scatter_plot_with_steps(
    step_hour_group_by,
    steps,
    column_to_plot='rmse_2r',
    title='2 meter relative humidity RMSE per time step',
    step_column='step_hour',
    color_bar_title='relative humidity',
    tick_suffix=' %',
    ticks=3,
)

### 1.2 Visualisation of mse per month
In this section we want to visualize the rmse of each station per month. We will therefore group by month and station and take the mean of the rmse before plotting on a geo scatter plot. We want to see whether the error of any of the measurements increases in specific months, i.e. whether there is a seasonality involved in the error of prediction. 

In [None]:
# Month number taken from the date column, used to group the errors per month.
month_of_date = df['date'].dt.month
df['month'] = month_of_date

In [None]:
# Month names present in the data, in calendar order (sorted by month number
# before being converted to names); used as slider positions.
month_numbers = df['month'].unique().compute().sort_values()
months = month_numbers.apply(lambda x: calendar.month_name[x])

# Mean of every column per (month, station) pair, with the group keys turned
# back into regular columns and the month number replaced by its name so that
# it matches the slider items.
month_group_by = (
    df.groupby(['month', 'station'])
    .mean()
    .compute()
    .reset_index()
)
month_group_by['month'] = month_group_by['month'].apply(lambda x: calendar.month_name[x])

#### 1.2.1 RMSE 2 meter temperature per month

We can observe that the error is higher during the autumn and winter months than during spring and summer. Canadian and mainland western US stations seem to have the highest increase in error in the winter months compared to the other stations. The reported low and high are 3.81°C and 31.8°C. 

In [None]:
# Range of the 2 meter temperature error over all (month, station) pairs.
# This section is about the 2 meter temperature, so the rmse_2t column is used
# (the cell previously read rmse_2r, the relative humidity column).
min_2t = month_group_by['rmse_2t'].min()
max_2t = month_group_by['rmse_2t'].max()
print("min 2t: {0} and max 2t: {1}".format(round(min_2t, 2), round(max_2t, 2)))

In [None]:
# Map of the 2 meter temperature error, one slider position per month.
# The title says "per month" to match the grouping used by this plot.
plot_map_scatter_plot_with_steps(
    month_group_by,
    months,
    column_to_plot='rmse_2t',
    title='2 meter temperature RMSE per month',
    step_column='month',
    slider_prefix_label='month of prediction: ',
    slider_suffix_label='',
    ticks=2,
)

#### 1.2.2 MSE 10 meter wind direction per month

We can observe that there does not seem to be a seasonality of the error in the 10 meter wind direction. Overall, the error does not vary a lot from one month to the next and is relatively constant between stations, ranging from a low of 0.24 rad to a high of 2 rad. The slight variation between stations seems to come from a higher proportion of the error being concentrated in the coastal stations of the west coast. 

In [None]:
# Range of the 10 meter wind direction error over all (month, station) pairs.
# Both bounds are rounded to 2 decimals (the maximum was previously rounded to
# an integer) and the message uses the same "min x: ... and max x: ..." layout
# as the other cells.
min_10wdir = month_group_by['rmse_10wdir'].min()
max_10wdir = month_group_by['rmse_10wdir'].max()
print("min 10wdir: {0} and max 10wdir: {1}".format(round(min_10wdir, 2), round(max_10wdir, 2)))

In [None]:
# Map of the 10 meter wind direction error, one slider position per month.
# The title says "per month" to match the grouping, and the tick suffix keeps a
# leading space (' rad') like the per time step plot.
plot_map_scatter_plot_with_steps(
    month_group_by,
    months,
    column_to_plot='rmse_10wdir',
    title='10 meter wind direction RMSE per month',
    step_column='month',
    color_bar_title='wind direction',
    tick_suffix=' rad',
    slider_prefix_label='month of prediction: ',
    slider_suffix_label='',
    ticks=0.5,
)

#### 1.2.3 MSE 10 meter wind speed per month

We can observe that there is no seasonality in the prediction of 10 meter wind speed. The error is constant across all months. Except for one station which has high error across all months which is MWN station. 

In [None]:
# Range of the 10 meter wind speed error over all (month, station) pairs.
# Both bounds are rounded to 2 decimals (the maximum was previously rounded to
# an integer) and the message uses the same "min x: ... and max x: ..." layout
# as the other cells.
min_10si = month_group_by['rmse_10si'].min()
max_10si = month_group_by['rmse_10si'].max()
print("min 10si: {0} and max 10si: {1}".format(round(min_10si, 2), round(max_10si, 2)))

In [None]:
# Map of the 10 meter wind speed error, one slider position per month.
# The title says "per month" to match the grouping used by this plot.
plot_map_scatter_plot_with_steps(
    month_group_by,
    months,
    column_to_plot='rmse_10si',
    title='10 meter wind speed RMSE per month',
    step_column='month',
    color_bar_title='wind speed',
    tick_suffix=' m·s⁻¹',
    slider_prefix_label='month of prediction: ',
    slider_suffix_label='',
    ticks=1,
)

Given this high error of the MWN station across all months, we will remove this station to visualise the error of the other stations more granularly. 

In [None]:
# Index labels of the rows belonging to the MWN station, then the same
# dataframe without those rows.
is_mwn_station = month_group_by['station'] == 'MWN'
indexNames = month_group_by[is_mwn_station].index
month_group_by_no_mwn = month_group_by.drop(indexNames)

In [None]:
# Same per-month wind speed map, with the MWN station excluded.
# The title says "per month" to match the grouping used by this plot.
plot_map_scatter_plot_with_steps(
    month_group_by_no_mwn,
    months,
    column_to_plot='rmse_10si',
    title='10 meter wind speed RMSE per month - No MWN station',
    step_column='month',
    color_bar_title='wind speed',
    tick_suffix=' m·s⁻¹',
    slider_prefix_label='month of prediction: ',
    slider_suffix_label='',
    ticks=1,
)

We notice that the error still does not change much from one month to the next, but the highest error is now much smaller: 8 m·s⁻¹ compared to 20 m·s⁻¹. 

#### 1.2.4 MSE 2 meter relative humidity per month

We can observe that the south of the USA has a high error in the prediction of 2 meter relative humidity across all months of the year. We can observe some seasonality in the spring and summer months, with an increase in error across the stations overall. The overall range in error for 2 meter relative humidity is between 3.81% and 32%. 

In [None]:
# Range of the 2 meter relative humidity error over all (month, station) pairs.
# Both bounds are rounded to 2 decimals (the maximum was previously rounded to
# an integer) and the message uses the same "min x: ... and max x: ..." layout
# as the other cells.
min_2r = month_group_by['rmse_2r'].min()
max_2r = month_group_by['rmse_2r'].max()
print("min 2r: {0} and max 2r: {1}".format(round(min_2r, 2), round(max_2r, 2)))

In [None]:
# Map of the 2 meter relative humidity error, one slider position per month.
# The title says "per month" to match the grouping used by this plot.
plot_map_scatter_plot_with_steps(
    month_group_by,
    months,
    column_to_plot='rmse_2r',
    title='2 meter relative humidity RMSE per month',
    step_column='month',
    color_bar_title='relative humidity',
    tick_suffix=' %',
    slider_prefix_label='month of prediction: ',
    slider_suffix_label='',
    ticks=3,
)