Use the downloaded test data to compute the model error at one station (CYUL).
The target period is 2020-11-26 12:00 UTC to 2020-11-27 12:00 UTC.

In [None]:
# Local directories holding the downloaded test data, one per source:
# GDPS and RDPS GRIB files, and SWOB observation XML files.
GDPS_DIR = '/home/ubuntu/data/test_download/gdps'
RDPS_DIR = '/home/ubuntu/data/test_download/rdps'
SWOB_DIR = '/home/ubuntu/data/test_download/swob'

In [None]:
import datetime
import dask
import dask.bag as db
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import scipy.interpolate
import xarray as xr
import xml.dom.minidom
import seaborn as sns
import pytz

# Read obs

In [None]:
def string_to_dict(obs_xml_string):
    """Flatten one observation XML document into a single dict.

    Reads the ``name``/``value`` attribute pairs found on the direct children
    of the first ``identification-elements`` node and of the first
    ``elements`` node.

    Args:
        obs_xml_string: the XML document, as a string.

    Returns:
        A dict mapping each ``name`` attribute to its ``value`` attribute
        (both strings). On a duplicate name, the entry from ``elements``
        overrides the one from ``identification-elements``.

    Raises:
        IndexError: if either section is absent from the document.
        KeyError: if a child element has no ``name`` or ``value`` attribute.
    """
    obs_data = xml.dom.minidom.parseString(obs_xml_string)

    def section_to_dict(tag_name):
        # Collect name -> value for the element children of one section.
        section = obs_data.getElementsByTagName(tag_name)[0]
        return {
            node.attributes['name'].value: node.attributes['value'].value
            for node in section.childNodes
            # childNodes also contains whitespace text nodes and comments
            # when the document is indented; those carry no attributes.
            if node.nodeType == node.ELEMENT_NODE
        }

    return {
        **section_to_dict('identification-elements'),
        **section_to_dict('elements'),
    }

In [None]:
swob_dir = pathlib.Path(SWOB_DIR)

In [None]:
# Every CYUL observation file for the three days around the target period,
# listed day by day.
obs_files = [
    xml_path
    for day in ('20201126', '20201127', '20201128')
    for xml_path in swob_dir.glob(f'{day}/CYUL/*.xml')
]

In [None]:
# db.read_text produces one bag element per line of text, so string_to_dict is
# called on individual lines.
# NOTE(review): this presumably relies on each observation file holding its
# whole XML document on a single line - confirm against the downloaded files.
bag = db.read_text(obs_files)
obs_dicts = bag.map(string_to_dict)
# One dataframe row per parsed observation; materialised by .compute() below.
obs_df = obs_dicts.to_dataframe()

In [None]:
obs_df

In [None]:
computed = obs_df.compute()

In [None]:
computed

In [None]:
# Every parsed XML value is a string: convert the columns used below to
# numbers and the observation time to datetimes.
numeric_columns = ['lat', 'long', 'air_temp']
computed[numeric_columns] = computed[numeric_columns].apply(pd.to_numeric)

computed['date_tm'] = pd.to_datetime(computed['date_tm'])

In [None]:
tt_obs = computed.sort_values('date_tm')

In [None]:
tt_obs[['date_tm', 'lat', 'long', 'air_temp']]

In [None]:
sns.lineplot(data=tt_obs, x='date_tm', y='air_temp')

# Read grib

In [None]:
# GDPS TMP_TGL_2 files of the 2020112600 run, P012 through P036 in steps of 3.
# An alternative that was tried and left disabled: the 2020112612 run,
# P000 through P024 in steps of 3.
gdps_path = pathlib.Path(GDPS_DIR)
temperature_files = [
    gdps_path / '2020112600' / 'CMC_glb_TMP_TGL_2_latlon.15x.15_2020112600_P{:03}.grib2'.format(lead)
    for lead in range(12, 37, 3)
]

In [None]:
temperature_files = sorted(temperature_files)

In [None]:
temperature_files

In [None]:
# Open all GRIB files as a single dataset: combine='nested' with
# concat_dim='step' joins them along 'step' in the order of the file list.
tt = xr.open_mfdataset(temperature_files, engine='cfgrib', concat_dim='step', combine='nested', parallel=True)

In [None]:
tt['step']

# Compute error

In [None]:
tt = tt.sortby('step')

In [None]:
# Index the 'step' dimension by valid_time so that later cells can select and
# interpolate along it using datetimes.
tt = tt.set_index(step='valid_time')

In [None]:
tt

In [None]:
tt.step[0].item()

In [None]:
tt.t2m.sel(step=tt.step[0]).plot()

In [None]:
tt

In [None]:
tt.step

In [None]:
# NOTE(review): `dates` is only assigned in a later cell, so this cell raises
# NameError when the notebook is run top to bottom.
dates[0]

In [None]:
tt_obs['date_tm']

In [None]:
pd.Timestamp(2020,11,26,12).tz_localize('UTC')

In [None]:
# Keep the observations inside the target period: start excluded, end included.
period_start = pd.Timestamp(2020,11,26,12).tz_localize('UTC')
period_end = pd.Timestamp(2020,11,27,12).tz_localize('UTC')
after_start = tt_obs['date_tm'] > period_start
up_to_end = tt_obs['date_tm'] <= period_end
tt_in_range = tt_obs[after_start & up_to_end]

In [None]:
tt_in_range

In [None]:
# Observation times as plain datetimes with the timezone removed; they are
# passed as the `step` coordinates to tt.interp below.
# NOTE(review): presumably needed because the model valid_time values are
# tz-naive - confirm.
dates = [x.to_pydatetime().replace(tzinfo=None) for x in tt_in_range['date_tm']]

In [None]:
dates

In [None]:
tt_in_range['lat'].values[0]

In [None]:
# Interpolate the model dataset to the station position (lat/long taken from
# the first in-range observation) and to every observation time in `dates`.
model_tt = tt.interp(latitude=tt_in_range['lat'].values[0], longitude=tt_in_range['long'].values[0], step=dates)

In [None]:
model_tt

In [None]:
tt_in_range['air_temp']

In [None]:
# Mean squared error between the interpolated model temperature and the
# observations; np.sqrt(mse) gives the RMSE.
# Note that np.sqrt(np.square(x)).mean() is mean(|x|), the mean absolute
# error, so the square must be averaged before any root is taken.
# 273.15 is subtracted from the model values before comparing with air_temp
# (NOTE(review): i.e. model in kelvin, observations in deg C - confirm units).
# Plain arrays are used so the difference is taken position by position:
# model_tt was interpolated at exactly these observation times, in this order.
model_temps = model_tt.t2m.values - 273.15
obs_temps = tt_in_range['air_temp'].to_numpy()
mse = np.square(model_temps - obs_temps).mean()

In [None]:
mse

In [None]:
tt_in_range['air_temp']

In [None]:
# Observed and model temperature over the target period on a single axis,
# the model series shifted by 273.15 to match the observations.
fig, ax = plt.subplots()
obs_times, obs_temps = tt_in_range['date_tm'], tt_in_range['air_temp']
ax.plot(obs_times, obs_temps, label='obs')
model_series = model_tt.t2m - 273.15
ax.plot(model_tt.step, model_series, label='gdps 2020112600')
ax.legend()

# RDPS

In [None]:
# RDPS TMP_TGL_2 files of the 2020112600 run, P012 through P036, every step.
rdps_path = pathlib.Path(RDPS_DIR)
temperature_files = [
    rdps_path / '2020112600' / 'CMC_reg_TMP_TGL_2_ps10km_2020112600_P{:03}.grib2'.format(lead)
    for lead in range(12, 37)
]

In [None]:
temperature_files

In [None]:
# Open all RDPS GRIB files as a single dataset: combine='nested' with
# concat_dim='step' joins them along 'step' in the order of the file list.
tt_rdps = xr.open_mfdataset(temperature_files, engine='cfgrib', concat_dim='step', combine='nested', parallel=True)

In [None]:
tt_rdps

In [None]:
tt_rdps = tt_rdps.sortby('step')

In [None]:
# Index the 'step' dimension by valid_time, as done for the GDPS dataset.
tt_rdps = tt_rdps.set_index(step='valid_time')

In [None]:
tt_rdps

In [None]:
tt_rdps.latitude.compute()

In [None]:
tt_rdps.t2m.values.shape

In [None]:
# Nearest-grid-point lookup on the first RDPS time step: f(lat, lon) returns
# the t2m value of the closest grid point.
# scipy.interpolate.interp2d is deprecated and removed in recent SciPy, and
# the latitude/longitude arrays here are 2-D (one value per grid point),
# which interp2d would treat as scattered data and try to fit a spline to.
# NOTE(review): "closest" is measured in degrees of lat/lon, and the grid
# longitudes must use the same convention as the query point - confirm.
rdps_points = np.column_stack([
    tt_rdps.latitude.values.ravel(),
    tt_rdps.longitude.values.ravel(),
])
f = scipy.interpolate.NearestNDInterpolator(rdps_points, tt_rdps.t2m.values[0].ravel())

In [None]:
lat, lon = tt_obs['lat'].values[0], tt_obs['long'].values[0]

In [None]:
lats, lons = tt_rdps.latitude.values, tt_rdps.longitude.values

In [None]:
# Row/column of the grid point whose (lat, lon) is closest to the station,
# with distance measured directly in degrees.
distance = np.sqrt(np.square(lats - lat) + np.square(lons - lon))
np.unravel_index(np.argmin(distance), lats.shape)

In [None]:
lats[823, 693]

In [None]:
lons[823, 693]

In [None]:
plt.imshow(lats - lat)

In [None]:
plt.imshow(np.square(lons - lon))

In [None]:
lons

In [None]:
np.max(lons)