In [None]:
import dask
import dask.bag as db
import numpy as np
import pandas as pd
import os
import pathlib
import xarray as xr
import xml
import seaborn as sns
import pyinterp
import pyinterp.backends.xarray
import pynanoflann

In [None]:
# Root of the sample data, supplied through the DATA_DIR environment variable.
_data_dir_env = os.getenv('DATA_DIR')
if _data_dir_env is None:
    # pathlib.Path(None) would only raise a bare TypeError; say what is missing.
    raise RuntimeError(
        "The DATA_DIR environment variable is not set; point it at the "
        "directory that contains '2021-01-05-datamart-sample'."
    )
DATA_DIR = pathlib.Path(_data_dir_env)

# Sub-directories of the sample holding the SWOB and RDPS files read below.
SWOB_DIR = DATA_DIR / '2021-01-05-datamart-sample/swob/'
RDPS_DIR = DATA_DIR / '2021-01-05-datamart-sample/rdps/'

# Read OBS

In [None]:
def string_to_dict(obs_xml_string):
    """Flatten one observation XML document into a single ``{name: value}`` dict.

    The ``name``/``value`` attribute pairs of the element children of the first
    ``identification-elements`` node are collected first, then those of the
    first ``elements`` node. When a name occurs in both, the value from
    ``elements`` wins.

    Parameters
    ----------
    obs_xml_string : str or bytes
        A complete XML document.

    Returns
    -------
    dict
        Attribute values keyed by attribute names; every value is a string.

    Raises
    ------
    xml.parsers.expat.ExpatError
        If the input is not well-formed XML.
    IndexError
        If either container tag is absent from the document.
    KeyError
        If a child element lacks a ``name`` or ``value`` attribute.
    """
    # `import xml` on its own does not load the `xml.dom.minidom` submodule, so
    # import it here; this also keeps the function self-contained when it is
    # shipped to a worker process.
    from xml.dom import minidom

    obs_data = minidom.parseString(obs_xml_string)

    merged = {}
    # Order matters: entries from 'elements' overwrite metadata entries that
    # share a name.
    for tag in ('identification-elements', 'elements'):
        container = obs_data.getElementsByTagName(tag)[0]
        for node in container.childNodes:
            # Whitespace text and comment nodes carry no attributes; only
            # element nodes hold a name/value pair.
            if node.nodeType != node.ELEMENT_NODE:
                continue
            merged[node.attributes['name'].value] = node.attributes['value'].value

    return merged

In [None]:
# Path handle for the SWOB directory (SWOB_DIR wrapped in pathlib.Path once more).
swob_dir = pathlib.Path(SWOB_DIR)

In [None]:
# One glob pattern per (date, station) directory, relative to swob_dir.
station_day_patterns = (
    '20201126/CYUL/*.xml',
    '20201127/CYVR/*.xml',
    '20201128/CYQB/*.xml',
)

# Flat list of every matching XML file, kept in pattern order.
obs_files = [
    xml_path
    for pattern in station_day_patterns
    for xml_path in swob_dir.glob(pattern)
]

In [None]:
# Read the observation files into a dask bag of text and parse each element
# with string_to_dict.
# NOTE(review): db.read_text is understood to yield one element per line of
# text, so this presumably relies on each file holding its whole XML document
# on a single line — confirm.
bag = db.read_text(obs_files)
obs_dicts = bag.map(string_to_dict)
# Turn the bag of dicts into a dask DataFrame; it is evaluated by .compute() in the next cell.
obs_df = obs_dicts.to_dataframe()

In [None]:
# Evaluate the dask DataFrame so the parsed observations are held in memory.
computed = obs_df.compute()

In [None]:
# Preview the first rows of the parsed observations.
computed.head()

In [None]:
# string_to_dict returns every value as a string; convert the columns used
# below to numeric / datetime dtypes.
for col in ['lat', 'long', 'air_temp']:
    # NOTE(review): SWOB-ML is understood to report a missing measurement as
    # the literal string 'MSNG', which pd.to_numeric rejects with a ValueError
    # — confirm against the datamart documentation. Only that sentinel is
    # mapped to NaN, so any other unparseable value still raises.
    computed[col] = pd.to_numeric(computed[col].replace('MSNG', np.nan))

computed['date_tm'] = pd.to_datetime(computed['date_tm'])

In [None]:
# Order the observations by their date_tm column.
tt_obs = computed.sort_values('date_tm')
# Show only the time, position and air-temperature columns.
tt_obs[['date_tm', 'lat', 'long', 'air_temp']]

In [None]:
# Air temperature against observation time, coloured by the stn_nam column.
sns.scatterplot(data=tt_obs, x='date_tm', y='air_temp', hue='stn_nam')

# Interpolate with k-nearest neighbours in space and linearly in time

In [None]:
rdps_path = pathlib.Path(RDPS_DIR)

# Directory of the 2020112600 files and the file-name pattern inside it; the
# numeric suffix is zero-padded to three digits (P012 ... P036).
run_dir = rdps_path / '2020112600'
grib_template = 'CMC_reg_TMP_TGL_2_ps10km_2020112600_P{:03}.grib2'

# One path per suffix from 12 to 36 inclusive, in ascending order.
temperature_files = [
    run_dir / grib_template.format(suffix)
    for suffix in range(12, 37)
]

In [None]:
# Open all GRIB2 files with the cfgrib engine as one dataset, concatenated in
# list order along the 'step' dimension.
tt = xr.open_mfdataset(temperature_files, engine='cfgrib', concat_dim='step', combine='nested')

In [None]:
# Display the combined dataset.
tt

In [None]:
# NOTE(review): later cells index tt.latitude with a (y, x) pair, so it looks
# 2-D here, while pyinterp.Axis presumably expects 1-D values — confirm this
# cell runs. axis1 is not used by the other cells visible here.
axis1 = pyinterp.Axis(tt.latitude.data)

In [None]:
# NOTE(review): Grid2D is constructed without a data array; the pyinterp xarray
# backend presumably requires one, in which case this cell raises — confirm.
# `interpolator` is not used by the other cells visible here.
interpolator = pyinterp.backends.xarray.Grid2D()

In [None]:
# Nearest-neighbour search tree: one neighbour per query, L1 metric, radius 2.
# NOTE(review): the tree is fitted on raw (lat, lon) values, so distances are
# in coordinate units rather than distances on the sphere — confirm that is
# acceptable for this grid.
nn = pynanoflann.KDTree(n_neighbors=1, metric='L1', radius=2)

In [None]:
# Pair each grid point's latitude with its longitude and flatten to one
# (lat, lon) row per grid point.
lat_lons = np.stack([tt.latitude, tt.longitude], axis=-1).reshape(-1, 2)

In [None]:
# Shape check: (number of grid points, 2).
lat_lons.shape

In [None]:
# Build the search tree over all grid-point coordinates.
nn.fit(lat_lons)

In [None]:
# Look up the grid point nearest to (48, 290), given in the same
# (latitude, longitude) column order as lat_lons.
# NOTE(review): a longitude of 290 presumably means the grid uses a 0-360
# convention — confirm before querying with the observation 'long' values.
distance, index = nn.kneighbors(np.array([[48., 290.]]))

In [None]:
# Convert the flat neighbour index back into a position in the latitude grid
# (used as the (y, x) pair below).
coords = np.unravel_index(index.squeeze(), shape=tt.latitude.shape)

In [None]:
# Latitude of the matched grid point.
tt.latitude[coords].compute()

In [None]:
# Longitude of the matched grid point.
tt.longitude[coords].compute()

In [None]:
# Positional indexers of the matched grid point, shared by both selections.
nearest_point = {'y': coords[0], 'x': coords[1]}

# t2m at that grid point for the first two entries along 'step'.
before = tt.t2m.isel(step=0, **nearest_point).compute()
after = tt.t2m.isel(step=1, **nearest_point).compute()

In [None]:
# t2m at the matched grid point for step index 0.
before

In [None]:
# t2m at the matched grid point for step index 1.
after