In [None]:
%load_ext autoreload
%autoreload 2

# Metar Cube

Build an xarray datacube from the METAR observations.
The datacube has 2 dimensions: `station` and `valid` (the datetime at which the observation is valid).
It has one variable per observation field we keep (`obs_2t`, `obs_2r` and `obs_p01i`).

In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import datetime
import multiprocessing
import numpy as np
import os
import pathlib
import pandas as pd
import pymongo
import seaborn as sns
import xarray as xr

from tqdm.notebook import tqdm

from smc01.interpolate.dataset_generator import process_one_iem_obs
from smc01.interpolate.obs import MongoIEMDatabase

In [None]:
# Root directory for the CSV / netCDF files read and written by this notebook,
# taken from the DATA_DIR environment variable.
# NOTE(review): os.getenv returns None when the variable is unset, which makes
# pathlib.Path raise a TypeError — export DATA_DIR before starting the kernel.
DATA_DIR = pathlib.Path(os.getenv('DATA_DIR'))

# Connection settings handed to pymongo.MongoClient and MongoIEMDatabase below.
MONGO_HOST = 'localhost'
MONGO_USER = None
MONGO_PASS = None
MONGO_DB = 'smc01_raw_obs'
MONGO_COLLECTION = 'iem'

## Boot dask cluster

In [None]:
# Describe the SLURM jobs that will host the dask workers. env_extra lists the
# shell commands meant to activate the smc01 conda environment for each job.
# NOTE(review): recent dask-jobqueue releases rename env_extra to
# job_script_prologue — confirm against the installed version.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ~/.bash_profile','conda activate smc01'],
    name='smc01-dask',
)

In [None]:
cluster.scale(jobs=2)

In [None]:
client = dask.distributed.Client(cluster)

In [None]:
client

## Utility functions

In [None]:
# Bounds of the study period. END is excluded from the 3-hourly time grids
# built from these values further down.
BEGIN = datetime.datetime(2019, 1, 1)
END = datetime.datetime(2021, 1, 1)

In [None]:
def pipeline_of_station(station, begin, end, tolerance=0):
    """Build the MongoDB aggregation pipeline selecting one station's observations.

    The pipeline first adds the ``minute`` and ``hour`` of the ``valid`` field
    to every document, then keeps the documents that:

    * belong to ``station``;
    * have a ``tmpf`` field;
    * have ``valid`` inside ``[begin - 1 day, end + 1 day)``;
    * were taken within ``tolerance`` minutes of the top of an hour.

    Args:
        station: station identifier matched against the ``station`` field.
        begin: start of the requested period (datetime-like).
        end: end of the requested period (datetime-like).
        tolerance: accepted distance, in minutes, from the top of the hour.

    Returns:
        A list of aggregation stages suitable for ``collection.aggregate``.
    """
    # One day of padding on each side of the requested period. The bounds come
    # from the arguments rather than the notebook-level BEGIN/END constants so
    # callers asking for another period actually get it.
    one_day = datetime.timedelta(days=1)

    return [
        {
            '$addFields': {
                'minute': {'$minute': '$valid'},
                'hour': {'$hour': '$valid'}}
        },
        {
            '$match': {
                '$or': [
                    {'minute': {'$lte': tolerance}},
                    {'minute': {'$gte': 60 - tolerance}}
                ],
                'station': station,
                'valid': {'$gte': begin - one_day, '$lt': end + one_day},
                'tmpf': {'$exists': True}}
        }]

In [None]:
pipeline_of_station('YVR', BEGIN, END, tolerance=30)

In [None]:
def fetch_station_dataframe(station, begin, end):
    """Load the observations of one station into a pandas DataFrame.

    Runs the pipeline built by ``pipeline_of_station`` with ``tolerance=30``
    (with that value the minute filter accepts every minute of the hour),
    passes each returned document through ``process_one_iem_obs`` and puts the
    results in a DataFrame, one row per observation.
    """
    with pymongo.MongoClient(host=MONGO_HOST, username=MONGO_USER, password=MONGO_PASS) as client:
        collection = client[MONGO_DB][MONGO_COLLECTION]
        query = pipeline_of_station(station, begin, end, tolerance=30)
        raw_records = list(collection.aggregate(query))

    # The connection is closed at this point; the rest is pure post-processing.
    processed_records = list(map(process_one_iem_obs, raw_records))

    return pd.DataFrame(processed_records)

## Decide the list of stations

We will only keep stations that have good coverage.

In [None]:
# Bind the MongoDB connection to its own name: at notebook top level,
# `with ... as client` would overwrite the dask.distributed `client` created
# earlier and leave a closed MongoClient behind under that name.
with pymongo.MongoClient(host=MONGO_HOST, username=MONGO_USER, password=MONGO_PASS) as mongo_client:
    db = MongoIEMDatabase(mongo_client, db=MONGO_DB, collection=MONGO_COLLECTION)
    station_info = db.station_info()

In [None]:
station_info

### Take station from a coverage csv instead

See 033-observation-coverage.ipynb

In [None]:
coverage = pd.read_csv(DATA_DIR / '2021-04-23-coverages.csv')

In [None]:
coverage.head()

In [None]:
len(coverage)

In [None]:
# Keep the stations whose coverage score is above 0.99.
has_good_coverage = coverage['coverage'] > 0.99
stations = coverage.loc[has_good_coverage, 'station'].tolist()

We remove a small set of stations whose coverage score is misleading: there are gaps in their observations even though they appear to cover the years we target well.
Ideally we would filter them out automatically, but for now the list is maintained by hand.

In [None]:
# Stations dropped by hand because of gaps in their observations.
EXCLUDED_STATIONS = {'BAD', 'CQC', 'OFP', 'RKD'}

# Filter in place of a set difference: set ordering of strings changes from
# one Python process to the next, which would shuffle the station axis of the
# cube between runs. dict.fromkeys removes duplicates while keeping the order.
stations = [s for s in dict.fromkeys(stations) if s not in EXCLUDED_STATIONS]

In [None]:
len(stations)

In [None]:
coverage

## Interpolate observations at the forecast valid times

In [None]:
# Observation fields copied into the dataset returned by station_data.
DATA_COLUMNS = ['obs_2r', 'obs_p01i', 'obs_2t']

def station_data(station, begin, end):
    """Fetch the observations of one station as an xarray Dataset.

    Args:
        station: station identifier.
        begin: start of the requested period.
        end: end of the requested period.

    Returns:
        A Dataset with one variable per entry of ``DATA_COLUMNS`` along a
        ``valid`` dimension whose coordinate holds the observation times in
        increasing order, or ``None`` when the fetched frame lacks
        ``obs_valid`` or any of the data columns (this includes the case where
        no observation was found).
    """
    df = fetch_station_dataframe(station, begin, end)

    # Every column read below must be present, otherwise the station is
    # reported as having no usable data instead of raising a KeyError.
    required_columns = ['obs_valid', *DATA_COLUMNS]
    if any(col not in df.columns for col in required_columns):
        return None

    # Interpolating along `valid` needs a sorted coordinate without repeated
    # values, and the database does not promise any ordering.
    df = df.sort_values('obs_valid').drop_duplicates(subset='obs_valid')

    series = {
        col: xr.DataArray(df[col].to_numpy(), dims=['valid'])
        for col in DATA_COLUMNS
    }
    valid = xr.DataArray(df['obs_valid'].to_numpy(), dims=['valid'])

    return xr.Dataset(series).assign_coords(valid=valid)

In [None]:
def obs_of_forecast(station, begin, end):
    """Interpolate one station's observations onto the 3-hourly forecast grid.

    The grid starts at ``begin + 3 hours`` and stops before ``end`` (``end``
    itself is excluded). ``obs_2t`` and ``obs_2r`` are interpolated linearly,
    ``obs_p01i`` takes the value of the nearest observation.

    Args:
        station: station identifier.
        begin: start of the forecast period.
        end: end of the forecast period (exclusive).

    Returns:
        A Dataset with dimensions ``station`` (length 1) and ``valid``, or
        ``None`` when ``station_data`` has nothing for this station.
    """
    observations = station_data(station, begin, end)

    if observations is None:
        return None

    # Build the grid from the arguments, not from the notebook-level constants.
    # The right bound is removed with a mask instead of date_range's `closed`
    # keyword, which pandas >= 2.0 no longer accepts.
    step = pd.Timedelta(hours=3)
    forecast_valid = pd.date_range(start=begin + step, end=end, freq=step)
    forecast_valid = forecast_valid[forecast_valid < end]

    linear_interp = observations[['obs_2t', 'obs_2r']].interp(valid=forecast_valid, method='linear')
    nearest_interp = observations[['obs_p01i']].interp(valid=forecast_valid, method='nearest')

    merged = xr.merge([nearest_interp, linear_interp])

    return merged.expand_dims(station=[station])

In [None]:
cyul_interp = obs_of_forecast('CYUL', BEGIN, END)

In [None]:
cyul_interp

In [None]:
# Raw (non-interpolated) observations for the same station as cyul_interp; the
# station code now matches the variable name.
cyul_data = station_data('CYUL', BEGIN, END)

In [None]:
cyul_data

## Gap to nearest observation

For every GDPS model output time and every station, this section computes the time gap to the nearest observation.
The goal is to detect large gaps in the observation data so that stations with holes in their coverage can be filtered out.

In [None]:
def compute_obs_deltas(model_times, observations):
    """Return, for each model time, the time gap to the closest observation.

    Args:
        model_times: index of model output times; must expose ``to_numpy()``
            yielding datetime64 values (e.g. a pandas DatetimeIndex).
        observations: object whose ``valid.data`` is an array of datetime64
            observation times, such as the dataset built by ``station_data``.

    Returns:
        A numpy array of ``timedelta64[m]`` with one entry per model time: the
        absolute distance to the nearest observation time.

    Raises:
        ValueError: if there is no observation time to compare against.
    """
    model_times_np = model_times.to_numpy().reshape(-1)

    obs_times = np.asarray(observations.valid.data).reshape(-1)
    # Missing timestamps cannot be the nearest observation of anything.
    obs_times = np.sort(obs_times[~np.isnat(obs_times)])

    if obs_times.size == 0:
        raise ValueError('cannot compute deltas without any observation time')

    # For each model time, only the observation just before and the one just
    # after can be the closest. Looking them up in the sorted array avoids
    # materialising the full (observations x model times) distance matrix.
    insert_at = np.searchsorted(obs_times, model_times_np)
    last = obs_times.size - 1
    previous_obs = obs_times[np.clip(insert_at - 1, 0, last)]
    next_obs = obs_times[np.clip(insert_at, 0, last)]

    best_delta = np.minimum(
        np.abs(model_times_np - previous_obs),
        np.abs(next_obs - model_times_np),
    )

    return best_delta.astype('timedelta64[m]')

In [None]:
# 3-hourly model output times over [BEGIN, END). The upper bound is removed
# with a mask rather than date_range's `closed` keyword, which pandas >= 2.0
# no longer accepts.
model_times = pd.date_range(start=BEGIN, end=END, freq=pd.Timedelta(hours=3))
model_times = model_times[model_times < END]

In [None]:
obs = station_data('RKD', BEGIN, END)

In [None]:
deltas = compute_obs_deltas(model_times, obs)

In [None]:
best_obs_df = pd.DataFrame({'delta': deltas, 'model_time': model_times})

In [None]:
best_obs_df['delta'].value_counts().sort_index()

In [None]:
(deltas > np.timedelta64(20, 'm')).sum()

In [None]:
(deltas > np.timedelta64(60, 'm')).sum()

In [None]:
(deltas > np.timedelta64(120, 'm')).sum()

In [None]:
(deltas > np.timedelta64(12, 'h')).sum()

In [None]:
# Scratch experiment: what does pandas do with a None entry in a list of records?
# NOTE(review): nothing below uses the result — presumably a check made while
# deciding how to handle stations that return None; confirm before removing.
pd.DataFrame([{'test': 1}, None])

In [None]:
def delta_statistics_of_station(station, model_times):
    """Count how many model times lie far from any observation of a station.

    Observations are requested from one day before the first model time to one
    day after the last one. For each threshold (5 minutes up to 24 hours) the
    result holds the number of model times whose nearest observation is
    further away than that threshold.

    Returns:
        A dict with the ``station`` and one count per threshold, or ``None``
        when ``station_data`` has nothing for this station.
    """
    print(station)

    one_day = np.timedelta64(1, 'D')
    begin = model_times.min() - one_day
    end = model_times.max() + one_day

    obs = station_data(station, begin, end)
    if obs is None:
        return None

    deltas = compute_obs_deltas(model_times, obs)

    # (label, amount, unit) for every gap threshold reported, in output order.
    thresholds = (
        ('5m', 5, 'm'),
        ('10m', 10, 'm'),
        ('15m', 15, 'm'),
        ('20m', 20, 'm'),
        ('60m', 60, 'm'),
        ('120m', 120, 'm'),
        ('6h', 6, 'h'),
        ('12h', 12, 'h'),
        ('24h', 24, 'h'),
    )

    statistics = {'station': station}
    for label, amount, unit in thresholds:
        statistics[label] = (deltas > np.timedelta64(amount, unit)).sum()

    return statistics

In [None]:
delta_statistics_of_station('CYUL', model_times)

### Compute the gap statistics for all stations

In [None]:
def do_one_station(station):
    """Gap statistics of one station against the notebook-level model_times."""
    return delta_statistics_of_station(station, model_times)

# Fan the stations out over 12 worker processes; results arrive in completion
# order and are appended as they come in.
results = []
with multiprocessing.Pool(processes=12) as pool:
    station_names = coverage['station']
    progress = tqdm(pool.imap_unordered(do_one_station, station_names), total=len(coverage))
    results.extend(progress)

In [None]:
deltas_stats_df = pd.DataFrame([r for r in results if r is not None])

In [None]:
delta_statistics_of_station('CYVR', model_times)

In [None]:
deltas_stats_df

In [None]:
deltas_stats_df.to_csv(DATA_DIR / '2021-05-11-delta-statistics.csv', index=False)

In [None]:
multiprocessing.Pool??

## Build the cube for all stations

In [None]:
delayed_obs_of_forecast = dask.delayed(obs_of_forecast)

In [None]:
delayed_obs_of_forecast('CYVR', BEGIN, END).compute()

In [None]:
delayeds = [delayed_obs_of_forecast(station, BEGIN, END) for station in stations]

In [None]:
len(delayeds)

In [None]:
station_datasets = dask.compute(*delayeds)

In [None]:
# obs_of_forecast returns None for stations without usable observations and
# xr.concat cannot combine None entries, so drop them before concatenating.
metar_cube = xr.concat(
    [dataset for dataset in station_datasets if dataset is not None],
    dim='station',
)

In [None]:
metar_cube

In [None]:
metar_cube.isnull().sum(dim='station').obs_2t.plot()

In [None]:
# interpolate_na must be told which dimension to fill along; gaps are filled
# along time (`valid`) independently for each station before counting nulls.
metar_cube.sel(valid=slice("2020-12-30", "2020-12-31")).interpolate_na(dim='valid').isnull().sum(dim='station').obs_2t.plot()

In [None]:
# True for every station with at least one missing obs_2t value.
missing_count_per_station = metar_cube.obs_2t.isnull().sum(dim='valid')
null_mask = missing_count_per_station > 0

In [None]:
stations_with_null = metar_cube.station[null_mask]

In [None]:
stations_with_null

In [None]:
metar_cube.to_netcdf(DATA_DIR / '2021-04-22-metarcube.netcdf')

In [None]:
metar_cube

In [None]:
coverage[coverage['station'].isin(['BAD', 'CQC', 'OFP', 'RKD'])]

In [None]:
coverage['station']

## Some validation

In [None]:
metarcube = xr.open_dataset(DATA_DIR / '2021-04-22-metarcube.netcdf')

In [None]:
metarcube

In [None]:
metarcube.obs_p01i.isnull().sum(dim='station').plot()

In [None]:
metarcube.obs_2t.mean(dim='station').plot()