In [None]:
# Reload imported modules automatically before each cell runs, so that edits
# to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Observation delta

The purpose of this notebook is to compute the delta, in minutes, between a model output and the closest observation for a station.
These deltas help decide when it is reasonable to interpolate between two observations, and when the gaps in the observations are large enough to make the data invalid.

In the second part of the notebook, we use these statistics to select the stations we keep for our post-processing dataset: those whose observations do not show huge gaps.

In [None]:
import cartopy.crs as ccrs
import cartopy.feature as cfeature 
import matplotlib.pyplot as plt
import os
import pathlib
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns

from smc01.interpolate.obs import MongoIEMDatabase

In [None]:
# Root directory for this notebook's input and output CSV files.
# os.environ is indexed directly so that a missing DATA_DIR fails with a
# KeyError naming the variable, rather than the opaque TypeError raised by
# pathlib.Path(None).
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])

# Connection settings passed to pymongo.MongoClient and MongoIEMDatabase.
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_USER = None  # None: no username is passed to MongoClient
MONGO_PASS = None  # None: no password is passed to MongoClient
MONGO_DB = 'smc01_raw_obs'
MONGO_COLLECTION = 'iem'
ADMIN_DB = "admin"  # used as MongoClient's authSource

In [None]:
def compute_obs_deltas(model_times, observations):
    """Return, for each model time, the gap to its nearest observation.

    Args:
        model_times: object exposing ``to_numpy()`` that yields the model
            valid times (presumably a pandas Series or Index of datetimes;
            confirm against the caller).
        observations: object whose ``valid.data`` attribute is an array of
            observation times supporting ``reshape``.

    Returns:
        Array with one entry per model time holding the smallest absolute
        difference to any observation time, cast to ``timedelta64[m]``.
    """
    # Model times run along the columns, observation times down the rows, so
    # broadcasting the subtraction produces every (observation, model) pair.
    model_row = model_times.to_numpy().reshape(1, -1)
    obs_column = observations.valid.data.reshape(-1, 1)

    pairwise_gaps = np.abs(obs_column - model_row)

    # Reduce over the observation axis: keep the closest one per model time.
    closest_gap = pairwise_gaps.min(axis=0)
    return closest_gap.astype('timedelta64[m]')

In [None]:
def delta_statistics_of_station(station, model_times):
    """Count, for one station, how often the nearest observation is too far away.

    Observations are requested through ``station_data`` for a window that
    extends one day before the earliest and one day after the latest model
    time. The station identifier is printed as a progress trace.

    Args:
        station: station identifier, forwarded to ``station_data`` and stored
            in the result under ``'station'``.
        model_times: model valid times; must support ``min()``/``max()`` and
            whatever ``compute_obs_deltas`` requires.

    Returns:
        ``None`` when ``station_data`` returns ``None``. Otherwise a dict with
        the station under ``'station'`` and, for each threshold label
        (``'5m'`` ... ``'24h'``), the number of model times whose delta to the
        nearest observation is strictly greater than that threshold.
    """
    print(station)

    one_day = np.timedelta64(1, 'D')
    begin = model_times.min() - one_day
    end = model_times.max() + one_day

    obs = station_data(station, begin, end)
    if obs is None:
        return None

    deltas = compute_obs_deltas(model_times, obs)

    # (result key, amount, numpy timedelta unit), in output order.
    thresholds = (
        ('5m', 5, 'm'),
        ('10m', 10, 'm'),
        ('15m', 15, 'm'),
        ('20m', 20, 'm'),
        ('60m', 60, 'm'),
        ('120m', 120, 'm'),
        ('6h', 6, 'h'),
        ('12h', 12, 'h'),
        ('24h', 24, 'h'),
    )

    stats = {'station': station}
    for label, amount, unit in thresholds:
        stats[label] = (deltas > np.timedelta64(amount, unit)).sum()
    return stats

# Make selection

In [None]:
# Load the delta statistics table from a CSV file under DATA_DIR.
df = pd.read_csv(DATA_DIR / '2021-05-11-delta-statistics.csv')

In [None]:
# Display the loaded statistics table.
df

In [None]:
# Reject every station that has any gap of more than 24 hours.
filtered = df.loc[df['24h'].eq(0)]

In [None]:
# Display the stations that remain after the 24-hour filter.
filtered

In [None]:
# Allow a gap of more than 1 hour about twice a month (fewer than 48 occurrences).
filtered = filtered.loc[filtered['60m'].lt(48)]

In [None]:
# Display the stations that remain after the 1-hour filter.
filtered

In [None]:
# Allow a gap of more than 12 hours about once a year (fewer than 2 occurrences).
filtered = filtered.loc[filtered['12h'].lt(2)]

In [None]:
# Display the stations that remain after the 12-hour filter.
filtered

In [None]:
# Look up station CYUL in the selection; an empty result means it was filtered out.
filtered.loc[filtered['station'].eq('CYUL')]

In [None]:
# Count how many selected stations share each value of the '20m' statistic.
filtered['20m'].value_counts()

In [None]:
# Display the final selection of stations.
filtered

In [None]:
# Save the selected stations with their statistics as CSV under DATA_DIR,
# without writing the row index.
filtered.to_csv(DATA_DIR / '2021-05-11-selected-with-statistics.csv', index=False)

### Visualize selection

In [None]:
# Query the station information for the selected stations. The client is used
# as a context manager so the connection is closed when the block exits.
with pymongo.MongoClient(
    host=MONGO_HOST,
    port=MONGO_PORT,
    username=MONGO_USER,
    password=MONGO_PASS,
    authSource=ADMIN_DB,
) as client:
    db = MongoIEMDatabase(client, db=MONGO_DB, collection=MONGO_COLLECTION)
    station_info = db.station_info(stations=filtered['station'])

In [None]:
# Inspect which columns the station information provides.
station_info.columns

In [None]:
# Map projection for the station plot: a near-side perspective view centred
# on longitude -73.0 and latitude 48.
proj = ccrs.NearsidePerspective(central_longitude=-73.0, central_latitude=48)


In [None]:
# Create the figure and the map axes explicitly instead of going through
# pyplot's implicit "current figure", so that sizing and saving always act on
# this figure.
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(1, 1, 1, projection=proj)

ax.add_feature(cfeature.LAND)
# The 'lon'/'lat' columns are plotted with a PlateCarree transform, so they are
# interpreted as longitude/latitude and reprojected onto the map projection.
ax.scatter(x=station_info['lon'], y=station_info['lat'], transform=ccrs.PlateCarree())
ax.set_title('Location of selected stations')
fig.savefig('smc01_stations.png', dpi=200)

In [None]:
plt.gfc().savefig??

In [None]:
# Display the savefig method. The original `plt.gfc()` was a typo for
# `plt.gcf()` and raised AttributeError; the class attribute is used so that
# no empty figure is created.
plt.Figure.savefig