In [None]:
%load_ext autoreload
%autoreload 2

# Observation coverage

Compute the coverage of station observations, i.e. how well a period is covered by the observations.
We do this by counting the number of days that are sufficiently covered.
For instance, to evaluate a GDPS output, we need at least 8 observations in a day.
Thus, we count the number of days that have at least 8 observations and consider those days well covered.
The percentage of well-covered days gives a good indication of which stations are usable and which aren't.

In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import datetime
import numpy as np
import os
import pathlib
import pandas as pd
import pymongo
import xarray as xr

from tqdm.notebook import tqdm

from smc01.interpolate.dataset_generator import process_one_iem_obs
from smc01.interpolate.obs import MongoIEMDatabase

In [None]:
# Root directory for the CSV files written and read below, taken from the
# DATA_DIR environment variable.
# NOTE(review): os.getenv returns None when DATA_DIR is unset, in which case
# pathlib.Path(None) raises TypeError.
DATA_DIR = pathlib.Path(os.getenv('DATA_DIR'))

# Connection settings for the MongoDB collection the observations are read from.
MONGO_HOST = 'localhost'
MONGO_USER = None
MONGO_PASS = None
MONGO_DB = 'smc01_raw_obs'
MONGO_COLLECTION = 'iem'

In [None]:
# Period over which coverage is evaluated; passed as (begin, end) to the
# coverage tasks built further down.
BEGIN = datetime.datetime(2018, 12, 31)
END = datetime.datetime(2021, 1, 2)

## Boot dask cluster

In [None]:
# Dask cluster backed by SLURM jobs. The env_extra commands are meant to load
# the shell profile and activate the `smc01` conda environment for the workers.
# NOTE(review): recent dask-jobqueue releases reportedly deprecate `env_extra`
# in favour of `job_script_prologue` -- confirm against the installed version.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ~/.bash_profile','conda activate smc01'],
    name='smc01-dask',
)

In [None]:
# Ask the cluster for a single SLURM job.
cluster.scale(jobs=1)

In [None]:
# Connect this notebook to the cluster; `client` is the notebook's handle on it.
client = dask.distributed.Client(cluster)

In [None]:
# Show the client summary as the cell output.
client

## Utility functions

In [None]:
def pipeline_of_station(station, begin, end, tolerance=0):
    """Build the MongoDB aggregation pipeline selecting one station's hourly observations.

    An observation is kept when it has a `tmpf` field, belongs to `station`,
    falls inside the requested period, and its `valid` timestamp lies within
    `tolerance` minutes of the top of an hour.

    Args:
        station: Station identifier matched against the `station` field.
        begin: Start of the period (datetime). The lower bound actually used
            is `begin - tolerance` minutes, inclusive.
        end: End of the period (datetime). The upper bound actually used is
            `end + tolerance` minutes, exclusive.
        tolerance: Maximum distance, in minutes, between an observation and
            the top of the hour. Also widens the period on both sides.

    Returns:
        A list of aggregation stages to pass to `Collection.aggregate`.
    """
    slack = datetime.timedelta(minutes=tolerance)

    return [
        {
            # Expose the minute (and hour) of the timestamp so the next stage
            # can filter on the distance to the top of the hour.
            '$addFields': {
                'minute': {'$minute': '$valid'},
                'hour': {'$hour': '$valid'},
            }
        },
        {
            '$match': {
                '$or': [
                    {'minute': {'$lte': tolerance}},
                    {'minute': {'$gte': 60 - tolerance}},
                ],
                'station': station,
                # Filter on the period given by the caller rather than on the
                # notebook-level BEGIN/END constants.
                'valid': {'$gte': begin - slack, '$lt': end + slack},
                'tmpf': {'$exists': True},
            }
        },
    ]

In [None]:
def fetch_station_dataframe(station, begin, end):
    """Load one station's observations from MongoDB into a DataFrame.

    The query is the aggregation pipeline from `pipeline_of_station`, built
    with a 10-minute tolerance. Every returned document is passed through
    `process_one_iem_obs` before the DataFrame is assembled.

    Args:
        station: Station identifier.
        begin: Start of the period (datetime).
        end: End of the period (datetime).

    Returns:
        A pandas DataFrame with one row per processed observation.
    """
    pipeline = pipeline_of_station(station, begin, end, tolerance=10)

    with pymongo.MongoClient(host=MONGO_HOST, username=MONGO_USER, password=MONGO_PASS) as mongo:
        collection = mongo[MONGO_DB][MONGO_COLLECTION]
        # Materialize the cursor while the connection is still open.
        raw_documents = list(collection.aggregate(pipeline))

    processed = [process_one_iem_obs(document) for document in raw_documents]
    return pd.DataFrame(processed)

In [None]:
def robust_fetch(station, begin, end):
    """Fetch a station's observations, always returning the same three columns.

    Args:
        station: Station identifier.
        begin: Start of the period (datetime).
        end: End of the period (datetime).

    Returns:
        The `station`, `obs_valid` and `obs_2t` columns of the fetched
        DataFrame, or an empty DataFrame with those columns when any of them
        is missing from the fetched data.
    """
    wanted = ['station', 'obs_valid', 'obs_2t']
    fetched = fetch_station_dataframe(station, begin, end)

    # Guard clause: without all the wanted columns there is nothing usable.
    if not set(wanted) <= set(fetched.columns):
        return pd.DataFrame(columns=wanted)

    return fetched[wanted]

In [None]:
def one_station_coverage(station, begin, end, min_daily_obs=8):
    """Compute the fraction of days in a period that are well covered by a station.

    A day is well covered when it has at least `min_daily_obs` non-null
    `obs_2t` values. Observations are grouped by the calendar date of
    `obs_valid`.

    Args:
        station: Station identifier.
        begin: Start of the period (datetime).
        end: End of the period (datetime).
        min_daily_obs: Minimum number of observations for a day to count as
            well covered. Defaults to 8.

    Returns:
        A `(station, coverage)` tuple, where `coverage` is the number of
        well-covered days divided by the number of whole days between
        `begin` and `end`. Stations without any observation get 0.0.

    Raises:
        ValueError: If the period spans less than one whole day.
    """
    # Normalize by the period that was actually requested, not by the
    # notebook-level BEGIN/END constants.
    n_days = (end - begin).days
    if n_days <= 0:
        raise ValueError(
            f'Coverage period must span at least one day, got begin={begin!r}, end={end!r}'
        )

    station_df = robust_fetch(station, begin, end)

    if len(station_df) == 0:
        return station, 0.0

    daily_counts = station_df.groupby(station_df['obs_valid'].dt.date)['obs_2t'].count()

    # "At least" min_daily_obs: a day with exactly that many observations counts.
    covered_days = int((daily_counts >= min_daily_obs).sum())

    return station, covered_days / n_days

## Compute coverage

### Get full list of stations

In [None]:
# The MongoDB connection gets its own name so that the dask `client` created
# earlier in the notebook is not rebound by this cell.
with pymongo.MongoClient(host=MONGO_HOST, username=MONGO_USER, password=MONGO_PASS) as mongo_client:
    db = MongoIEMDatabase(mongo_client, db=MONGO_DB, collection=MONGO_COLLECTION)
    station_info = db.station_info()

### Compute coverage for each station

In [None]:
# Lazy wrapper: calling it records a task instead of running the query right away.
coverage_delayed = dask.delayed(one_station_coverage)

In [None]:
# One coverage task per station listed in station_info, all over the BEGIN-END period.
delayeds = [coverage_delayed(s, BEGIN, END) for s in station_info['station']]

In [None]:
# Run all the tasks; each result is the (station, coverage) pair returned by one_station_coverage.
coverages = dask.compute(*delayeds)

In [None]:
# Tabulate the (station, coverage) pairs, one row per station.
coverage_df = pd.DataFrame(
    [{'station': name, 'coverage': value} for name, value in coverages]
)

In [None]:
# Number of stations with a coverage of exactly 1.0. The comparison is
# restricted to the numeric `coverage` column so that the string `station`
# column is not compared against a float.
(coverage_df['coverage'] == 1.0).sum()

In [None]:
# Save the coverage table under DATA_DIR; the next section reads it back from this file.
coverage_df.to_csv(DATA_DIR / '2021-04-23-coverages.csv', index=False)

## Make station info list with stations that have good coverage.

In [None]:
# Reload the saved coverage table.
# NOTE: this rebinds `coverages`, which earlier held the result of dask.compute.
coverages = pd.read_csv(DATA_DIR / '2021-04-23-coverages.csv')

In [None]:
# Show the reloaded coverage table as the cell output.
coverages

In [None]:
# Keep only the stations whose coverage is strictly above 0.995.
selection = coverages[coverages['coverage'] > 0.995]

In [None]:
# Show the selected stations as the cell output.
selection

In [None]:
# Fetch the station info restricted to the selected stations.
# The MongoDB connection gets its own name so that the dask `client` created
# earlier in the notebook is not rebound by this cell.
with pymongo.MongoClient(host=MONGO_HOST, username=MONGO_USER, password=MONGO_PASS) as mongo_client:
    db = MongoIEMDatabase(mongo_client, db=MONGO_DB, collection=MONGO_COLLECTION)
    documents = db.station_info(selection['station'])

In [None]:
# Show the station info of the selected stations as the cell output.
documents

In [None]:
# Export the selected stations. `index=False` states explicitly that the row
# index is not written, consistent with the coverage export above.
documents.to_csv(DATA_DIR / '2021-05-11-selected-stations.csv', index=False)

In [None]:
# Peek at the first lines of the exported file. `{DATA_DIR}` is interpolated
# by IPython from the Python variable of that name.
!head {DATA_DIR}/2021-05-11-selected-stations.csv

In [None]:
# Spot check: observations of a single station (CYUL) over two days.
begin = datetime.datetime(2020, 1, 26)
end = datetime.datetime(2020, 1, 28)

# The MongoDB connection gets its own name so that the dask `client` created
# earlier in the notebook is not rebound by this cell.
with pymongo.MongoClient(host=MONGO_HOST, username=MONGO_USER, password=MONGO_PASS) as mongo_client:
    db = MongoIEMDatabase(mongo_client, db=MONGO_DB, collection=MONGO_COLLECTION)
    documents = db.station_observations('CYUL', begin, end, tolerance=20)

In [None]:
# Show the fetched CYUL observations as the cell output.
documents