# Prepare dataset

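This notebook prepares a statistical downscaling dataset. It opens the GDPS
GRIB files using only Xarray (through the cfgrib engine), fetches station
coordinates from MongoDB, and interpolates the model fields at those stations.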

In [None]:
# Enable the IPython autoreload extension; mode 2 reloads modules before
# each cell is executed, so edits to imported code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import dask
import dask.array as da
import dask.bag as db
import dask_jobqueue
import dask.distributed
import datetime
import itertools
import matplotlib.pyplot as plt
import os
import numpy as np
import pathlib
import pandas as pd
import pymongo
import seaborn as sns
import time
import xarray as xr

In [None]:
# Root of the data tree, taken from the DATA_DIR environment variable.
# Indexing os.environ raises a KeyError naming the variable when it is unset,
# instead of the opaque TypeError that pathlib.Path(None) would produce.
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])
# Directory holding the GDPS GRIB2 sample files.
GDPS_DIR = DATA_DIR / '2021-02-02-one-week-sample/'

# MongoDB connection settings for the station observations.
MONGO_URL = 'localhost'
MONGO_PORT = 27017
USERNAME = None
PASSWORD = None
ADMIN_DB = 'admin'  # passed as authSource when connecting
DB = 'smc01_raw_obs_test'
COLLECTION = 'iem'

In [None]:
# Describe the SLURM-backed dask cluster. The env_extra entries are shell
# commands handed to dask_jobqueue to set up the conda environment for the
# workers.
# NOTE(review): newer dask-jobqueue releases appear to rename `env_extra` to
# `job_script_prologue` - confirm against the installed version.
cluster = dask_jobqueue.SLURMCluster(
    name='smc01-dask',
    env_extra=[
        'source ~/.bash_profile',
        'conda activate smc01',
    ],
)

In [None]:
# Ask the cluster to scale to 8 jobs.
cluster.scale(jobs=8)

In [None]:
# Connect a dask.distributed client to the cluster created above.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client summary in the notebook output.
client

In [None]:
# Collect every GDPS GRIB2 file of the sample directory, in sorted path order.
# sorted() consumes the glob generator directly and returns a list.
gdps_files = sorted(
    pathlib.Path(GDPS_DIR).glob('CMC_glb_latlon.24x.24_*.grib2')
)

In [None]:
def nest_filenames(files):
    """Group files by pass and return the groups ordered by pass name.

    The pass name of a file is characters 22 to 31 of its stem (presumably
    the YYYYMMDDHH run timestamp that follows the
    ``CMC_glb_latlon.24x.24_`` prefix - confirm against the file naming).

    Args:
        files: Iterable of path objects exposing a ``stem`` attribute.

    Returns:
        A list of lists of paths: one inner list per distinct pass name,
        sorted by pass name. Inside each inner list the files keep the
        order in which they appeared in ``files``.
    """
    by_pass = {}
    for path in files:
        # setdefault creates the group the first time a pass name is seen.
        by_pass.setdefault(path.stem[22:32], []).append(path)

    return [by_pass[pass_name] for pass_name in sorted(by_pass)]

In [None]:
# Group the GRIB files into one list per pass, ordered by pass name.
nested_files = nest_filenames(gdps_files)

In [None]:
# Open the GRIB messages whose level type is 'heightAboveGround' and whose
# stepType is 'instant'. nested_files is a list of per-pass file lists:
# the outer level is concatenated along 'time', the inner one along 'step'.
gdps = xr.open_mfdataset(
    nested_files,
    engine='cfgrib',
    combine='nested',
    concat_dim=['time', 'step'],
    compat='no_conflicts',
    parallel=True,
    backend_kwargs={
        'filter_by_keys': {
            'typeOfLevel': 'heightAboveGround',
            'stepType': 'instant',
        },
    },
)

In [None]:
# Open the GRIB messages whose level type is 'surface' and whose stepType is
# 'instant'. nested_files is a list of per-pass file lists: the outer level
# is concatenated along 'time', the inner one along 'step'.
gdps_surface = xr.open_mfdataset(
    nested_files,
    engine='cfgrib',
    combine='nested',
    concat_dim=['time', 'step'],
    compat='no_conflicts',
    parallel=True,
    backend_kwargs={
        'filter_by_keys': {
            'typeOfLevel': 'surface',
            'stepType': 'instant',
        },
    },
)

In [None]:
# Open the GRIB messages whose level type is 'isobaricInhPa' and whose
# stepType is 'instant'. nested_files is a list of per-pass file lists: the
# outer level is concatenated along 'time', the inner one along 'step'.
gdps_iso = xr.open_mfdataset(
    nested_files,
    engine='cfgrib',
    combine='nested',
    concat_dim=['time', 'step'],
    compat='no_conflicts',
    parallel=True,
    backend_kwargs={
        'filter_by_keys': {
            'typeOfLevel': 'isobaricInhPa',
            'stepType': 'instant',
        },
    },
)

In [None]:
# Display the height-above-ground dataset in the notebook output.
gdps

# 2. Fetch station coordinates

In [None]:
# Earliest valid time covered by the model data, as a naive UTC datetime.
# NOTE(review): .item() presumably yields an integer number of nanoseconds
# since the epoch (hence the floor division by 1e9) - confirm the dtype.
begin_ns = gdps.valid_time.min().data.item()
begin_date = datetime.datetime.utcfromtimestamp(begin_ns // 1e9)

In [None]:
# Latest valid time covered by the model data, as a naive UTC datetime.
# NOTE(review): .item() presumably yields an integer number of nanoseconds
# since the epoch (hence the floor division by 1e9) - confirm the dtype.
end_ns = gdps.valid_time.max().data.item()
end_date = datetime.datetime.utcfromtimestamp(end_ns // 1e9)

In [None]:
# Connect to the MongoDB server described by the configuration constants.
mongo_client = pymongo.MongoClient(
    host=MONGO_URL,
    port=MONGO_PORT,
    username=USERNAME,
    password=PASSWORD,
    authSource=ADMIN_DB,
)

In [None]:
# Select the database named by the DB setting instead of repeating its name
# here, so that editing the configuration cell is enough to switch databases.
# NOTE: this rebinds `db`, which the import cell also uses as the alias for
# dask.bag; the name is kept because other cells may rely on it.
db = mongo_client[DB]

In [None]:
# Select the collection named by the COLLECTION setting instead of repeating
# its name here, so that the configuration cell stays the single source of
# truth.
collection = db[COLLECTION]

In [None]:
# Filter on the 'valid' field: from one day after the first model valid time
# (inclusive) up to the last model valid time (exclusive).
# NOTE(review): none of the cells visible here pass `query` to MongoDB -
# confirm whether the station lookup was meant to use it.
query = {
    'valid': {
        '$gte': begin_date + datetime.timedelta(days=1),
        '$lt': end_date,
    },
}

In [None]:
# Distinct values of the 'station' field across the whole collection.
stations = collection.distinct('station')

In [None]:
# Build one metadata record per station, reading the coordinates from a
# single observation of that station (one find_one query per station).
station_infos = []

for station in stations:
    one_obs = collection.find_one({'station': station})
    info = {'station': station}
    for field in ('lat', 'lon', 'elevation'):
        info[field] = one_obs[field]
    station_infos.append(info)

In [None]:
# Tabulate the station records: one row per entry of station_infos.
station_df = pd.DataFrame(station_infos)

In [None]:
# Display the station table in the notebook output.
station_df

# 3. Interpolate model at stations

In [None]:
# Target coordinates share the 'station' dimension, so the interpolation
# yields one point per station rather than a latitude x longitude grid.
# NOTE(review): assumes station longitudes use the same convention as the
# grid's `longitude` coordinate - confirm.
station_lats = xr.DataArray(station_df['lat'], dims='station')
station_lons = xr.DataArray(station_df['lon'], dims='station')
at_stations = gdps.interp(latitude=station_lats, longitude=station_lons)

In [None]:
# Trigger the computation and replace the lazy dataset with its result.
at_stations = at_stations.compute()