# GDPS Metar Error

The purpose of this notebook is to compute the error of the GDPS model's 2 m temperature forecasts with respect to Canadian METAR observations.

In [None]:
import dask.array as da
import dask_jobqueue
import dask.distributed
import datetime
import itertools
import matplotlib.pyplot as plt
import os
import numpy as np
import pathlib
import pandas as pd
import xarray as xr

from pymongo import MongoClient

In [None]:
# Root data directory, read from the DATA_DIR environment variable.
# os.environ[...] is used rather than os.getenv(...) so that an unset variable
# fails immediately with KeyError('DATA_DIR') instead of the opaque TypeError
# that pathlib.Path(None) would raise.
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])
# Directory holding the GDPS GRIB2 files read by this notebook.
GDPS_DIR = DATA_DIR / '2021-02-02-one-week-sample/'

# MongoDB connection settings for the observation database.
MONGO_URL = 'localhost'
MONGO_PORT = 27017
USERNAME = None
PASSWORD = None
ADMIN_DB = 'admin'
DB = 'smc01_raw_obs_test'
COLLECTION = 'iem'

In [None]:
# Extra shell commands added to the SLURM job script ahead of the worker
# launch: load the user's profile, then activate the project conda env.
worker_setup_commands = [
    'source ~/.bash_profile',
    'conda activate smc01',
]
cluster = dask_jobqueue.SLURMCluster(
    name='smc01-dask',
    env_extra=worker_setup_commands,
)

In [None]:
cluster.scale(jobs=4)

In [None]:
client = dask.distributed.Client(cluster)

In [None]:
client

# Fetch GDPS data

In [None]:
# sorted() consumes the glob generator directly and returns a list, so no
# intermediate list() is needed.
gdps_files = sorted(pathlib.Path(GDPS_DIR).glob('CMC_glb_latlon.24x.24_*.grib2'))

In [None]:
def nest_filenames(files):
    """Group GDPS file paths by model pass and return the groups in pass order.

    The pass identifier is the 10 characters at positions 22-31 of each
    file's stem (presumably the ``YYYYMMDDHH`` run timestamp that follows the
    ``CMC_glb_latlon.24x.24_`` prefix -- confirm against the file naming).

    Args:
        files: Iterable of file paths (``pathlib.Path`` or ``str``).

    Returns:
        A list of lists, one inner list per pass, ordered by pass identifier.
        Within a pass the items keep the order in which they appeared in
        ``files`` and are returned unchanged.
    """
    # Location of the pass identifier inside the file stem.
    pass_slice = slice(22, 32)

    passes = {}
    for f in files:
        # pathlib.Path(f) lets plain string paths be grouped as well.
        pass_name = pathlib.Path(f).stem[pass_slice]
        passes.setdefault(pass_name, []).append(f)

    return [passes[name] for name in sorted(passes)]

In [None]:
nested_files = nest_filenames(gdps_files)

In [None]:
# Restrict cfgrib to the GRIB messages whose shortName is '2t'.
grib_filter = {'filter_by_keys': {'shortName': '2t'}}

# nested_files is a list of lists: the outer level is concatenated along
# 'time' and the inner level along 'step'.
gdps = xr.open_mfdataset(
    nested_files,
    engine='cfgrib',
    combine='nested',
    concat_dim=['time', 'step'],
    compat='no_conflicts',
    parallel=True,
    backend_kwargs=grib_filter,
)

In [None]:
gdps

In [None]:
# Group by forecast step and average over all remaining dimensions (the
# Ellipsis argument), then evaluate the lazy result with compute().
mean_temp_by_step = gdps.groupby('step').mean(...).compute()

In [None]:
plt.plot(mean_temp_by_step.t2m.data)

# Fetch observation data

In [None]:
# Earliest forecast valid time; .item() presumably yields integer nanoseconds
# since the epoch (hence the division by 1e9 below).
begin_date = gdps.valid_time.min().data.item()
# Convert as UTC, not local time: fromtimestamp() would shift the value by the
# machine's UTC offset, whereas the later cells of this notebook build naive
# UTC datetimes with utcfromtimestamp() for the same 'valid' field.
begin_date = datetime.datetime.utcfromtimestamp(begin_date // 1e9)

In [None]:
# Latest forecast valid time; .item() presumably yields integer nanoseconds
# since the epoch (hence the division by 1e9 below).
end_date = gdps.valid_time.max().data.item()
# Convert as UTC, not local time: fromtimestamp() would shift the value by the
# machine's UTC offset, whereas the later cells of this notebook build naive
# UTC datetimes with utcfromtimestamp() for the same 'valid' field.
end_date = datetime.datetime.utcfromtimestamp(end_date // 1e9)

In [None]:
begin_date, end_date

In [None]:
# Client for the MongoDB server that stores the observations; credentials are
# authenticated against ADMIN_DB.
mongo_client = MongoClient(
    host=MONGO_URL,
    port=MONGO_PORT,
    username=USERNAME,
    password=PASSWORD,
    authSource=ADMIN_DB,
)

In [None]:
# Select the database through the DB constant instead of repeating its name as
# an attribute, so changing the configuration cell is enough to switch databases.
db = mongo_client[DB]

In [None]:
# Select the collection through the COLLECTION constant instead of repeating
# its name as an attribute.
collection = db[COLLECTION]

In [None]:
# Select observations whose 'valid' timestamp lies in
# [begin_date + 1 day, end_date).
window_start = begin_date + datetime.timedelta(days=1)
query = {'valid': {'$gte': window_start, '$lt': end_date}}

In [None]:
collection.count_documents(query)

In [None]:
collection.find_one(query)

In [None]:
observations = list(collection.find(query))

In [None]:
df = pd.DataFrame(observations)

In [None]:
# Fahrenheit -> Celsius conversion (assumes 'tmpf' holds degrees Fahrenheit -- TODO confirm).
df['tmp'] = (df['tmpf'] - 32) * 5/9

In [None]:
df

In [None]:
df[df.valid.dt.minute == 0]

# Interpolate at one station

In [None]:
one_obs = df.iloc[701]

In [None]:
one_obs

In [None]:
# Index the 'step' dimension by valid time so that interp() can be queried
# with observation timestamps. The dataset opened earlier is bound to `gdps`;
# `gdps_dataset` is not defined in the cells above and raised NameError.
d = gdps.set_index(step='valid_time')

In [None]:
d

In [None]:
d.interp(latitude=one_obs['lat'], longitude=one_obs['lon'], step=one_obs['valid']).compute()

In [None]:
# All three indexers share the 'station' dimension, so the interpolation is
# evaluated once per observation row rather than on the outer product of
# times, latitudes and longitudes.
station_times = xr.DataArray(df['valid'], dims='station')
station_lats = xr.DataArray(df['lat'], dims='station')
station_lons = xr.DataArray(df['lon'], dims='station')
at_stations = d.interp(
    step=station_times,
    latitude=station_lats,
    longitude=station_lons,
)

In [None]:
at_stations

In [None]:
computed_temps = at_stations.t2m.compute()

In [None]:
# Root-mean-square difference between forecast and observation. The 273.15
# offset presumably converts the forecast from kelvin to the Celsius scale of
# df['tmp'] -- confirm the units of t2m.
residual = computed_temps.data - df['tmp'] - 273.15
np.sqrt(np.square(residual).mean())

In [None]:
xr.DataArray(df['valid'], dims='station')

In [None]:
# Spatial interpolation only: both indexers share the 'obs' dimension, giving
# one interpolated point per observation row.
obs_lats = xr.DataArray(df['lat'], dims='obs')
obs_lons = xr.DataArray(df['lon'], dims='obs')
at_locations = d.interp(latitude=obs_lats, longitude=obs_lons)

In [None]:
at_locations

In [None]:
at_locations.interp(
    step=xr.DataArray(df['valid'], dims='obs')
).t2m[1000:1010].compute()

# Integration: fetch data from group by

In [None]:
gdps.groupby('valid_time')

In [None]:
# Take the first (label, sub-dataset) pair produced by grouping on valid_time.
valid_time, dataset = next(iter(gdps.groupby('valid_time')))
# Convert the label to a naive UTC datetime; the division by 1e9 assumes
# tolist() returns nanoseconds since the epoch -- TODO confirm.
valid_time = datetime.datetime.utcfromtimestamp(valid_time.tolist() / 1e9)

In [None]:
# Aggregation pipeline selecting on-the-hour observations (minute == 0) whose
# 'valid' timestamp lies within 15 minutes of valid_time.
pipeline = [
    {
        # Expose the minute component of 'valid' so it can be matched below.
        '$addFields': {
            'minute': {
                '$minute': '$valid'
            }
        }
    },
    {'$match': {
        'minute': 0,
        # Comparison operators must be nested under the field they apply to;
        # placed at the top level of $match they are rejected by MongoDB as
        # unknown top-level operators.
        'valid': {
            '$gte': valid_time - datetime.timedelta(minutes=15),
            '$lt': valid_time + datetime.timedelta(minutes=15),
        },
    }}
]

In [None]:
# Take the first (label, sub-dataset) pair produced by grouping on valid_time.
valid_time, group = next(iter(gdps.groupby('valid_time')))
# Convert the label to a naive UTC datetime; the division by 1e9 assumes
# tolist() returns nanoseconds since the epoch -- TODO confirm.
valid_time = datetime.datetime.utcfromtimestamp(valid_time.tolist() / 1e9)

In [None]:
print(valid_time)

# Observations stamped exactly at valid_time that carry a 'tmpf' field.
query = {'valid': {'$eq': valid_time}, 'tmpf': {'$exists': True}}
df = pd.DataFrame(collection.find(query))
df['temp'] = (df['tmpf'] - 32) * 5/9

# Interpolate the forecast group at each observation's coordinates; both
# indexers share the 'station' dimension.
station_coords = {
    'latitude': xr.DataArray(df['lat'], dims='station'),
    'longitude': xr.DataArray(df['lon'], dims='station'),
}
at_stations = group.interp(**station_coords)

observed = xr.DataArray(df['temp'], dims='station')

# Shift the forecast by 273.15 before comparing it with `observed`.
temps_at_stations = (at_stations.t2m - 273.15).compute()

In [None]:
df[df['tmpf'].isna()]

In [None]:
observed

In [None]:
(temps_at_stations - observed).mean()

In [None]:
observed.shape

In [None]:
temps_at_stations - observed

In [None]:
df['temp']

In [None]:
temps_at_stations - df['temp'].to_numpy()

In [None]:
# For every forecast valid time, interpolate the GDPS fields at the stations
# that reported at that exact timestamp and print the RMSE against the
# observed temperature.
for valid_time, group in gdps.groupby('valid_time'):
    # Convert the group label to a naive UTC datetime so it can be compared
    # with the 'valid' field (assumes tolist() returns nanoseconds since the
    # epoch -- TODO confirm).
    valid_time = datetime.datetime.utcfromtimestamp(valid_time.tolist() / 1e9)
    print(valid_time)
    query = {
        'valid': {
            '$eq': valid_time
        },
        'tmpf': {
            '$exists': True
        }
    }
    df = pd.DataFrame(collection.find(query))
    if df.empty:
        # An empty result produces a DataFrame without columns, so df['tmpf']
        # would raise KeyError; skip valid times that have no observations.
        print('no observations')
        print('---')
        continue
    df['temp'] = (df['tmpf'] - 32) * 5/9

    # Both indexers share the 'station' dimension: one interpolated point per
    # observation row. The 273.15 offset is applied to every data variable.
    at_stations = group.interp(
        latitude=xr.DataArray(df['lat'], dims='station'),
        longitude=xr.DataArray(df['lon'], dims='station')) - 273.15

    observed = xr.DataArray(df['temp'], dims='station')

    # Root-mean-square error over the stations.
    error = (((at_stations - observed)**2).mean('station'))**(0.5)

    print(error.time.data)
    # NOTE(review): if step is timedelta64[ns], dividing by a float keeps the
    # timedelta dtype; dividing by np.timedelta64(1, 'h') would give float
    # hours -- confirm the dtype before changing.
    print(error.step.data / (1e9 * 60 * 60))
    print(error.t2m.data.compute())
    print('---')