In [None]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Investigate land mask

In the precipitation observations, it seems like the land mask was modified during July 2006.
Consequently, I would like to revise the land mask for both temperature and precipitation.
Since the land mask is larger after July 2006, my main idea is to simply remove the supplementary tiles from the mask at later dates.

On top of investigating, this notebook produces two new observation files where we make the land sea masks consistent across the datasets.

In [None]:
import calplot
import climetlab as cml
import dask
import dask.array as da
import dask.distributed
import dask_jobqueue
import numpy as np
import pandas as pd
import pathlib
import xarray as xr

from crims2s.util import fix_dataset_dims

In [None]:
# Root directory of the data; the input files opened below are resolved relative to `data_path`.
DATA_DIR = '***BASEDIR***/'
data_path = pathlib.Path(DATA_DIR)

## Boot dask cluster

In [None]:
# Dask cluster whose workers are launched as SLURM jobs; the `env_extra` commands
# (source the bash profile, activate the `s2s` conda env) are added to each job script.
# NOTE(review): recent dask-jobqueue releases rename `env_extra` to
# `job_script_prologue` — confirm against the installed version.
cluster = dask_jobqueue.SLURMCluster(env_extra=['source ***HOME***.bash_profile','conda activate s2s'])

In [None]:
# Ask for two SLURM jobs.
cluster.scale(jobs=2)

In [None]:
# Connect a client to the cluster for the dask computations below.
client = dask.distributed.Client(cluster)

In [None]:
client

## Land sea mask

In [None]:
training_path = data_path / 'training-input'
# Keep only the ECMWF land-sea-mask files. `iterdir()` yields entries in arbitrary
# order, so sort them to make the preview and the load order reproducible.
lsm_files = sorted(f for f in training_path.iterdir() if 'lsm' in f.stem and 'ecmwf' in f.stem)

In [None]:
lsm_files[:10]

In [None]:
lsm = xr.open_mfdataset(lsm_files, preprocess=fix_dataset_dims)

In [None]:
lsm

In [None]:
lsm.isel(lead_time=1, realization=0, forecast_dayofyear=0, forecast_year=0).lsm.plot()

In [None]:
lsm_sea_mask = lsm.lsm == 0.0

In [None]:
lsm_sea_mask.isel(forecast_year=0, forecast_dayofyear=0, lead_time=1, realization=0).plot()

In [None]:
(lsm.lsm.isel(lead_time=1) == 0.0).sum(dim=['latitude', 'longitude', 'realization']).plot()

In [None]:
(lsm.lsm == 0.0).sum(dim=['latitude', 'longitude', 'realization', 'forecast_dayofyear', 'forecast_year']).plot()

It looks like the number of purely sea tiles changes after a certain lead time. Let's investigate this further.

In [None]:
zero_by_lead = (lsm.lsm == 0.0).sum(dim=['latitude', 'longitude', 'realization', 'forecast_dayofyear', 'forecast_year'])

In [None]:
zero_by_lead.idxmin(dim='lead_time').compute().astype('timedelta64[D]').item() / (3600 * 24 * 1e9)

The mask seems to start being wrong at a lead time of 16 days.

In [None]:
(lsm.lsm.sel(lead_time='15D') == 0.0).sum().compute()

In [None]:
(lsm.lsm.sel(lead_time='16D') == 0.0).sum().compute()

In [None]:
lead_0_mask = (lsm.lsm.sel(lead_time='15D').isel(forecast_year=0, forecast_dayofyear=0) == 0)

In [None]:
lead_16_mask = (lsm.lsm.sel(lead_time='16D').isel(forecast_year=0, forecast_dayofyear=0) == 0)

In [None]:
(lead_0_mask ^ ~lead_16_mask).plot()

In [None]:
(lsm.lsm == 0.0).sum(dim=['latitude', 'longitude', 'forecast_dayofyear', 'realization']).plot(hue='forecast_year')

The behavior is consistent for every year... The model has fewer tiles where the land sea mask equals zero after 16 days. To keep in mind.

## Temperature observations

In [None]:
t2m_raw_obs = xr.open_dataset(data_path / 'obs_t2m_raw.nc', chunks=-1)

In [None]:
t2m_raw_obs

In [None]:
t2m_raw_obs.isel(time=0).t2m.plot()

In [None]:
t2m_raw_obs.isnull().sum(dim=['latitude', 'longitude']).t2m.plot()

Once again there seems to be a series of gaps.

In [None]:
interpolated = t2m_raw_obs.interpolate_na(dim='time', max_gap='30D')

In [None]:
interpolated.isnull().sum(dim=['latitude', 'longitude']).t2m.plot()

In [None]:
interpolated.sel(time=slice(None, '2020-01-01')).isnull().sum(dim=['latitude', 'longitude']).idxmax(dim='time').compute()

There seems to be a different grid during part of 2006. I think I need a calendar plot.

In [None]:
null_counts = interpolated.sel(time=slice(None, '2020-01-01')).isnull().sum(dim=['latitude', 'longitude']).compute()

In [None]:
null_counts_pandas = pd.Series(null_counts.t2m, index=null_counts.time.data)

In [None]:
calplot.calplot(null_counts_pandas)

Actually it's for all of 2006, and then starting in 2014.

In [None]:
grid2006 = interpolated.sel(time='2006-01-01').isnull()

In [None]:
grid2000 = interpolated.sel(time='2001-01-01').isnull()

In [None]:
grid2014 = interpolated.sel(time='2014-01-01').isnull()

In [None]:
(grid2000 ^ ~grid2006).t2m.plot()

In [None]:
(grid2000 ^ ~grid2014).t2m.plot()

In [None]:
(grid2006 ^ ~grid2014).t2m.plot()

In [None]:
(grid2006 ^ ~lead_16_mask).t2m.plot()

Here I decide to apply the smallest grid to all the obs, to keep it consistent. 
This is to be validated with the organizers at some point.

In [None]:
grid2000.t2m.plot()

In [None]:
remasked = interpolated.where(~grid2006.t2m)

In [None]:
remasked

In [None]:
remasked.isnull().sum(dim=['latitude', 'longitude']).t2m.plot()

In [None]:
remasked.sel(time=slice(None, '2021-01-01')).to_netcdf('***BASEDIR***obs_t2m_interp_remask.nc')

In [None]:
!cp '***BASEDIR***obs_t2m_interp_remask.nc' {DATA_DIR}

## Precipitation observations

In [None]:
pr = xr.open_dataset(data_path / 'pr_raw_obs.nc', chunks='auto')

In [None]:
interpolated = pr.interpolate_na(dim='time', max_gap='300D')  # Gap has to be generous for some small pathological tiles. Big errors last only one day so its fine.

In [None]:
interpolated

In [None]:
null_counts = interpolated.sel(time=slice(None, '2020-01-01')).isnull().sum(dim=['latitude', 'longitude']).compute()

In [None]:
null_counts.pr.plot()

In [None]:
null_counts_pd = pd.Series(null_counts.pr, index=null_counts.time.data)

In [None]:
calplot.calplot(null_counts_pd)

There seem to be two grids: one up to July 2006 and one for the rest.

In [None]:
pr_grid_2000 = interpolated.sel(time='2000-01-01').isnull()
pr_grid_2017 = interpolated.sel(time='2017-01-01').isnull()

In [None]:
(pr_grid_2000 != pr_grid_2017).pr.plot()

Here I choose to apply the smallest grid to all data, so that it is consistent across the dataset.
This is to be validated with the organizers at some point, but it should do for now.
We can still make the trainer so that it backpropagates only where we have data.

In [None]:
remasked = interpolated.where(~pr_grid_2000)

In [None]:
remasked.isnull().sum(dim=['latitude', 'longitude']).pr.plot()

In [None]:
remasked.to_netcdf('***BASEDIR***obs_pr_interp_remask.nc')

In [None]:
!cp '***BASEDIR***obs_pr_interp_remask.nc' '***BASEDIR***/'

In [None]:
(pr_grid_2000.pr != grid2006.t2m).plot()

In [None]:
pr_grid_2000.pr.plot()

In [None]:
grid2006.t2m.plot()