In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Download Raw Obs

Since the organizers' processing of the precipitation data wasn't perfect, we will download the raw obs and build our own version of the forecast-like observations for precipitation.

In [None]:
import calplot
import climetlab as cml
import datetime
import numpy as np
import pandas as pd
import pathlib
import xarray as xr

import climetlab_s2s_ai_challenge

In [None]:
# Display climetlab's settings object as the cell output.
cml.settings

In [None]:
# Load the challenge observations restricted to the 'pr' parameter and
# convert the result to an xarray object.
pr = cml.load_dataset(
    's2s-ai-challenge-observations',
    parameter=['pr'],
).to_xarray()

In [None]:
# Write the raw observations, exactly as loaded, to a NetCDF file.
pr.to_netcdf('***BASEDIR***/pr_raw_obs.nc')

In [None]:
# Display the raw observation dataset.
pr

In [None]:
# Map of the raw 'pr' field on 2008-05-30.
pr['pr'].sel(time='2008-05-30').plot()

## Find dates with odd nulls

In [None]:
# Number of missing grid cells at each time step (summed over the grid).
null_counts = (
    pr.isnull()
    .sum(dim=['latitude', 'longitude'])
)

In [None]:
# Time series of the per-time-step null counts.
null_counts['pr'].plot()

In [None]:
# Same counts as a pandas Series indexed by time, which is the input
# calplot is given in the next cell.
null_counts_pd = pd.Series(
    null_counts.pr.data,
    index=null_counts.time.data,
)

In [None]:
# Calendar view of the null counts, used to spot dates with unusual values.
calplot.calplot(null_counts_pd)

There are problems in the following months:
* September 2004
* February 2008
* December 2011
* February 2012
* September 2017
* February 2018

In [None]:
# Null count per time step from 2008-02-01 to 2008-03-28, minus the total
# null count on 2021-02-01, which serves as the baseline here.
(
    pr.sel(time=slice('2008-02-01', '2008-03-28'))
    .isnull()
    .sum(dim=['latitude', 'longitude'])
    - pr.sel(time='2021-02-01').isnull().sum()
).pr.plot()

In [None]:
# First day of every month flagged as problematic in the calendar plot.
months = [
    datetime.datetime(2004, 9, 1),
    datetime.datetime(2008, 2, 1),
    datetime.datetime(2011, 12, 1),
    datetime.datetime(2012, 2, 1),
    datetime.datetime(2017, 9, 1),
    datetime.datetime(2018, 2, 1),
]

# For each month, look at a 32-day window starting on the 1st and keep the
# time stamp whose grid has the most missing cells.
dates_with_missing = []
for date in months:
    dates_with_missing.append(
        pr.sel(time=slice(date, date + datetime.timedelta(days=32)))
        .isnull()
        .sum(dim=['latitude', 'longitude'])
        .idxmax()
        .pr.data
    )

In [None]:
# Display the worst date found in each flagged month.
dates_with_missing

In the calendar I counted 9 faulty dates, so I'll just find the argmax 9 times.

In [None]:
# Missing-cell count per time step, computed again from the raw data.
daily_nulls = (
    pr.isnull()
    .sum(dim=['latitude', 'longitude'])
)

In [None]:
# Print the 9 dates with the highest null counts: take the current maximum,
# report it, zero that date's count, and repeat.
#
# The search runs on a working copy so `daily_nulls` stays untouched and the
# cell prints the same dates every time it is re-run.
remaining_nulls = daily_nulls
for _ in range(9):
    date = remaining_nulls.idxmax().pr.data
    print(date)
    # `.where` keeps values where the condition holds and writes 0 elsewhere,
    # returning a new Dataset. (A chained `a = a.sel(...)['pr'] = 0` would
    # rebind `a` to the integer 0 before the item assignment is attempted.)
    remaining_nulls = remaining_nulls.where(remaining_nulls.time != date, 0)

In [None]:
# Fill NaNs by interpolating along the time axis; `max_gap` limits the
# filling to gaps of at most 365 days.
interpolated = pr.interpolate_na(
    dim='time',
    max_gap='365D',
)

In [None]:
# Recompute the per-time-step null counts on the interpolated data.
# Note: this rebinds `null_counts`, which previously held the raw counts.
null_counts = (
    interpolated.isnull()
    .sum(dim=['latitude', 'longitude'])
)

In [None]:
# Interpolated null counts as a pandas Series indexed by time.
# The underlying array (`.data`) is passed rather than the DataArray itself,
# matching how the raw-count Series is built earlier in the notebook, so
# pandas does not have to coerce an xarray object.
null_counts_pd = pd.Series(
    data=null_counts.pr.data,
    index=null_counts.time.data,
)

In [None]:
# Display the Series of null counts after interpolation.
null_counts_pd

In [None]:
# Calendar view of the null counts that remain after interpolation.
calplot.calplot(null_counts_pd)

## Fixing more weird dates

Here we see that there are fewer NaNs than usual in July 2006 and more than usual in January 2011. Let's validate those manually.

In [None]:
# Within a 32-day window starting 2006-07-01, find the time stamp with the
# fewest missing cells.
date = datetime.datetime(2006, 7, 1)
(
    pr.sel(time=slice(date, date + datetime.timedelta(days=32)))
    .isnull()
    .sum(dim=['latitude', 'longitude'])
    .pr.idxmin()
)

In [None]:
# Raw 'pr' field on 2006-07-03, for comparison with the previous day.
pr['pr'].sel(time='2006-07-03').plot()

In [None]:
# Raw 'pr' field on 2006-07-02, for comparison with the following day.
pr['pr'].sel(time='2006-07-02').plot()

Looks like an interpolation glitch or something. I'll just reapply the land mask to fix it.

In [None]:
# NOTE(review): the item assignment below targets the temporary Dataset
# produced by `.sel(...).where(...)`, so `pr` itself appears to be left
# unchanged by this cell — confirm. The later cells build the corrected data
# via `sea_mask` / `remasked` instead.
pr.sel(time='2006-07-03').where(pr.sel(time='2006-07-02').isnull())['pr']= np.nan

In [None]:
# The 'pr' field on 2006-07-03, kept to test the mask on it.
faulty_pr = pr['pr'].sel(time='2006-07-03')

In [None]:
# Boolean grid that is True wherever the interpolated 'pr' is NaN on
# 2001-01-01; used below as the reference mask.
sea_mask = interpolated['pr'].sel(time='2001-01-01').isnull()

In [None]:
# Number of grid cells flagged by the mask.
sea_mask.sum()

In [None]:
# Null count of the 2006-07-03 field once the mask is applied; compare with
# the mask size shown in the previous cell.
(
    faulty_pr
    .where(~sea_mask, np.nan)
    .isnull()
    .sum()
)

In [None]:
# Keep interpolated values where the mask is False and set NaN where it is
# True.
remasked = interpolated.where(~sea_mask, np.nan)

In [None]:
# Per-time-step null counts after re-applying the mask.
sea_mask_null_count = (
    remasked.isnull()
    .sum(dim=['latitude', 'longitude'])
)

In [None]:
# Display the remasked null counts.
sea_mask_null_count

In [None]:
# Remasked null counts as a pandas Series indexed by time.
# The underlying array (`.data`) is passed rather than the DataArray itself,
# matching how the raw-count Series is built earlier in the notebook.
sea_mask_null_count_pd = pd.Series(
    sea_mask_null_count.pr.data,
    index=sea_mask_null_count.time.data,
)

In [None]:
# Display the Series of remasked null counts.
sea_mask_null_count_pd

In [None]:
# Calendar view of the null counts after re-applying the mask.
calplot.calplot(sea_mask_null_count_pd)

So that fixed the July 2006 problem. Now for the problem in January 2011. Also, a new gap emerged in May 2008.

In [None]:
# Null counts of the remasked data from 2007-01-01 to 2007-04-01.
(
    remasked.sel(time=slice('2007-01-01', '2007-04-01'))
    .isnull()
    .sum(dim=['latitude', 'longitude'])
    .pr.plot()
)

In [None]:
# Time stamp with the largest null count in the remasked data.
(
    remasked.isnull()
    .sum(dim=['latitude', 'longitude'])
    .idxmax()
)

In [None]:
# Remasked 'pr' field on 2007-02-23.
remasked['pr'].sel(time='2007-02-23').plot()

In [None]:
# Total null count on 2011-01-05 (compare with the previous day, next cell).
remasked.sel(time='2011-01-05').isnull().sum()

In [None]:
# Total null count on 2011-01-04.
remasked.sel(time='2011-01-04').isnull().sum()

In [None]:
# Null mask of the interpolated data for a single date.
# NOTE(review): the name `null05` and the neighbouring cells suggest
# 2011-01-05, but the date selected here is 2011-02-23 — confirm which one
# was intended.
null05 = interpolated.sel(time='2011-02-23').isnull()

In [None]:
# Map of the null mask computed in the previous cell.
null05['pr'].plot()

In [None]:
# Map of the reference mask, for visual comparison.
sea_mask.plot()

In [None]:
# XOR of the raw null mask on 2015-01-04 with the inverted reference mask.
(
    pr.sel(time='2015-01-04').pr.isnull()
    ^ ~sea_mask
).plot()

A part of west africa is missing, from 2008-05-21 to 2008-05-30.
Once again I think I'll just extend the interpolation... We don't want nans in there.

In [None]:
# First time step of the remasked 'pr' field.
remasked.isel(time=0).pr

In [None]:
# First time step of the raw 'pr' field, to compare with the remasked one.
pr.isel(time=0).pr

In [None]:
# True wherever the null status differs between the raw and remasked data,
# i.e. cells that were either filled by interpolation or newly masked.
diff = pr.isnull() != remasked.isnull()

In [None]:
# Number of changed cells per time step.
diff['pr'].sum(dim=['latitude', 'longitude']).plot()

In [None]:
# Display the boolean change-flag dataset.
diff

In [None]:
# Save the remasked data together with the change flag, stored as the
# 'pr_interpolated' variable.
# NOTE(review): unlike the other paths in this notebook, there is no '/'
# between the base directory placeholder and the file name — confirm this is
# intended (the file is reopened from the identical string below).
remasked.assign({'pr_interpolated': diff.pr}).to_netcdf('***BASEDIR***pr-obs-interpolated.nc')

In [None]:
# Display the final remasked dataset.
remasked

# Redo forecast-like obs using the fixed precip

In [None]:
# Reopen the fixed precipitation file written earlier in this notebook.
# NOTE(review): the path has no '/' after the base directory placeholder; it
# matches the string used when the file was written — confirm intended.
fix_pr = xr.open_dataset('***BASEDIR***pr-obs-interpolated.nc')

In [None]:
# Number of flagged cells per time step in the saved file.
fix_pr['pr_interpolated'].sum(dim=['latitude', 'longitude']).plot()

In [None]:
# Directory that is scanned for forecast files in the next cell.
forecast_path = pathlib.Path('***BASEDIR***/training-input/')

In [None]:
# Keep only entries whose stem contains both 'ecmwf' and 'tp'.
forecast_files = [
    path
    for path in forecast_path.iterdir()
    if 'ecmwf' in path.stem and 'tp' in path.stem
]

In [None]:
# Peek at the first ten matching files.
forecast_files[:10]

In [None]:
# Open the first matching forecast file; it is passed as the first argument
# to forecast_like_observations below.
forecast = xr.open_dataset(forecast_files[0])

In [None]:
# Build forecast-like observations from the fixed precipitation alone.
# The 'pr_interpolated' flag is dropped with `drop_vars`, the explicit
# replacement for the deprecated `Dataset.drop`.
newobs = climetlab_s2s_ai_challenge.extra.forecast_like_observations(
    forecast,
    fix_pr.drop_vars('pr_interpolated'),
)

In [None]:
# Display the forecast-like observations.
newobs

In [None]:
# Run the same forecast-like transformation on the 'pr_interpolated' flag
# alone. 'pr' is dropped with `drop_vars`, the explicit replacement for the
# deprecated `Dataset.drop`.
newmask = climetlab_s2s_ai_challenge.extra.forecast_like_observations(
    forecast,
    fix_pr.drop_vars('pr'),
)

In [None]:
# Display the forecast-like flag dataset.
newmask

In [None]:
# Flag total over the grid and all lead times, plotted along the remaining
# dimension(s).
newmask['pr_interpolated'].sum(
    dim=['latitude', 'longitude', 'lead_time'],
).plot()

In [None]:
# Flag count per time step in the saved file, for comparison with the
# forecast-like version plotted above.
fix_pr['pr_interpolated'].sum(dim=['latitude', 'longitude']).plot()