# This notebook saves CAFE v1 ocean daily climatologies in a single dataset

#### IMPORTANT: If adapting this code to save other climatologies, be careful to only include full years, as pyLatte will compute monthly climatologies from the saved daily climatologies

In [24]:
import pandas as pd
import xarray as xr
import numpy as np
from pylatte import utils
from ipywidgets import FloatProgress

#### Initialise

In [25]:
# Location of forecast data -----
fcst_folder = '/OSM/CBR/OA_DCFP/data/model_output/CAFE/forecasts/v1/'
fcst_filename = 'ocean_daily*'

fields = pd.DataFrame( \
        {'name_CAFE': ['sst', 'patm_t', 'eta_t', 'sss', 'u_surf', 'v_surf'],
         'name_std' : ['sst', 'patm',   'eta',   'sss', 'u_s',    'v_s']}
                     )
fields = pd.DataFrame( \
        {'name_CAFE': ['sst'],
         'name_std' : ['sst']}
                     )

name_dict = fields.set_index('name_CAFE').to_dict()['name_std']

fields


Unnamed: 0,name_CAFE,name_std
0,sst,sst


In [26]:
# Initial dates to include (takes approximately 1 min 30 sec per date) -----
init_dates = pd.date_range('2002-2','2016-5' , freq='1MS')

# Ensembles to include -----
ensembles = range(1,12)

#### Load one 366 day long year to provide time array

In [27]:
path = fcst_folder + '/yr2016/mn1/OUTPUT.1/' + fcst_filename + '.nc'
dataset = xr.open_mfdataset(path, autoclose=True)
time_use = dataset.time[:366]

#### Save each year and variable separately due to memory considerations

In [28]:
years = range(2002,2017)
months = range(1,13)
ensembles = range(1,12)

for year in years:
    print(year)
    print('----------')
    for idx, variable in enumerate(fields['name_CAFE']):
        print(variable)
        
        savename = 'cafe.fcst.v1.ocean.' + fields['name_std'][idx] + '.' + str(year) + '.clim.nc'
        try:
            temp = xr.open_mfdataset('/OSM/CBR/OA_DCFP/data/intermediate_products/pylatte_climatologies/' + savename, autoclose=True)
            print('    Already exists')
        except:
            
            fcst_list = []
            for month in months:

                ens_list = []
                ens = []
                empty = True
                for ie, ensemble in enumerate(ensembles):

                    path = fcst_folder + '/yr' + str(year) + '/mn' + str(month) + \
                           '/OUTPUT.' + str(ensemble) + '/' + fcst_filename + '.nc'

                    # Try to stack ensembles into a list -----
                    try:
                        dataset = xr.open_mfdataset(path, autoclose=True)[variable]
                        if 'xt_ocean' in dataset.dims:
                            dataset = dataset.rename({'xt_ocean':'lon_t','yt_ocean':'lat_t'})
                        if 'xu_ocean' in dataset.dims:
                            dataset = dataset.rename({'xu_ocean':'lon_u','yu_ocean':'lat_u'})
                        ens_list.append(dataset.rename(fields['name_std'][idx]))
                        ens.append(ie+1)
                        empty = False
                    except OSError:
                        # File does not exist -----
                        pass

                # Concatenate ensembles -----
                if empty == False:
                    ens_object = xr.concat(ens_list, dim='ensemble')
                    ens_object['ensemble'] = ens

                    # Stack concatenated ensembles into a list for each month in a year -----                       
                    fcst_list.append(ens_object)

            # Concatenate all months within year -----
            ds = xr.concat(fcst_list, dim='time')
            print(ds)
            
            # Rechunk for chunksizes of at least 1,000,000 elements -----
            ds = utils.prune(ds.chunk(chunks={'ensemble' : len(ds.ensemble), 
                                              'time' : len(ds.time)}).squeeze())

            # Make month_day array of month-day -----
            m = np.array([str(i).zfill(2) + '-' for i in ds.time.dt.month.values])
            d = np.array([str(i).zfill(2)  for i in ds.time.dt.day.values])
            md = np.core.defchararray.add(m, d)

            # Replace time array with month_day array and groupby -----
            ds['time'] = md
            print(list(ds.groupby('time')))
            ds_clim = ds.groupby('time').mean(dim=['time','ensemble'],keep_attrs=True)

            # Fill time with presaved time -----
            ds_clim['time'] = time_use
            ds_clim.time.attrs['long_name'] = 'time'
            ds_clim.time.attrs['cartesian_axis'] = 'T'
            ds_clim.time.attrs['calendar_type'] = 'JULIAN'
            ds_clim.time.attrs['bounds'] = 'time_bounds'

            # Save and delete -----
            with utils.timer():
                ds_clim.to_netcdf(path='/OSM/CBR/OA_DCFP/data/intermediate_products/pylatte_climatologies/' + savename,
                                  mode = 'w',
                                  encoding = {'time':{'dtype':'float','calendar':'JULIAN',
                                                      'units':'days since 0001-01-01 00:00:00'}}) 

            del ds, ds_clim

2002
----------
sst
    Already exists
2003
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11694, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11694, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2003-01-01T12:00:00 2003-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (

  x = np.divide(x1, x2, out)


   Elapsed: 1802.7908458709717 sec
2004
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11684, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11684, 300, 360), dtype=float32, chunksize=(1, 366, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2004-01-01T12:00:00 2004-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_

  x = np.divide(x1, x2, out)


   Elapsed: 2777.33358168602 sec
2005
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11682, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11682, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2005-01-01T12:00:00 2005-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_t)

  x = np.divide(x1, x2, out)


   Elapsed: 2022.2499117851257 sec
2006
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11692, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11692, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2006-01-01T12:00:00 2006-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_

  x = np.divide(x1, x2, out)


   Elapsed: 2438.787129163742 sec
2007
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11694, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11694, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2007-01-01T12:00:00 2007-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_t

  x = np.divide(x1, x2, out)


   Elapsed: 2281.4974629879 sec
2008
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11684, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11684, 300, 360), dtype=float32, chunksize=(1, 366, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2008-01-01T12:00:00 2008-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_t) 

  x = np.divide(x1, x2, out)


   Elapsed: 1986.96382021904 sec
2009
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11682, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11682, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2009-01-01T12:00:00 2009-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_t)

  x = np.divide(x1, x2, out)


   Elapsed: 2019.6820714473724 sec
2010
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11692, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11692, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2010-01-01T12:00:00 2010-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_

  x = np.divide(x1, x2, out)


   Elapsed: 2557.32066321373 sec
2011
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11694, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11694, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2011-01-01T12:00:00 2011-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_t)

  x = np.divide(x1, x2, out)


   Elapsed: 2178.823522090912 sec
2012
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11684, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11684, 300, 360), dtype=float32, chunksize=(1, 366, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2012-01-01T12:00:00 2012-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_t

  x = np.divide(x1, x2, out)


   Elapsed: 2259.5690457820892 sec
2013
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11682, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11682, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2013-01-01T12:00:00 2013-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_

  x = np.divide(x1, x2, out)


   Elapsed: 2445.1253941059113 sec
2014
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11692, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11692, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2014-01-01T12:00:00 2014-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_

  x = np.divide(x1, x2, out)


   Elapsed: 2435.971914291382 sec
2015
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 11694, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 11694, 300, 360), dtype=float32, chunksize=(1, 365, 300, 360)>
Coordinates:
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * time      (time) datetime64[ns] 2015-01-01T12:00:00 2015-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 32, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 32, 300, 360), dtype=float32, chunksize=(11, 32, 300, 360)>
Coordinates:
  * lon_t     (lon_t

  x = np.divide(x1, x2, out)


   Elapsed: 2669.792043209076 sec
2016
----------
sst
<xarray.DataArray 'sst' (ensemble: 11, time: 5843, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 5843, 300, 360), dtype=float32, chunksize=(1, 366, 300, 360)>
Coordinates:
  * ensemble  (ensemble) int64 1 2 3 4 5 6 7 8 9 10 11
  * lon_t     (lon_t) float64 -279.5 -278.5 -277.5 -276.5 -275.5 -274.5 ...
  * lat_t     (lat_t) float64 -77.88 -77.63 -77.38 -77.13 -76.88 -76.63 ...
  * time      (time) datetime64[ns] 2016-01-01T12:00:00 2016-01-02T12:00:00 ...
Attributes:
    long_name:      Potential temperature
    units:          degrees C
    valid_range:    [ -10.  100.]
    cell_methods:   time: mean
    time_avg_info:  average_T1,average_T2,average_DT
    coordinates:    geolon_t geolat_t
    standard_name:  sea_surface_temperature
[('01-01', <xarray.DataArray 'sst' (ensemble: 11, time: 16, lat_t: 300, lon_t: 360)>
dask.array<shape=(11, 16, 300, 360), dtype=float32, chunksize=(11, 16, 300, 360)>
Coordinates:
  * ensemble  (ensembl

  x = np.divide(x1, x2, out)


   Elapsed: 902.1821012496948 sec
