# This notebook processes single-level CAFE v1 atmospheric daily data for building into climatologies. Only the first 2 years of the forecasts from the period 2003-2015 are used.
Temporary files are written to `tmp_fldr` and are later combined using `combine_CAFE_fcst_v1_atmos_climatology.ipynb`. Note that both this notebook and `build_CAFE_fcst_v1_atmos_climatology_multi_level.ipynb` should be run before the combine notebook is run.

#### IMPORTANT: If adapting this code to save other climatologies, be careful to only include full years, as pyLatte will compute monthly climatologies from the saved daily climatologies

In [None]:
# Import packages -----
import pandas as pd
import xarray as xr
import numpy as np
from pylatte import utils
from ipywidgets import FloatProgress

#### Initialise

In [None]:
# Root folder of the forecast output and the glob for the daily files -----
fcst_filename = 'atmos_daily*'
fcst_folder = '/OSM/CBR/OA_DCFP/data/model_output/CAFE/forecasts/v1/'

# One row per field: (name used in the CAFE files, standardised name) -----
fields = pd.DataFrame(
    [
        ('lwflx', 'lwf'),
        ('shflx', 'shf'),
        ('tau_x', 'tau_x'),
        ('tau_y', 'tau_y'),
        ('t_ref', 't_ref'),
        ('q_ref', 'q_ref'),
        ('u_ref', 'u_ref'),
        ('v_ref', 'v_ref'),
        ('t_ref_min', 't_ref_min'),
        ('t_ref_max', 't_ref_max'),
        ('t_surf', 't_s'),
        ('ps', 'p_s'),
        ('slp', 'slp'),
        ('h500', 'h500'),
        ('precip', 'precip'),
        ('lwdn_sfc', 'lwf_dn_s'),
        ('lwup_sfc', 'lwf_up_s'),
        ('olr', 'olwr'),
        ('swdn_sfc', 'swf_dn_s'),
        ('swup_sfc', 'swf_up_s'),
        ('swup_toa', 'swf_up_toa'),
        ('high_cld_amt', 'high_cld_amt'),
        ('low_cld_amt', 'low_cld_amt'),
        ('mid_cld_amt', 'mid_cld_amt'),
        ('tot_cld_amt', 'tot_cld_amt'),
    ],
    columns=['name_CAFE', 'name_std'],
)

# Plain dict mapping each CAFE name onto its standardised name -----
name_dict = dict(zip(fields['name_CAFE'], fields['name_std']))

# Display the table as the cell output -----
fields


#### Load one 366-day-long year to provide the time array

In [None]:
# Reference files: yr2016 / mn1 / OUTPUT.1 -----
path = '{}/yr2016/mn1/OUTPUT.1/{}.nc'.format(fcst_folder, fcst_filename)
dataset = xr.open_mfdataset(path, autoclose=True)

# Keep the first 366 time stamps; they later serve as the climatology time axis -----
time_use = dataset['time'].isel(time=slice(0, 366))

#### Save each year and variable separately due to memory considerations

In [None]:
# Folder that receives the temporary per-year, per-variable files -----
tmp_fldr = ('/OSM/CBR/OA_DCFP/data/intermediate_products/'
            'pylatte_climatologies/tmp/')

In [None]:
# Loop over all forecast years and variables -----
# For every (year, variable) pair this cell:
#   1. skips the pair if its output file in `tmp_fldr` can already be opened,
#      so the cell can be re-run after an interruption;
#   2. stacks every ensemble member that can be opened for each start month,
#      truncated to the first 731 time steps;
#   3. averages over ensemble and over all samples that share the same
#      month-day label;
#   4. writes the result to `tmp_fldr` as one netCDF file.
years = range(2003,2016)
months = range(1,13)
ensembles = range(1,12)

# Number of daily time steps kept from each forecast (2 years) -----
n_keep = 731

for year in years:
    print(year)
    print('----------')
    for idx, variable in enumerate(fields['name_CAFE']):
        print(variable)
        std_name = fields['name_std'][idx]

        # First see if file already exists -----
        # Any failure to open (missing or unreadable file) means the file is
        # (re)built below. `Exception` rather than a bare `except` so that a
        # KeyboardInterrupt stops the run instead of triggering a rebuild.
        savename = 'cafe.fcst.v1.atmos.' + std_name + '.' + str(year) + '.clim.nc'
        try:
            xr.open_dataset(tmp_fldr + savename, autoclose=True).close()
            already_saved = True
        except Exception:
            already_saved = False
        if already_saved:
            continue

        fcst_list = []
        for month in months:

            ens_list = []
            ens = []
            for ensemble in ensembles:

                path = fcst_folder + '/yr' + str(year) + '/mn' + str(month) + \
                       '/OUTPUT.' + str(ensemble) + '/' + fcst_filename + '.nc'

                # Try to open this ensemble member -----
                try:
                    dataset = xr.open_mfdataset(path, autoclose=True)[variable]
                except OSError:
                    # File does not exist -----
                    continue

                # Truncate to 2 year forecasts -----
                # (a slice past the end simply returns what is available)
                dataset = dataset.isel(time=slice(0, n_keep))

                if 'latb' in dataset.dims:
                    dataset = dataset.rename({'latb':'lat_2','lonb':'lon_2'})
                ens_list.append(dataset.rename(std_name))
                ens.append(ensemble)

            # Concatenate ensembles -----
            if ens_list:
                ens_object = xr.concat(ens_list, dim='ensemble')
                ens_object['ensemble'] = ens

                # Stack concatenated ensembles into a list for each month in a year -----
                fcst_list.append(ens_object)

        # Nothing could be opened for this year: xr.concat would raise on an
        # empty list, so report and move on to the next variable -----
        if not fcst_list:
            print('no forecast data found for ' + variable + ' in ' + str(year) + ', skipping')
            continue

        # Concatenate all months within year -----
        ds = xr.concat(fcst_list, dim='time')

        # Rechunk for chunksizes of at least 1,000,000 elements -----
        # (utils.prune is a pylatte helper; its behaviour is not visible here)
        ds = utils.prune(ds.chunk(chunks={'ensemble' : len(ds.ensemble),
                                          'time' : len(ds.time)}).squeeze())

        # Make month_day array of zero-padded 'MM-DD' labels -----
        md = np.array([str(mo).zfill(2) + '-' + str(dy).zfill(2)
                       for mo, dy in zip(ds.time.dt.month.values,
                                         ds.time.dt.day.values)])

        # Replace time array with month_day array and groupby -----
        ds['time'] = md
        ds_clim = ds.groupby('time').mean(dim=['time','ensemble'],keep_attrs=True)

        # Fill time with presaved time -----
        # NOTE(review): this assumes the groupby produced exactly
        # len(time_use) month-day groups (i.e. '02-29' is present) — confirm
        # for years whose 2-year forecasts do not span a leap day.
        ds_clim['time'] = time_use
        ds_clim.time.attrs['long_name'] = 'time'
        ds_clim.time.attrs['cartesian_axis'] = 'T'
        ds_clim.time.attrs['calendar_type'] = 'JULIAN'
        ds_clim.time.attrs['bounds'] = 'time_bounds'

        # Save and delete -----
        with utils.timer():
            ds_clim.to_netcdf(path=tmp_fldr + savename, mode = 'w',
                              encoding = {'time':{'dtype':'float','calendar':'JULIAN',
                                                  'units':'days since 0001-01-01 00:00:00'}})

        # Drop the references before the next (large) iteration -----
        del ds, ds_clim