# Check that data loading routine is correctly pulling and stacking data

#### Import pyLatte package

In [1]:
from pylatte import utils
from pylatte import skill

#### Currently, the following packages are required to load the data - this process will be replaced by the CAFE cookbook

In [2]:
import numpy as np
import pandas as pd
import xarray as xr

#### Import some plotting packages and widgets

In [3]:
import matplotlib.pyplot as plt
import warnings    
warnings.filterwarnings("ignore")

# Jupyter specific -----
from ipywidgets import FloatProgress
%matplotlib inline

# Construct xarray objects for forecasts and observations
(The CAFE cookbook should replace many of these code blocks)

In [5]:
# Run configuration shared by every loading cell below.

# Initial dates to include (takes approximately 1 min 30 sec per date) -----
init_dates = pd.date_range('2003-1', '2003-6', freq='1MS')

# Ensembles to include -----
ensembles = range(1,12)

# Forecast length -----
FCST_LENGTH = 2 # years

# Resampling frequency applied to forecasts and observations -----
# Month-start bins: the verification cells rebuild the data with 'MS', and the
# truncation step counts lead times as FCST_LENGTH * 12 increments.
resample_freq = 'MS'

# Temperature

In [None]:
# Forecast input: variable to extract, file-name glob, and root folder -----
fcst_variable = 'temp'
fcst_filename = 'atmos_daily*'
fcst_folder = '/OSM/CBR/OA_DCFP/data/model_output/CAFE/forecasts/v1/'

# Observation input: variable to extract, file name, and containing folder -----
obsv_variable = 'TMP_GDS0_ISBL'
obsv_filename = 'jra.55.tmp.1000.1958010100_2016123118.nc'
obsv_folder = '/OSM/CBR/OA_DCFP/data/observations/jra55/isobaric/011_tmp/cat/'

### Test forecasts 

In [50]:
# Build da_fcst: one resampled-mean field per (initial date, ensemble member),
# concatenated along 'ensemble' and then 'init_date', and truncated to n_trunc
# lead times. Later cells read da_fcst and n_trunc.

# Instantiate progress bar -----
f = FloatProgress(min=0, max=len(init_dates)*len(ensembles), description='Loading...')
display(f)

# Loop over initial dates -----
fcst_list = []
for init_date in init_dates:
    year = init_date.year
    month = init_date.month

    # Loop over ensembles -----
    ens_list = []
    for ensemble in ensembles:
        # Glob matching every forecast file of this initial date / member -----
        path = f'{fcst_folder}/yr{year}/mn{month}/OUTPUT.{ensemble}/{fcst_filename}.nc'

        # Stack ensembles into a list -----
        dataset = xr.open_mfdataset(path, autoclose=True)[fcst_variable]
        ens_list.append(dataset.resample(time=resample_freq).mean(dim='time'))

        # Signal to increment the progress bar once the member has been opened -----
        f.value += 1

    # Concatenate ensembles -----
    ens_object = xr.concat(ens_list, dim='ensemble')
    ens_object['ensemble'] = ensembles

    # Stack concatenated ensembles into a list for each initial date -----
    fcst_list.append(utils.datetime_to_leadtime(ens_object))

# Keep track of the lead time for each initialization -----
n_lead_time = [len(x.lead_time) for x in fcst_list]

# Concatenate initial dates -----
da_fcst = xr.concat(fcst_list, dim='init_date')

# Rechunk for chunksizes of at least 1,000,000 elements -----
da_fcst = utils.prune(da_fcst.chunk(chunks={'ensemble' : len(da_fcst.ensemble),
                                            'lead_time' : len(da_fcst.lead_time)}).squeeze())

# Truncate forecasts at FCST_LENGTH -----
# Keep the longest lead-time count that does not exceed FCST_LENGTH (in months).
# If every initial date runs longer than that, fall back to the cap itself
# rather than failing on max() of an empty sequence.
max_increments = FCST_LENGTH * 12
n_trunc = max((n for n in n_lead_time if n <= max_increments), default=max_increments)
da_fcst = da_fcst.isel(lead_time=range(n_trunc))

#### Reload data for random initial date and ensemble and compare to the stacked data

In [84]:
import random

# Choose one initial date and one ensemble member at random, by position -----
date_int = random.randrange(len(init_dates))
ens_int = random.randrange(len(ensembles))
init_date_use, ensemble_use = init_dates[date_int], ensembles[ens_int]

# Report the selection -----
year, month = init_date_use.year, init_date_use.month
ensemble = ensemble_use
print(f'Initial date: {month}, {year}')
print(f'Ensemble number: {ensemble}')

# Re-open the files for that single forecast, independently of the stacking loop -----
path = f'{fcst_folder}/yr{year}/mn{month}/OUTPUT.{ensemble}/{fcst_filename}.nc'
data_test = xr.open_mfdataset(path, autoclose=True)[fcst_variable]

# A is the slice taken from the stacked array; B is rebuilt from the files -----
fcst_A = da_fcst.isel(init_date=[date_int], ensemble=[ens_int]).squeeze()
fcst_B = (
    data_test
    .resample(time='MS')
    .mean(dim='time')
    .isel(time=range(n_trunc))
)

print(f'Data are the same: {np.all(fcst_A.values == fcst_B.values)}')

Initial date: 3, 2003
Ensemble number: 2
Data are the same: True


### Test observations

In [85]:
# Build da_obsv: the observed field renamed to the forecast variable and
# dimension names, resampled to means, and stacked by initial date so that it
# lines up with da_fcst.

# Instantiate progress bar -----
f = FloatProgress(min=0, max=1, description='Loading...')
display(f)

# JRA temperature fields are only saved in a time-concatenated form -----
path = obsv_folder + obsv_filename
dataset = xr.open_mfdataset(path, autoclose=True)[obsv_variable]
da_obsv = dataset.rename(fcst_variable) \
                 .rename({'initial_time0_hours' : 'time', 'g0_lon_3' : 'lon', 'g0_lat_2' : 'lat'}) \
                 .resample(time=resample_freq) \
                 .mean(dim='time')

# Stack by initial date to match forecast structure -----
da_obsv = utils.stack_by_init_date(da_obsv,da_fcst.init_date.values,n_trunc)
f.value += 1

# Average over the forecast dimension if it exists -----
# The reduction is over a dimension, so test dims rather than coords.
if 'forecast_time1' in da_obsv.dims:
    da_obsv = da_obsv.mean(dim='forecast_time1')

# Rechunk for chunksizes of at least 1,000,000 elements -----
da_obsv = utils.prune(da_obsv.chunk(chunks={'init_date' : len(da_obsv.init_date)}).squeeze())

#### Reload data for a random initial date and compare to the stacked data

In [86]:
import random

# Choose one initial date at random, by position -----
date_int = random.randint(0,len(init_dates)-1)

init_date_use = init_dates[date_int]
print(f'Initial date: {init_date_use.month}, {init_date_use.year}')

# Rebuild the monthly means directly from the observation array -----
data_test = dataset.rename(fcst_variable) \
                   .rename({'initial_time0_hours' : 'time', 'g0_lon_3' : 'lon', 'g0_lat_2' : 'lat'}) \
                   .resample(time='MS').mean(dim='time')

# da_obsv was stacked with length n_trunc per initial date, so the matching
# window ends n_trunc - 1 months after its start (not a fixed 23 months) -----
date_start = str(da_obsv.init_date[date_int].values)
date_end = str(utils.month_delta(date_start, n_trunc - 1))

# A is the slice taken from the stacked array; B is the rebuilt window -----
obsv_A = da_obsv.isel(init_date=[date_int]).squeeze()
obsv_B = data_test.sel(time = slice(date_start, date_end)).squeeze()

print(f'Data are the same: {np.all(obsv_A.values == obsv_B.values)}')

Initial date: 6, 2003
Data are the same: True


# Precipitation

In [89]:
# Forecast input: variable to extract, file-name glob, and root folder -----
fcst_variable = 'precip'
fcst_filename = 'atmos_daily*'
fcst_folder = '/OSM/CBR/OA_DCFP/data/model_output/CAFE/forecasts/v1/'

# Observation input: variable to extract, file name, and containing folder -----
obsv_variable = 'TPRAT_GDS0_SFC_ave3h'
obsv_filename = 'jra.55.tprat.000.1958010100_2016123121.nc'
obsv_folder = '/OSM/CBR/OA_DCFP/data/observations/jra55/isobaric/061_tprat/cat/'

### Test forecasts 

In [90]:
# Build da_fcst: one resampled-sum field per (initial date, ensemble member),
# concatenated along 'ensemble' and then 'init_date', and truncated to n_trunc
# lead times. Later cells read da_fcst and n_trunc.

# Instantiate progress bar -----
f = FloatProgress(min=0, max=len(init_dates)*len(ensembles), description='Loading...')
display(f)

# Loop over initial dates -----
fcst_list = []
for init_date in init_dates:
    year = init_date.year
    month = init_date.month

    # Loop over ensembles -----
    ens_list = []
    for ensemble in ensembles:
        # Glob matching every forecast file of this initial date / member -----
        path = f'{fcst_folder}/yr{year}/mn{month}/OUTPUT.{ensemble}/{fcst_filename}.nc'

        # Stack ensembles into a list -----
        dataset = xr.open_mfdataset(path, autoclose=True)[fcst_variable]
        ens_list.append(dataset.resample(time=resample_freq).sum(dim='time'))

        # Signal to increment the progress bar once the member has been opened -----
        f.value += 1

    # Concatenate ensembles -----
    ens_object = xr.concat(ens_list, dim='ensemble')
    ens_object['ensemble'] = ensembles

    # Stack concatenated ensembles into a list for each initial date -----
    fcst_list.append(utils.datetime_to_leadtime(ens_object))

# Keep track of the lead time for each initialization -----
n_lead_time = [len(x.lead_time) for x in fcst_list]

# Concatenate initial dates -----
da_fcst = xr.concat(fcst_list, dim='init_date')

# Rechunk for chunksizes of at least 1,000,000 elements -----
da_fcst = utils.prune(da_fcst.chunk(chunks={'ensemble' : len(da_fcst.ensemble),
                                            'lead_time' : len(da_fcst.lead_time)}).squeeze())

# Truncate forecasts at FCST_LENGTH -----
# Keep the longest lead-time count that does not exceed FCST_LENGTH (in months).
# If every initial date runs longer than that, fall back to the cap itself
# rather than failing on max() of an empty sequence.
max_increments = FCST_LENGTH * 12
n_trunc = max((n for n in n_lead_time if n <= max_increments), default=max_increments)
da_fcst = da_fcst.isel(lead_time=range(n_trunc))

#### Reload data for random initial date and ensemble and compare to the stacked data

In [96]:
import random

# Choose one initial date and one ensemble member at random, by position -----
date_int = random.randrange(len(init_dates))
ens_int = random.randrange(len(ensembles))
init_date_use, ensemble_use = init_dates[date_int], ensembles[ens_int]

# Report the selection -----
year, month = init_date_use.year, init_date_use.month
ensemble = ensemble_use
print(f'Initial date: {month}, {year}')
print(f'Ensemble number: {ensemble}')

# Re-open the files for that single forecast, independently of the stacking loop -----
path = f'{fcst_folder}/yr{year}/mn{month}/OUTPUT.{ensemble}/{fcst_filename}.nc'
data_test = xr.open_mfdataset(path, autoclose=True)[fcst_variable]

# A is the slice taken from the stacked array; B is rebuilt from the files -----
fcst_A = da_fcst.isel(init_date=[date_int], ensemble=[ens_int]).squeeze()
fcst_B = (
    data_test
    .resample(time='MS')
    .sum(dim='time')
    .isel(time=range(n_trunc))
)

print(f'Data are the same: {np.all(fcst_A.values == fcst_B.values)}')

Initial date: 1, 2003
Ensemble number: 4
Data are the same: True


### Test observations

In [97]:
# Build da_obsv: the observed field renamed to the forecast variable and
# dimension names, resampled to sums, and stacked by initial date so that it
# lines up with da_fcst.

# Instantiate progress bar -----
f = FloatProgress(min=0, max=1, description='Loading...')
display(f)

# JRA precipitation fields are only saved in a time-concatenated form -----
path = obsv_folder + obsv_filename
dataset = xr.open_mfdataset(path, autoclose=True)[obsv_variable]
da_obsv = dataset.rename(fcst_variable) \
                 .rename({'initial_time0_hours' : 'time', 'g0_lon_3' : 'lon', 'g0_lat_2' : 'lat'}) \
                 .resample(time=resample_freq) \
                 .sum(dim='time')

# Stack by initial date to match forecast structure -----
da_obsv = utils.stack_by_init_date(da_obsv,da_fcst.init_date.values,n_trunc)
f.value += 1

# Average over the forecast dimension if it exists -----
# The reduction is over a dimension, so test dims rather than coords.
if 'forecast_time1' in da_obsv.dims:
    da_obsv = da_obsv.mean(dim='forecast_time1')

# Rechunk for chunksizes of at least 1,000,000 elements -----
da_obsv = utils.prune(da_obsv.chunk(chunks={'init_date' : len(da_obsv.init_date)}).squeeze())

#### Reload data for a random initial date and compare to the stacked data

In [105]:
import random

# Choose one initial date at random, by position -----
date_int = random.randint(0,len(init_dates)-1)

init_date_use = init_dates[date_int]
print(f'Initial date: {init_date_use.month}, {init_date_use.year}')

# Rebuild the monthly sums directly from the observation array -----
data_test = dataset.rename(fcst_variable) \
                   .rename({'initial_time0_hours' : 'time', 'g0_lon_3' : 'lon', 'g0_lat_2' : 'lat'}) \
                   .resample(time='MS').sum(dim='time')

# da_obsv was stacked with length n_trunc per initial date, so the matching
# window ends n_trunc - 1 months after its start (not a fixed 23 months) -----
date_start = str(da_obsv.init_date[date_int].values)
date_end = str(utils.month_delta(date_start, n_trunc - 1))

# A is the slice taken from the stacked array; B is the rebuilt window,
# averaged over 'forecast_time1' -----
obsv_A = da_obsv.isel(init_date=[date_int]).squeeze()
obsv_B = data_test.sel(time = slice(date_start, date_end)).mean(dim='forecast_time1').squeeze()

print(f'Data are the same: {np.all(obsv_A.values == obsv_B.values)}')

Initial date: 1, 2003
Data are the same: True
