In [1]:
import xarray as xr
from netCDF4 import Dataset
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import load_data
import wrf

In [None]:
#
# Read in the 0, 5, 10, and 15 minute reflectivity data.
#
# The test data are separated out so that no shuffling is done on them.
#   - for the 2012-2015 data files: [3672:] is 2015
#
# Reflectivity
# Scale the data using the minimum and the 99.9th percentile of the
# training data:
#    - refl = (refl - scale_min) / (scale_99p9 - scale_min)
# Write the reflectivity data for each offset:
#    - training data includes the times and the shuffle sequence
#    - test data includes the times.
#

# Directory holding the reflectivity input files; the scaled train/test
# output files are written to this same directory.
refl_data_main_path = '/glade/work/hardt/ds612' 

In [None]:
# Minute-after-the-hour offsets of the reflectivity files. Each list below
# holds one file name per offset, in this order, so the three lists can be
# zipped together.
minute_offsets = ('00', '05', '10', '15')

# Unscaled 2012-2015 input files.
refl_files = ['model2_2012-2015_{}minuteAfterHour_3D_refl.nc'.format(mm)
              for mm in minute_offsets]

# Scaled (unshuffled) test output files.
test_output = ['model2_{}minuteAfterHour_3D_refl_scaled-test2.nc'.format(mm)
               for mm in minute_offsets]

# Shuffled and scaled training output files.
train_output = ['model2_{}minuteAfterHour_3D_refl_shuffled_scaled-train2.nc'.format(mm)
                for mm in minute_offsets]


In [None]:
def _write_refl_netcdf(out_path, refl, times, scale_min, scale_99p9, shuffle_seq=None):
    """Write scaled reflectivity, its times and (optionally) the shuffle sequence.

    The file is created by the XTIME write (default write mode) and the
    remaining variables are appended to it with mode='a'.

    Parameters
    ----------
    out_path : str
        Destination netCDF file.
    refl : numpy.ndarray
        Scaled reflectivity, written as REFL_10CM with dims
        (time, bottom_top, south_north, west_east).
    times : numpy.ndarray
        Times matching the first axis of ``refl``; written as XTIME.
    scale_min, scale_99p9 : scalar
        Scaling constants, stored as attributes of REFL_10CM so the scaling
        can be undone later.
    shuffle_seq : numpy.ndarray, optional
        Index sequence that was used to shuffle the training data; written as
        shuffle_seq when given (training files only).
    """
    refl_out = xr.DataArray(data=refl,
                            name='REFL_10CM',
                            dims=['time', 'bottom_top', 'south_north', 'west_east'],
                            attrs=dict(
                                description='reflectivity',
                                # NOTE(review): the values written here are
                                # already scaled, so 'dBZ' looks stale (the W
                                # output uses units='scaled'). Left unchanged
                                # in case a reader depends on it - confirm.
                                units='dBZ',
                                scale_min=scale_min,
                                scale_99p9=scale_99p9,
                            ),
                            )
    encoding = {'REFL_10CM': {'zlib': True, '_FillValue': -99.0}}
    #
    xr.DataArray(times, name='XTIME').to_netcdf(out_path)
    if shuffle_seq is not None:
        xr.DataArray(shuffle_seq, name='shuffle_seq').to_netcdf(out_path, mode='a')
    refl_out.to_netcdf(out_path, encoding=encoding, mode='a')


#
# Time-index bounds of the train/test split.
# NOTE(review): training stops before index 3671 and test starts at 3672, so
# time index 3671 ends up in neither set. The W processing uses the same
# bounds and the saved shuffle sequence has length 3671, so if this is an
# off-by-one it has to be fixed in all of those places together.
#
n_train = 3671
test_start = 3672

for ifile, test_out, train_out in zip(refl_files, test_output, train_output):
    #
    # Everything needed is read into memory inside the context manager so
    # the input file is closed again before the outputs are written.
    #
    with xr.open_dataset( os.path.join(refl_data_main_path, ifile) ) as refl_ds:
        #
        print("Loading REFL_10CM data from", ifile)
        #
        refl_train = refl_ds.REFL_10CM[:n_train,:,:,:].values
        refl_train_t = refl_ds.XTIME[:n_train].values
        #
        refl_test = refl_ds.REFL_10CM[test_start:,:,:,:].values
        refl_test_t = refl_ds.XTIME[test_start:].values
    #
    # Create the random shuffle indexes with the first dataset and reuse
    # them for every later one. An `s` that already exists in the kernel
    # (from an earlier iteration or an earlier cell) is reused as-is.
    #
    try:
        s
    except NameError:
        print("Defining shuffle sequence")
        s = np.arange(refl_train.shape[0])
        np.random.shuffle(s)
    else:
        print("Suffle sequence already defined")
        # A stale sequence of the wrong length would either fail with an
        # obscure IndexError or silently drop training samples.
        if np.shape(s) != (refl_train.shape[0],):
            raise ValueError(
                "existing shuffle sequence `s` has shape %s but the training "
                "data has %d time steps; delete `s` to generate a new one"
                % (np.shape(s), refl_train.shape[0]))
    #
    refl_train = refl_train[s]
    refl_train_t = refl_train_t[s]
    #
    # save 2D composite for W
    # (computed only once, from the first file, on the shuffled but still
    #  unscaled data; not used any further in this cell)
    #
    try:
        refl_train_2d
    except NameError:
        print('Defining 2D refectivity fields')
        refl_train_2d = refl_train[:,:,:,:].max(axis=1)
        refl_test_2d = refl_test[:,:,:,:].max(axis=1)
    else:
        print("2D reflectivity data already defined")
    #
    # Scaling constants always come from the training data.
    #
    train_scale_min = np.amin(refl_train)
    train_scale_99p9 = np.percentile(refl_train, 99.9)
    #
    print('refl_train.shape:',refl_train.shape)
    print("Doing min/max scaling")
    #
    refl_train = (refl_train - train_scale_min) / (train_scale_99p9 - train_scale_min)
    #
    # using same scale on test data
    #
    refl_test = (refl_test - train_scale_min) / (train_scale_99p9 - train_scale_min)
    #
    # Write netcdf output.
    # Would like to re-write this using the netcdf4 module.
    # adding in attributes so I can store the scaling information
    # and also add in the XLONG, XLAT fields.
    #
    train_output_data = os.path.join(refl_data_main_path, train_out)
    test_output_data  = os.path.join(refl_data_main_path, test_out)
    #
    # Write the training data (with times and the shuffle sequence)
    #
    print("Writing REFL_10CM training data to", train_output_data)
    #
    _write_refl_netcdf(train_output_data, refl_train, refl_train_t,
                       train_scale_min, train_scale_99p9, shuffle_seq=s)
    #
    # free the training arrays before the test data are written
    #
    del refl_train
    del refl_train_t
    #
    # Write the test data (with times only)
    #
    print("Writing REFL_10CM test data to", test_output_data)
    _write_refl_netcdf(test_output_data, refl_test, refl_test_t,
                       train_scale_min, train_scale_99p9)
    #
    del refl_test
    del refl_test_t
    #
    print('DONE writing REFL_OUT from input', ifile)
    print()

In [2]:
# Recover the shuffle sequence that was saved with the 00-minute training
# file, so the W data can be shuffled in the same order. The values are read
# into memory inside the context manager so the file is closed again right
# away instead of staying open for the rest of the session.
with xr.open_dataset('/glade/work/hardt/ds612/model2_00minuteAfterHour_3D_refl_shuffled_scaled-train2.nc') as refl_ds:
    s = refl_ds.shuffle_seq.values
print(s.shape)

(3671,)


In [3]:
#
# Open the 3D W dataset.
# The handle is left open on purpose: the following cells read from W_ds.
#
W_data_main_path = '/glade/work/hardt/ds612'
W_input_file = os.path.join(W_data_main_path, "model2_2012-2015_3D_W.nc")
W_ds = xr.open_dataset(W_input_file)
#
#

In [4]:
#
# NEW W scaling approach:
# reduce W with a maximum over axis 1, shuffle the training part with the
# reflectivity shuffle sequence `s`, then scale train and test with the
# training minimum and 99th percentile.
#
print("Reading time array for W.")
#
# same train/test time-index bounds as used for the reflectivity
#
W_train_t = W_ds['XTIME'][:3671].values
W_test_t = W_ds['XTIME'][3672:].values
#
# start values determined in refl_ds.attrs history
# ncks -O -dwest_east,560,1320 -dsouth_north,130,955
#
print('Loading W data.')
#
# The full 3D field is read into memory and then reduced over axis 1
#
print("Reading in W 3D")
#
W_train = np.max(W_ds['W'][:3671,:,:,:].values, axis=1)
W_test = np.max(W_ds['W'][3672:,:,:,:].values, axis=1)
#
# shuffle the same as the reflectivity (the test data stay in time order)
#
print("Shuffling W_train")
W_train_t = W_train_t[s]
W_train = W_train[s]
#
# scaling constants come from the training data only
#
train_scale_99p0 = np.percentile(W_train, 99.0)
train_scale_min  = W_train.min()
#
print("Doing min/percentile scaling on W")
#
W_scale_span = train_scale_99p0 - train_scale_min
W_train = (W_train - train_scale_min) / W_scale_span
W_test = (W_test - train_scale_min) / W_scale_span
#
#

Reading time array for W.
Loading W data.
Reading in W 3D
Shuffling W_train
Doing min/percentile scaling on W


In [5]:
def _write_W_netcdf(out_path, w_data, times, scale_min, scale_99p0, shuffle_seq=None):
    """Write scaled composite W, its times and (optionally) the shuffle sequence.

    The file is created by the XTIME write (default write mode) and the
    remaining variables are appended to it with mode='a'.

    Parameters
    ----------
    out_path : str
        Destination netCDF file.
    w_data : numpy.ndarray
        Scaled W, written as W with dims (time, south_north, west_east).
    times : numpy.ndarray
        Times matching the first axis of ``w_data``; written as XTIME.
    scale_min, scale_99p0 : scalar
        Scaling constants, stored as attributes of W so the scaling can be
        undone later.
    shuffle_seq : numpy.ndarray, optional
        Index sequence that was used to shuffle the training data; written as
        shuffle_seq when given (training file only).
    """
    w_out = xr.DataArray(data=w_data,
                         name='W',
                         dims=['time','south_north','west_east'],
                         attrs=dict(
                             description='W',
                             units='scaled',
                             scale_min=scale_min,
                             scale_99p0=scale_99p0,
                         ),
                         )
    encoding = {'W': {'zlib': True, '_FillValue': -999.0}}
    #
    xr.DataArray(times, name='XTIME').to_netcdf(out_path)
    if shuffle_seq is not None:
        xr.DataArray(shuffle_seq, name='shuffle_seq').to_netcdf(out_path, mode='a')
    w_out.to_netcdf(out_path, encoding=encoding, mode='a')


# Outputs go next to the W input file.
train_output_data = os.path.join(W_data_main_path, 'model2_composite_W_shuffled_scaled-train2.nc')
test_output_data = os.path.join(W_data_main_path, 'model2_composite_W_scaled-test2.nc')
#
# Training data: times, shuffle sequence and scaled W
#
print("Writing W data to", train_output_data)
#
_write_W_netcdf(train_output_data, W_train, W_train_t,
                train_scale_min, train_scale_99p0, shuffle_seq=s)
#
# Free the training array before the test data are written, and close the
# input dataset explicitly before dropping the name (everything needed from
# it has already been read into memory).
#
del W_train
W_ds.close()
del W_ds
#
# Test data: times and scaled W, using the training scaling constants
#
print("Writing W data to", test_output_data)
#
_write_W_netcdf(test_output_data, W_test, W_test_t,
                train_scale_min, train_scale_99p0)
#
del W_test
#

print("Done")

Writing W data to /glade/work/hardt/ds612/model2_composite_W_shuffled_scaled-train2.nc
Writing W data to /glade/work/hardt/ds612/model2_composite_W_scaled-test2.nc
Done
