In [1]:
######################################################################
# Filename:    preprocess_GEFSv12_reforecast.py
# Author:      Deanna Nash dnash@ucsd.edu
# Description: Script to take downloaded GEFSv12 reforecast u, v, and spfh data for each day, preprocess IVT data and save as single netCDF file
# https://registry.opendata.aws/noaa-gefs-reforecast/ (data link)
#
######################################################################

## import libraries
import os, sys
import yaml
import xarray as xr
import numpy as np

path_to_repo = '/cw3e/mead/projects/cwp140/scratch/dnash/repos/SEAK_AR_impacts/'
sys.path.append(path_to_repo+'modules')
import GEFSv12_funcs as gefs

path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/'

config_file = 'config_1.yaml' # this is the config file name
job_info = 'job_154' # this is the job name

config = yaml.load(open(config_file), Loader=yaml.SafeLoader) # read the file
ddict = config[job_info] # pull the job info from the dict

year = ddict['year']
date = ddict['date']
variable = 'ivt' ## can be 'ivt', 'freezing_level', or 'prec'

for i, st in enumerate(range(72, 80, 8)):
    print(st, st+8)
    start = st
    stop = st+8
    
    if variable == 'ivt':
        print('Loading u, v, and q data ....')
        varname_lst = ['ugrd', 'vgrd', 'spfh']
        ds_lst = []
        for i, varname in enumerate(varname_lst):
            ds = gefs.read_and_regrid_prs_var(varname, date, year, start, stop)
            ds_lst.append(ds)
        


ERROR 1: PROJ: proj_create_from_database: Open of /cw3e/mead/projects/cwp140/scratch/dnash/miniconda3/envs/SEAK-impacts/share/proj failed


72 80
Loading u, v, and q data ....


In [6]:
import pandas as pd
from datetime import timedelta
import numpy as np
mon = 11
day = 17
## for each year between 2000 and 2019
date_lst = []
for i, yr in enumerate(range(2000, 2020)):
    ## get 45 days before date
    center_date = '{0}-{1}-{2}'.format(yr, mon, day)
    center_date = pd.to_datetime(center_date)
    start_date = center_date - timedelta(days=45)
    
    ## get 45 days after November 21
    end_date = center_date + timedelta(days=45)

    ## make a list of dates between start_date and end_date
    dates = pd.date_range(start_date, end_date, freq='1D')
    
    date_lst.append(dates)

final_lst = np.concatenate(date_lst)

array(['2000-10-03T00:00:00.000000000', '2000-10-04T00:00:00.000000000',
       '2000-10-05T00:00:00.000000000', ...,
       '2019-12-29T00:00:00.000000000', '2019-12-30T00:00:00.000000000',
       '2019-12-31T00:00:00.000000000'], dtype='datetime64[ns]')

In [2]:
## load in surface pressure
print('Loading surface pressure data ....')
ds_pres = gefs.read_sfc_var('pres', date, year, start, stop)
ds_lst.append(ds_pres)

ds = xr.merge(ds_lst) # merge u, v, and q into single ds
ds = ds.sel(isobaricInhPa=slice(300, 1000))
ds = ds.reindex(isobaricInhPa=ds.isobaricInhPa[::-1])

ds

Loading surface pressure data ....


Unnamed: 0,Array,Chunk
Bytes,64 B,64 B
Shape,"(8,)","(8,)"
Dask graph,1 chunks in 25 graph layers,1 chunks in 25 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 64 B 64 B Shape (8,) (8,) Dask graph 1 chunks in 25 graph layers Data type datetime64[ns] numpy.ndarray",8  1,

Unnamed: 0,Array,Chunk
Bytes,64 B,64 B
Shape,"(8,)","(8,)"
Dask graph,1 chunks in 25 graph layers,1 chunks in 25 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,492.92 MiB,9.87 MiB
Shape,"(5, 8, 12, 281, 479)","(1, 8, 7, 152, 152)"
Dask graph,120 chunks in 30 graph layers,120 chunks in 30 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 492.92 MiB 9.87 MiB Shape (5, 8, 12, 281, 479) (1, 8, 7, 152, 152) Dask graph 120 chunks in 30 graph layers Data type float64 numpy.ndarray",8  5  479  281  12,

Unnamed: 0,Array,Chunk
Bytes,492.92 MiB,9.87 MiB
Shape,"(5, 8, 12, 281, 479)","(1, 8, 7, 152, 152)"
Dask graph,120 chunks in 30 graph layers,120 chunks in 30 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,492.92 MiB,9.87 MiB
Shape,"(5, 8, 12, 281, 479)","(1, 8, 7, 152, 152)"
Dask graph,120 chunks in 30 graph layers,120 chunks in 30 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 492.92 MiB 9.87 MiB Shape (5, 8, 12, 281, 479) (1, 8, 7, 152, 152) Dask graph 120 chunks in 30 graph layers Data type float64 numpy.ndarray",8  5  479  281  12,

Unnamed: 0,Array,Chunk
Bytes,492.92 MiB,9.87 MiB
Shape,"(5, 8, 12, 281, 479)","(1, 8, 7, 152, 152)"
Dask graph,120 chunks in 30 graph layers,120 chunks in 30 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,492.92 MiB,13.41 MiB
Shape,"(5, 8, 12, 281, 479)","(1, 8, 7, 146, 215)"
Dask graph,60 chunks in 46 graph layers,60 chunks in 46 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 492.92 MiB 13.41 MiB Shape (5, 8, 12, 281, 479) (1, 8, 7, 146, 215) Dask graph 60 chunks in 46 graph layers Data type float64 numpy.ndarray",8  5  479  281  12,

Unnamed: 0,Array,Chunk
Bytes,492.92 MiB,13.41 MiB
Shape,"(5, 8, 12, 281, 479)","(1, 8, 7, 146, 215)"
Dask graph,60 chunks in 46 graph layers,60 chunks in 46 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.54 MiB,4.11 MiB
Shape,"(5, 8, 281, 479)","(1, 8, 281, 479)"
Dask graph,5 chunks in 22 graph layers,5 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20.54 MiB 4.11 MiB Shape (5, 8, 281, 479) (1, 8, 281, 479) Dask graph 5 chunks in 22 graph layers Data type float32 numpy.ndarray",5  1  479  281  8,

Unnamed: 0,Array,Chunk
Bytes,20.54 MiB,4.11 MiB
Shape,"(5, 8, 281, 479)","(1, 8, 281, 479)"
Dask graph,5 chunks in 22 graph layers,5 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [3]:

## mask values below surface pressure
print('Masking values below surface ....')
varlst = ['q', 'u', 'v']
for i, varname in enumerate(varlst):
    ds[varname] = ds[varname].where(ds[varname].isobaricInhPa < ds.sp/100., drop=False)

## integrate to calculate IVT
print('Calculating IVT ....')
ds_IVT = gefs.calc_IVT_manual(ds) # calculate IVT
ds_IVT

Masking values below surface ....
Calculating IVT ....


Unnamed: 0,Array,Chunk
Bytes,64 B,64 B
Shape,"(8,)","(8,)"
Dask graph,1 chunks in 31 graph layers,1 chunks in 31 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 64 B 64 B Shape (8,) (8,) Dask graph 1 chunks in 31 graph layers Data type datetime64[ns] numpy.ndarray",8  1,

Unnamed: 0,Array,Chunk
Bytes,64 B,64 B
Shape,"(8,)","(8,)"
Dask graph,1 chunks in 31 graph layers,1 chunks in 31 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,41.08 MiB,845.50 kiB
Shape,"(5, 8, 281, 479)","(1, 8, 89, 152)"
Dask graph,120 chunks in 260 graph layers,120 chunks in 260 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 41.08 MiB 845.50 kiB Shape (5, 8, 281, 479) (1, 8, 89, 152) Dask graph 120 chunks in 260 graph layers Data type float64 numpy.ndarray",5  1  479  281  8,

Unnamed: 0,Array,Chunk
Bytes,41.08 MiB,845.50 kiB
Shape,"(5, 8, 281, 479)","(1, 8, 89, 152)"
Dask graph,120 chunks in 260 graph layers,120 chunks in 260 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,41.08 MiB,845.50 kiB
Shape,"(5, 8, 281, 479)","(1, 8, 89, 152)"
Dask graph,120 chunks in 260 graph layers,120 chunks in 260 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 41.08 MiB 845.50 kiB Shape (5, 8, 281, 479) (1, 8, 89, 152) Dask graph 120 chunks in 260 graph layers Data type float64 numpy.ndarray",5  1  479  281  8,

Unnamed: 0,Array,Chunk
Bytes,41.08 MiB,845.50 kiB
Shape,"(5, 8, 281, 479)","(1, 8, 89, 152)"
Dask graph,120 chunks in 260 graph layers,120 chunks in 260 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,41.08 MiB,845.50 kiB
Shape,"(5, 8, 281, 479)","(1, 8, 89, 152)"
Dask graph,120 chunks in 400 graph layers,120 chunks in 400 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 41.08 MiB 845.50 kiB Shape (5, 8, 281, 479) (1, 8, 89, 152) Dask graph 120 chunks in 400 graph layers Data type float64 numpy.ndarray",5  1  479  281  8,

Unnamed: 0,Array,Chunk
Bytes,41.08 MiB,845.50 kiB
Shape,"(5, 8, 281, 479)","(1, 8, 89, 152)"
Dask graph,120 chunks in 400 graph layers,120 chunks in 400 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [4]:
# get info for saving file
start = ds_IVT.step.values[0].astype('timedelta64[h]')
stop = ds_IVT.step.values[-1].astype('timedelta64[h]')
start = int(start / np.timedelta64(1, 'h'))
stop = int(stop / np.timedelta64(1, 'h'))

## save IVT data to netCDF file
print('Writing {0} to netCDF ....'.format(date))
out_fname = path_to_data + 'preprocessed/GEFSv12_reforecast/ivt/{0}_ivt_F{1}_F{2}.nc'.format(date, start, stop) 
ds_IVT.load().to_netcdf(path=out_fname, mode = 'w', format='NETCDF4')

Writing 20011118 to netCDF ....


In [1]:
## import libraries
import os, sys
import yaml
import xarray as xr
import numpy as np

path_to_repo = '/cw3e/mead/projects/cwp140/scratch/dnash/repos/SEAK_AR_impacts/'
sys.path.append(path_to_repo+'modules')
import GEFSv12_funcs as gefs

path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/'

config_file = 'config_1.yaml' # this is the config file name
job_info = 'job_154' # this is the job name

config = yaml.load(open(config_file), Loader=yaml.SafeLoader) # read the file
ddict = config[job_info] # pull the job info from the dict

year = ddict['year']
date = ddict['date']
variable = 'ivt' ## can be 'ivt', 'freezing_level', or 'prec'

ERROR 1: PROJ: proj_create_from_database: Open of /cw3e/mead/projects/cwp140/scratch/dnash/miniconda3/envs/SEAK-impacts/share/proj failed


In [2]:
import glob
varname_lst = ['ugrd', 'vgrd', 'spfh']
path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/downloads/GEFSv12_reforecast/{0}/'.format(date) 
    
# read data below 700 mb - 0.25 degree
fname_lst = glob.glob(path_to_data+"{0}_pres_abv700mb_{1}00_*.grib2".format(varname_lst[2], date))
print(fname_lst)

# fname_lst = glob.glob(path_to_data+"{0}_pres_{1}00*.grib2".format(varname_lst[2], date))

['/cw3e/mead/projects/cwp140/scratch/dnash/data/downloads/GEFSv12_reforecast/20011118/spfh_pres_abv700mb_2001111800_c00.grib2', '/cw3e/mead/projects/cwp140/scratch/dnash/data/downloads/GEFSv12_reforecast/20011118/spfh_pres_abv700mb_2001111800_p04.grib2', '/cw3e/mead/projects/cwp140/scratch/dnash/data/downloads/GEFSv12_reforecast/20011118/spfh_pres_abv700mb_2001111800_p03.grib2', '/cw3e/mead/projects/cwp140/scratch/dnash/data/downloads/GEFSv12_reforecast/20011118/spfh_pres_abv700mb_2001111800_p02.grib2', '/cw3e/mead/projects/cwp140/scratch/dnash/data/downloads/GEFSv12_reforecast/20011118/spfh_pres_abv700mb_2001111800_p01.grib2']


In [41]:
def preprocess(ds, start, stop):
    '''keep only the first 24 hours'''
    return ds.isel(step=slice(start, stop))
    
def fix_GEFSv12_open_mfdataset(fname, start, stop):
    list_of_files = glob.glob(fname)
    ds_lst = []
    for i, fi in enumerate(list_of_files):
        ds = xr.open_dataset(fi)
        if ds['time'].size > 1:
            ds = ds.isel(time=0)
        
        ds_lst.append(ds)

    ## get max step size
    step_size_lst = []
    for i, ds in enumerate(ds_lst):
        step_size_lst.append(ds.step.size)
    max_size = max(step_size_lst)
    max_index = step_size_lst.index(max(step_size_lst))
    max_time = ds_lst[max_index].valid_time.values
    max_ds = ds_lst[max_index]
    ## now loop through and fill ds where smaller than max size
    new_ds_lst = []
    for i, tmp in enumerate(ds_lst):
        if tmp.step.size < max_size:
            new_ds = tmp.reindex_like(max_ds, method='nearest', fill_value=np.nan)
            # new_ds = new_ds.drop_dims("valid_time")
            new_ds = new_ds.assign_coords(valid_time=("step", max_time))
            new_ds = preprocess(new_ds, start, stop)
            # new_ds = ds.expand_dims("valid_time").assign_coords(valid_time=max_time)
            # new_ds = ds.update({"valid_time": max_time})
            # ds1, new_ds = xr.align(ds_above[max_index], ds, join="left")
            new_ds_lst.append(new_ds)
    
        elif ds.step.size == max_size:
            ds = preprocess(ds, start, stop)
            new_ds_lst.append(ds)
        
    ds = xr.concat(new_ds_lst, dim="number")
    
    return ds

In [42]:
ds_above = fix_GEFSv12_open_mfdataset_test(fname_lst, 72, 80)
ds_above 