In [1]:
# covnert netcdf hourly precip to gssha gag format
# reference: https://www.gsshawiki.com/Utility_Programs:Format_Precip_Spreadsheet
# reference: Format Precip Macro Download (May 2009 version)
# aggregreate hourly pcp to 3-hourly, and remove discontinous rainfall.  

import numpy as np
import os
import pandas as pd
import xarray as xr
import datetime
from pyproj import Proj

root_dir = '/glade/u/home/hongli/scratch/2019_10_01gssha/ens_forc_wrf2/scripts'
ens_dir = os.path.join(root_dir, 'step11_downscale_daily2hr')
gag_tpl_file = os.path.join(root_dir,'PRECIP_wwrf_2017_2018_1day_lead__no_event_24hour.gag')

agg_acc_perd = 3 # firstly, accumulated precp time interval in final precp output (unit: hour)
separate_perd = 12 # secondly, identify un-reported events that do not have percp for separate_perd continuous hours (unit: hour)
separate_steps = separate_perd/agg_acc_perd # min row number for no precp events

output_dir=os.path.join(root_dir, 'step14_format_prcp_to_gag')
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

test_folders = [d for d in os.listdir(ens_dir)]
test_folders = sorted(test_folders)
ens_num = 100

# ----------------- Coordinate configuration -----------------
# read COORD from gag template file
# COORD: UTM coordinates in the format easting northing.
COORDs = []
with open(gag_tpl_file) as f:
    for i,line in enumerate(f):
        if i>=3 and i<=420:
            COORDs.append(line.rstrip())
        elif i>420:
            break          

# convert UTM coordinates to WGS84 to see these grids locations
xllcorner = [float(x.split(' ')[1]) for x in COORDs]
yllcorner = [float(x.split(' ')[2]) for x in COORDs]

p = Proj(proj='utm',zone=10,ellps='WGS84', preserve_units=False)
lon, lat = p(xllcorner, yllcorner, inverse=True)
coord_gag = [[i,lat[i],lon[i]] for i in range(len(lon))]
np.savetxt(os.path.join(output_dir,'gag_grid_coord_wgs84.txt'),coord_gag,fmt='%d,%f,%f',header='ID,longitude,latitude')
# they are west-to-east and north-to-south. 

# GMET output netcdf coordinates
ens_file = 'ens_forc.001.nc'
f=xr.open_dataset(os.path.join(ens_dir,test_folders[0],ens_file))
lat_gmet = f['lat'].values[:]
lon_gmet = f['lon'].values[:]
(ny,nx) = np.shape(lat_gmet)

coord_gmet=[]
for i in range(ny):
    for j in range(nx):
        coord_gmet.append([lat_gmet[i,j],lon_gmet[i,j]])
# they are west-to-east and south-to-north. The first value of gag corresponds to netcdf pcp[-1,0]. 
# Therefore, need to flipud pcp below to write gag correctly.
# ---------------------------------------------------

# format
# loop through all tests
for test_folder in test_folders[0:1]:    
    print(test_folder)
    
    for m in range(ens_num):
#     for m in range(1):
#         print(str('%03d' % (m+1)))

        mb_folder = 'mb'+ str('%03d' % (m+1))
        if not os.path.exists(os.path.join(output_dir,test_folder)):
            os.makedirs(os.path.join(output_dir,test_folder))

        # --- part 1. read ensemble --- 
#         print('--- part 1. read ensemble --- ')
        ens_file = 'ens_forc.'+ str('%03d' % (m+1)) +'.nc'
        f=xr.open_dataset(os.path.join(ens_dir,test_folder,ens_file))
        time = pd.DatetimeIndex(f['time'][:].dt.floor('H').to_pandas())
        pcp = f.variables['Prcp'].values[:] # mm/hr        
        
        # --- part 2. pre-process netcdf pcp ---
#         print('--- part 2. pre-process netcdf pcp ---')        
        # remove the first few time steps until it starts at hours 1,4,7,10,13,16,19,22
        # This is to avoid incomplete 3hrs data aggregation.
        i = 0
        start_hr_ls = [1,4,7,10,13,16,19,22]
        while not (time[i].hour+1 in start_hr_ls): # here hour starts at 0
            i = i+1
        if i != 0:
            pcp = pcp[i:,:,:]
            time = time[i:]

        # reshape pcp ( invert lat and lon dims, and 3D -> 2D)
        pcp = np.flip(pcp, 1) # flip along axis=1 (latitude axis small-to-large -> large-to-small)
        (nt,ny,nx) = np.shape(pcp) # (time,lat,lon)
        pcp_new = np.reshape(pcp,(nt,ny*nx))

        # define grid names for dataframe column
        grid_names=[]
        for j in range(ny):
            for k in range(nx):
                grid_names.append('row'+str(j)+'col'+str(k))

        # create dataframe (time, grids)
        df = pd.DataFrame(pcp_new,columns=grid_names)
        df['datetime'] = time
        df = df.set_index('datetime')
        
        # aggreagte precip into specified time intervals
        df_3hr = df.resample('3H').sum() 
        time_3hr = pd.to_datetime(df_3hr.index)
#         df_3hr.index = df_3hr.index + pd.DateOffset(hours=1) #(start at 0 -> start at 1)
        
        # --- part 3. separate rainfall ---
#         print('--- part 3. separate rainfall --- ')
        (num_time,num_grid)=np.shape(df_3hr)
        delete_rows = [] # start row #, end row #
        
        i = 0
        while (i<num_time):

            if all(df_3hr.iloc[i, :]==0):
                
                j      = 0 # cumulative number of no-rainfall time steps
                period = 0 # cumulative period length (hr)
                sign   = 0 # if 1, delete these rows.
                
                # find all its following no-precip time steps
                while (all(df_3hr.iloc[i, :]==0)):
                    j = j+1
                    period = period+j*agg_acc_perd                    
                    i = i+1
                    
                    # record if no-rainfall length is beyond the separate_perd
                    if period>=separate_perd:
                        sign = 1
                        
                    # remove all the last time steps with no-rainfall (no mater length of period)
                    # and stop loop
                    if i == num_time-1:
                        delete_rows.append([i-j+1,i])
                        break                                         
                    
                # decide whether delete these time steps
                if sign == 1:
                    delete_rows.append([i-j,i]) # exlude the last, only work as index bound.
            i = i+1

        # --- part 4. write event ---
#         print('--- part 4. write event --- ')
        gag_ofile = 'ens_forc.'+ str('%03d' % (m+1)) +'.gag'
        if os.path.exists(os.path.join(output_dir,test_folder,gag_ofile)):
            os.remove(os.path.join(output_dir,test_folder,gag_ofile))                   

        f_out=open(os.path.join(output_dir,test_folder,gag_ofile),'w')
        
        num_norain_perds = len(delete_rows)
#         print(('event num = %d')%(num_norain_perds+1))
        for i in range(num_norain_perds):
            
            EVENT = "Event "+str(i+1)
            NRGAG = 418
#             print(EVENT)
            
            # identify the start and end row numbers of a rainfall event
            if i == 0:
                start_i = 0
                end_i = delete_rows[i][0]
            elif i<num_norain_perds-1:
                start_i = delete_rows[i][1]
                end_i = delete_rows[i+1][0]
            elif i==num_norain_perds-1:
                if (delete_rows[i][1]-delete_rows[i][0])>=separate_steps:
                    start_i = delete_rows[i][1]
                    end_i = len(df_3hr)
            NRPDS = end_i-start_i
             
            # write heads
            f_out.write(("EVENT %s\n") %(EVENT))
            f_out.write(("NRPDS %d\n") %(NRPDS))
            f_out.write(("NRGAG %d\n") %(NRGAG))
            
            # write coordinates
            for j in range(len(COORDs)):
                f_out.write(("%s\n") %(COORDs[j]))
            
            # write precip
            for j in range(start_i,end_i):
                t=df_3hr.index[j]
                f_out.write(("GAGES %04d %02d %02d %02d 00 ") %(t.year,t.month,t.day,t.hour+1)) # start at 1:00, not 0:00
                
                data = df_3hr.iloc[j,:]
                data_scientific = [np.format_float_scientific(x,unique=True,trim='0',precision=7) for x in data]                 
                data_scientific =  [sub.replace('0.0e+00', '0.0') for sub in data_scientific]
                for k in range(NRGAG):
                    f_out.write(("%s ") %(data_scientific[k]))
                f_out.write("\n")
                
            f_out.write("\n") # end loop steps of one event                
        f_out.close() # end loop events

print('Done')

046grids
Done


In [2]:
import numpy as np
import os
import pandas as pd
import xarray as xr
import datetime
from pyproj import Proj

root_dir = '/glade/u/home/hongli/scratch/2019_10_01gssha/ens_forc_wrf2/scripts'
gag_tpl_file = os.path.join(root_dir,'PRECIP_wwrf_2017_2018_1day_lead__no_event_24hour.gag')
nc_file = os.path.join(ens_dir,test_folders[0],ens_file)
gag_ofile = 

agg_acc_perd = 3 # firstly, accumulated precp time interval in final precp output (unit: hour)
separate_perd = 12 # secondly, identify un-reported events that do not have percp for separate_perd continuous hours (unit: hour)
separate_steps = separate_perd/agg_acc_perd # min row number for no precp events

# ----------------- Coordinate configuration -----------------
# read COORD from gag template file
# COORD: UTM coordinates in the format easting northing.
COORDs = []
with open(gag_tpl_file) as f:
    for i,line in enumerate(f):
        if i>=3 and i<=420:
            COORDs.append(line.rstrip())
        elif i>420:
            break          

# convert UTM coordinates to WGS84 to see these grids locations
xllcorner = [float(x.split(' ')[1]) for x in COORDs]
yllcorner = [float(x.split(' ')[2]) for x in COORDs]

p = Proj(proj='utm',zone=10,ellps='WGS84', preserve_units=False)
lon, lat = p(xllcorner, yllcorner, inverse=True)
coord_gag = [[i,lat[i],lon[i]] for i in range(len(lon))]
np.savetxt(os.path.join(output_dir,'gag_grid_coord_wgs84.txt'),coord_gag,fmt='%d,%f,%f',header='ID,longitude,latitude')
# they are west-to-east and north-to-south. 

# GMET output netcdf coordinates
f=xr.open_dataset(nc_file)
lat_gmet = f['lat'].values[:]
lon_gmet = f['lon'].values[:]
(ny,nx) = np.shape(lat_gmet)

coord_gmet=[]
for i in range(ny):
    for j in range(nx):
        coord_gmet.append([lat_gmet[i,j],lon_gmet[i,j]])
# they are west-to-east and south-to-north. The first value of gag corresponds to netcdf pcp[-1,0]. 
# Therefore, need to flipud pcp below to write gag correctly.
# ---------------------------------------------------

# format
# --- part 1. read ensemble --- 
ens_file = 'ens_forc.'+ str('%03d' % (m+1)) +'.nc'
f=xr.open_dataset(nc_file)
time = pd.DatetimeIndex(f['time'][:].dt.floor('H').to_pandas())
pcp = f.variables['Prcp'].values[:] # mm/hr        

# --- part 2. pre-process netcdf pcp ---
# remove the first few time steps until it starts at hours 1,4,7,10,13,16,19,22
# This is to avoid incomplete 3hrs data aggregation.
i = 0
start_hr_ls = [1,4,7,10,13,16,19,22]
while not (time[i].hour+1 in start_hr_ls): # here hour starts at 0
    i = i+1
if i != 0:
    pcp = pcp[i:,:,:]
    time = time[i:]

# reshape pcp ( invert lat and lon dims, and 3D -> 2D)
pcp = np.flip(pcp, 1) # flip along axis=1 (latitude axis small-to-large -> large-to-small)
(nt,ny,nx) = np.shape(pcp) # (time,lat,lon)
pcp_new = np.reshape(pcp,(nt,ny*nx))

# define grid names for dataframe column
grid_names=[]
for j in range(ny):
    for k in range(nx):
        grid_names.append('row'+str(j)+'col'+str(k))

# create dataframe (time, grids)
df = pd.DataFrame(pcp_new,columns=grid_names)
df['datetime'] = time
df = df.set_index('datetime')

# aggreagte precip into specified time intervals
df_3hr = df.resample('3H').sum() 
time_3hr = pd.to_datetime(df_3hr.index)

# --- part 3. separate rainfall ---
(num_time,num_grid)=np.shape(df_3hr)
delete_rows = [] # start row #, end row #

i = 0
while (i<num_time):

    if all(df_3hr.iloc[i, :]==0):

        j      = 0 # cumulative number of no-rainfall time steps
        period = 0 # cumulative period length (hr)
        sign   = 0 # if 1, delete these rows.

        # find all its following no-precip time steps
        while (all(df_3hr.iloc[i, :]==0)):
            j = j+1
            period = period+j*agg_acc_perd                    
            i = i+1

            # record if no-rainfall length is beyond the separate_perd
            if period>=separate_perd:
                sign = 1

            # remove all the last time steps with no-rainfall (no mater length of period)
            # and stop loop
            if i == num_time-1:
                delete_rows.append([i-j+1,i])
                break                                         

        # decide whether delete these time steps
        if sign == 1:
            delete_rows.append([i-j,i]) # exlude the last, only work as index bound.
    i = i+1

# --- part 4. write event ---
f_out=open(gag_ofile,'w')
num_norain_perds = len(delete_rows)
for i in range(num_norain_perds):

    EVENT = "Event "+str(i+1)
    NRGAG = 418

    # identify the start and end row numbers of a rainfall event
    if i == 0:
        start_i = 0
        end_i = delete_rows[i][0]
    elif i<num_norain_perds-1:
        start_i = delete_rows[i][1]
        end_i = delete_rows[i+1][0]
    elif i==num_norain_perds-1:
        if (delete_rows[i][1]-delete_rows[i][0])>=separate_steps:
            start_i = delete_rows[i][1]
            end_i = len(df_3hr)
    NRPDS = end_i-start_i

    # write heads
    f_out.write(("EVENT %s\n") %(EVENT))
    f_out.write(("NRPDS %d\n") %(NRPDS))
    f_out.write(("NRGAG %d\n") %(NRGAG))

    # write coordinates
    for j in range(len(COORDs)):
        f_out.write(("%s\n") %(COORDs[j]))

    # write precip
    for j in range(start_i,end_i):
        t=df_3hr.index[j]
        f_out.write(("GAGES %04d %02d %02d %02d 00 ") %(t.year,t.month,t.day,t.hour+1)) # start at 1:00, not 0:00

        data = df_3hr.iloc[j,:]
        data_scientific = [np.format_float_scientific(x,unique=True,trim='0',precision=7) for x in data]                 
        data_scientific =  [sub.replace('0.0e+00', '0.0') for sub in data_scientific]
        for k in range(NRGAG):
            f_out.write(("%s ") %(data_scientific[k]))
        f_out.write("\n")

    f_out.write("\n") # end loop steps of one event                
f_out.close() # end loop events

(3047, 22, 19)

In [8]:
(nt,ny,nx) = np.shape(pcp)
nt,ny,nx
pcp_new = pcp.reshape((3047,22*19))

AttributeError: 'Variable' object has no attribute 'reshape'

In [9]:
pcp

AttributeError: 'Variable' object has no attribute 'variable'

<xarray.Variable (time: 3047, y: 22, x: 19)>
array([[[0.000000e+00, 6.529223e-07, ..., 0.000000e+00, 0.000000e+00],
        [0.000000e+00, 3.712513e-05, ..., 0.000000e+00, 0.000000e+00],
        ...,
        [0.000000e+00, 0.000000e+00, ..., 0.000000e+00, 0.000000e+00],
        [0.000000e+00, 0.000000e+00, ..., 0.000000e+00, 0.000000e+00]],

       [[0.000000e+00, 9.793833e-07, ..., 0.000000e+00, 0.000000e+00],
        [0.000000e+00, 5.568769e-05, ..., 0.000000e+00, 0.000000e+00],
        ...,
        [0.000000e+00, 0.000000e+00, ..., 0.000000e+00, 0.000000e+00],
        [0.000000e+00, 0.000000e+00, ..., 0.000000e+00, 0.000000e+00]],

       ...,

       [[0.000000e+00, 6.486446e-01, ..., 4.934439e-02, 0.000000e+00],
        [0.000000e+00, 8.577239e-01, ..., 3.664310e-02, 4.787157e-02],
        ...,
        [0.000000e+00, 0.000000e+00, ..., 4.986373e-03, 0.000000e+00],
        [0.000000e+00, 0.000000e+00, ..., 2.089852e-02, 0.000000e+00]],

       [[0.000000e+00, 2.144245e-03, ..., 4.9

In [31]:
time_3hr

DatetimeIndex(['2017-12-01 18:00:00', '2017-12-01 21:00:00',
               '2017-12-02 00:00:00', '2017-12-02 03:00:00',
               '2017-12-02 06:00:00', '2017-12-02 09:00:00',
               '2017-12-02 12:00:00', '2017-12-02 15:00:00',
               '2017-12-02 18:00:00', '2017-12-02 21:00:00',
               ...
               '2018-04-07 00:00:00', '2018-04-07 03:00:00',
               '2018-04-07 06:00:00', '2018-04-07 09:00:00',
               '2018-04-07 12:00:00', '2018-04-07 15:00:00',
               '2018-04-07 18:00:00', '2018-04-07 21:00:00',
               '2018-04-08 00:00:00', '2018-04-08 03:00:00'],
              dtype='datetime64[ns]', name='datetime', length=1020, freq='3H')