# Calculate velocity potential

### This notebook is adapted from:

https://github.com/dougrichardson/Richardson_2022_coffee/blob/main/0e_calculate_velocity_potential.ipynb

In [1]:
from dask_jobqueue import PBSCluster
from dask.distributed import Client

In [2]:
# One Gadi node has 48 cores - fill a whole node before scaling out to more nodes (jobs)

walltime = '00:10:00'
cores = 48
memory = str(cores * 4) + 'GB'  # 4 GB per core

# One worker process per core. The PBS resources are requested explicitly via
# job_extra_directives, and the 'select' directive is listed in job_directives_skip.
cluster = PBSCluster(
    cores=cores,
    processes=cores,
    memory=memory,
    walltime=walltime,
    local_directory='$TMPDIR',
    job_extra_directives=[
        '-q normal',
        '-P w42',
        '-l ncpus=' + str(cores),
        '-l mem=' + memory,
        '-l storage=gdata/w42+gdata/rt52',
    ],
    job_directives_skip=["select"],
)

In [3]:
# Request a single PBS job from the scheduler and attach a client to the cluster
cluster.scale(jobs=1)
client = Client(cluster)

In [4]:
# Display the client summary (includes the dashboard link)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: http://10.6.21.35:8787/status,

0,1
Dashboard: http://10.6.21.35:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.6.21.35:45881,Workers: 0
Dashboard: http://10.6.21.35:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [5]:
import xarray as xr
import os
import numpy as np

In [6]:
# Years to process; range() excludes the end value, so this covers 2021 only
years = range(2021, 2022)
# Isobaric levels to extract (presumably in hPa - confirm against the ERA5 files)
levels = [150, 850]

# Daily u and v data - use standard dask and xarray tools

- Processing hourly data to daily means for two isobaric levels takes around 4 hours (using 3 full nodes; ~576 GB)

In [2]:
def get_files(file_path, var, years):
    """
    Get a list of file paths for one variable over a set of years.

    Files are expected under ``<file_path>/<var>/<year>/``. Within each year
    the entries are sorted by name; years are visited in the order given.

    Parameters
    ----------
    file_path : str
        Root directory (with or without a trailing separator).
    var : str
        Variable sub-directory name, e.g. 'u'.
    years : iterable
        Years to include; each is converted with str() to form the directory name.

    Returns
    -------
    list of str
        Full paths of every entry found in the year directories.

    Raises
    ------
    FileNotFoundError
        If a year directory does not exist (raised by os.listdir).
    """
    fp_list = []
    for year in years:
        # os.path.join copes with file_path lacking a trailing separator
        fp_dir = os.path.join(file_path, var, str(year))
        fp_list.extend(os.path.join(fp_dir, fp) for fp in sorted(os.listdir(fp_dir)))
    return fp_list

In [3]:
# True: open the previously written daily zarr stores; False: rebuild them from the hourly files
load = True

In [7]:
%%time
# Daily-mean zarr stores, named by the first and last year being processed
u_store = '/g/data/w42/dr6273/work/data/era5/u/u_era5_daily_'+str(years[0])+'-'+str(years[-1])+'.zarr'
v_store = '/g/data/w42/dr6273/work/data/era5/v/v_era5_daily_'+str(years[0])+'-'+str(years[-1])+'.zarr'

if not load:
    # Build the daily stores from the hourly pressure-level files
    era5_dir = '/g/data/rt52/era5/pressure-levels/reanalysis/'
    u_files = get_files(era5_dir, 'u', years)
    v_files = get_files(era5_dir, 'v', years)

    # Using preprocess in open_mfdataset to select desired levels improves performance
    #  versus doing a .sel() afterwards
    def preprocess(ds):
        """Keep only the requested isobaric levels of one input file."""
        return ds.sel(level=levels)

    # Same options for both wind components
    open_kwargs = dict(chunks={'time': 24, 'level': 1},
                       preprocess=preprocess,
                       compat='override',
                       coords='minimal',
                       engine='netcdf4')

    u_daily = xr.open_mfdataset(u_files, **open_kwargs).resample(time='1D').mean()
    v_daily = xr.open_mfdataset(v_files, **open_kwargs).resample(time='1D').mean()

    u_encoding = {'u': {'dtype': 'float32'}}
    v_encoding = {'v': {'dtype': 'float32'}}

    u_daily.to_zarr(u_store, mode='w', consolidated=True, encoding=u_encoding)
    v_daily.to_zarr(v_store, mode='w', consolidated=True, encoding=v_encoding)

    # Close cluster
    client.close()
    cluster.close()

# Always read u and v back from the zarr stores. After a rebuild this replaces
# the lazy hourly->daily graph, which would otherwise be recomputed from the raw
# files (with the cluster already closed) every time values are loaded.
u = xr.open_zarr(u_store, consolidated=True)
v = xr.open_zarr(v_store, consolidated=True)

CPU times: user 351 ms, sys: 109 ms, total: 460 ms
Wall time: 729 ms


# Calculate velocity potential using `windspharm`

- Non-lazy, so we do this separately for each year and isobaric level
- Used 10 cores at 40GB

In [6]:
from windspharm.standard import VectorWind
from windspharm.tools import prep_data, recover_data, order_latdim

### For each level and year

- Takes around 3 minutes per level and year
- 2 levels and 42 years take around 4.5 hours

In [11]:
def write_vpot(u, v, level, year, out_dir='/g/data/w42/dr6273/work/data/era5/vpot/nc/'):
    """
    Compute velocity potential for one level and year and write it to netCDF.

    The wind components are loaded into memory, passed through windspharm's
    VectorWind, and the resulting velocity potential is written to
    ``<out_dir>/vpot_<level>_era5_daily_<year>.nc`` as float32.

    Parameters
    ----------
    u, v : xarray.Dataset
        Datasets holding the wind components in variables 'u' and 'v', with
        dimensions time, level, latitude and longitude.
    level : scalar
        Value of the 'level' coordinate to select.
    year : int or str
        Year to select along time (converted with str()).
    out_dir : str, optional
        Directory the netCDF file is written to.

    Returns
    -------
    None
    """
    year = str(year)

    # Subsample u and v, with time out front
    u_ = u.u.sel(time=year, level=level).transpose('time', 'latitude', 'longitude')
    v_ = v.v.sel(time=year, level=level).transpose('time', 'latitude', 'longitude')

    # Load values
    uwnd = u_.values
    vwnd = v_.values
    lats = u_['latitude'].values

    # Ensure data is in correct shape for windspharm
    print('Prepping data for windspharm...')
    uwnd, uwnd_info = prep_data(uwnd, 'tyx') # 'tyx' because data is in format time, lat, lon
    vwnd, vwnd_info = prep_data(vwnd, 'tyx')
    # order_latdim may reverse latitude (north-to-south); keep the returned lats
    # so the output coordinate matches the order of the computed field
    lats, uwnd, vwnd = order_latdim(lats, uwnd, vwnd)

    # Create a VectorWind instance to handle computation of streamfunction and velocity potential
    print('Creating VectorWind instance...')
    w = VectorWind(uwnd, vwnd)

    # Calculate velocity potential (the streamfunction is discarded)
    print('Calculating VPOT...')
    _, vp = w.sfvp()

    # Re-shape to original (time, lat, lon) format
    print('Reshaping...')
    vp = recover_data(vp, uwnd_info)

    # Put into DataArray and format for writing
    print('Putting into dataArray and writing...')
    vp = xr.DataArray(vp,
                      dims=['time', 'latitude', 'longitude'],
                      coords={'time': u_['time'].values,
                              'latitude': lats,
                              'longitude': u_['longitude'].values})
    # CF-style attributes: velocity potential has units of m2 s-1
    vp = vp.assign_attrs({'short_name': 'vpot',
                          'long_name': 'velocity potential',
                          'units': 'm2 s-1'})

    # Specify lat/lon units and transpose to time first so we can use cdo later
    vp['latitude'].attrs = {'units': 'degrees_north'}
    vp['longitude'].attrs = {'units': 'degrees_east'}
    vp = vp.expand_dims({'level': [level]})
    vp = vp.transpose('time', 'level', 'latitude', 'longitude')

    vp = vp.to_dataset(name='vpot')

    vp_encoding = {'vpot': {'dtype': 'float32'}}

    vp.to_netcdf(os.path.join(out_dir, 'vpot_'+str(level)+'_era5_daily_'+year+'.nc'),
                 mode='w',
                 encoding=vp_encoding)
    print()

In [12]:
%%time
# One netCDF file is written per (level, year) combination
for level in levels:
    print(level)
    for year in years:
        print(year)
        write_vpot(u=u, v=v, level=level, year=year)

150
2021
Prepping data for windspharm...
Creating VectorWind instance...
Calculating VPOT...
Reshaping...
Putting into dataArray and writing...

850
2021
Prepping data for windspharm...
Creating VectorWind instance...
Calculating VPOT...
Reshaping...
Putting into dataArray and writing...

CPU times: user 6min 21s, sys: 26.8 s, total: 6min 48s
Wall time: 6min 39s


# Regrid VPOT to 2x2

- We ended up using CDO from the command line. Trying to implement it from within the notebook didn't work.
- Previously tried using xesmf but that blew up and killed workers.

In [None]:
# # Create a text file called 2x2_grid.txt:

# gridtype = lonlat
# xsize    = 180
# ysize    = 91
# xfirst   = -180
# xinc     = 2
# yfirst   = -90
# yinc     = 2

In [83]:
# Then run the following commands, replacing infile with desired vpot array on ERA5 native grid
# !module load cdo
# !cdo remapbil,2x2_grid.txt infile.nc outfile.nc